Summarise Over All Columns

Summarizing multiple columns with dplyr?

In dplyr (>= 1.0.0) you may use across(everything(), ...) inside summarise() to apply a function to all variables:

library(dplyr)

df %>% group_by(grp) %>% summarise(across(everything(), list(mean)))
#> # A tibble: 3 x 5
#> grp a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 3.08 2.98 2.98 2.91
#> 2 2 3.03 3.04 2.97 2.87
#> 3 3 2.85 2.95 2.95 3.06

Alternatively, the purrrlyr package provides the same functionality:

library(purrrlyr)
df %>% slice_rows("grp") %>% dmap(mean)
#> # A tibble: 3 x 5
#> grp a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 3.08 2.98 2.98 2.91
#> 2 2 3.03 3.04 2.97 2.87
#> 3 3 2.85 2.95 2.95 3.06

Also don't forget about data.table (use keyby to sort the groups):

library(data.table)
setDT(df)[, lapply(.SD, mean), keyby = grp]
#> grp a b c d
#> 1: 1 3.079412 2.979412 2.979412 2.914706
#> 2: 2 3.029126 3.038835 2.967638 2.873786
#> 3: 3 2.854701 2.948718 2.951567 3.062678

Let's try to compare performance.

library(dplyr)
library(purrrlyr)
library(data.table)
library(bench)
set.seed(123)
n <- 10000
df <- data.frame(
a = sample(1:5, n, replace = TRUE),
b = sample(1:5, n, replace = TRUE),
c = sample(1:5, n, replace = TRUE),
d = sample(1:5, n, replace = TRUE),
grp = sample(1:3, n, replace = TRUE)
)
dt <- setDT(df)
mark(
dplyr = df %>% group_by(grp) %>% summarise(across(everything(), list(mean))),
purrrlyr = df %>% slice_rows("grp") %>% dmap(mean),
data.table = dt[, lapply(.SD, mean), keyby = grp],
check = FALSE
)
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 2.81ms 2.85ms 328. NA 17.3
#> 2 purrrlyr 7.96ms 8.04ms 123. NA 24.5
#> 3 data.table 596.33µs 707.91µs 1409. NA 10.3

How to summarise all columns using group_by and summarise?

It's hard to answer your question without a better example (e.g., you can use dput() on your data to give us a sample). But here is a solution to your last issue: "For the first problem, I expect to get a table with the sum of repeated rows for all columns. Moreover, if it was possible, I would expect to get a better code for the sum of different activities on Saturday."

# create toy data of 3 different IDs, 3 different types, and repeated days
df <- data.frame(id=sample(c(1:3),100,T),
type=sample(letters[1:3],100,T),
day=sample(c(1:7),100,T),
matrix(runif(300),nrow=100),
stringsAsFactors = F)

# gather data, summarize each activity column by ID, type and day
# and select Saturday==6
df %>% gather(k,v,-id,-type,-day) %>%
group_by(id,type,day,k) %>%
summarise(sum=sum(v)) %>%
filter(day==6) %>%
spread(k,sum)

# A tibble: 8 x 6
# Groups: id, type, day [8]
id type day X1 X2 X3
<int> <chr> <int> <dbl> <dbl> <dbl>
1 1 a 6 1.85 3.26 2.09
2 1 b 6 0.604 0.583 0.586
3 1 c 6 0.163 0.663 0.624
4 2 a 6 0.185 0.952 0.349
5 2 b 6 1.16 0.832 0.974
6 2 c 6 0.906 1.62 0.853
7 3 b 6 0.671 1.39 0.887
8 3 c 6 0.449 0.150 0.647

UPDATE
Here is an updated solution with the new data provided.

df %>% group_by(LbNr,Type,Weekday) %>% summarise_all(.,sum)

# A tibble: 20 x 14
# Groups: LbNr, Type [5]
LbNr Type Weekday Time lie sit stand move walk run stairs cycle
<dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22002 A1. ~ 1 6.33 0.386 4.52e+0 0.726 0.499 0.189 0.00111 0.0075 0.00556
2 22002 A1. ~ 2 7.9 0.766 4.74e+0 1.28 0.611 0.489 0.00194 0.0111 0
3 22002 A1. ~ 3 7.33 0.262 3.63e+0 2.04 0.941 0.449 0.00083 0.0114 0
4 22002 A1. ~ 4 11.7 0.761 5.91e+0 2.54 1.19 1.25 0.00416 0.0394 0.00778
5 22002 A1. ~ 5 6.57 0.140 4.51e+0 1.12 0.51 0.254 0.00139 0.0183 0.01
6 22002 A1. ~ 6 0.433 0.0169 3.02e-1 0.0589 0.0378 0.0175 0 0 0
7 22002 A2. ~ 1 7.5 0.0792 5.90e+0 0.546 0.326 0.611 0.00111 0.0392 0
8 22002 A2. ~ 2 9.83 0.0597 6.64e+0 1.64 0.595 0.842 0.00167 0.0575 0
9 22002 A2. ~ 3 9.83 0.653 5.79e+0 1.82 0.525 1.01 0.00083 0.0333 0
10 22002 A2. ~ 4 5 0.383 2.80e+0 0.886 0.392 0.514 0.0025 0.0247 0
11 22002 A2. ~ 5 11.0 0.0103 6.77e+0 1.83 1.05 1.29 0.00472 0.0672 0
12 22002 A4. ~ 2 6.27 4.86 1.41e+0 0 0 0 0 0 0
13 22002 A4. ~ 3 6.83 5.69 1.15e+0 0 0 0 0 0 0
14 22002 A4. ~ 4 7.3 7.28 4.72e-3 0.00667 0.00667 0 0 0.00194 0
15 22002 A4. ~ 5 6.42 5.49 9.30e-1 0 0 0 0 0 0
16 22002 C0. ~ 6 15.7 0.245 9.78e+0 2.34 2.45 0.800 0.00194 0.0581 0
17 22002 C0. ~ 7 15.6 0.122 1.20e+1 1.80 0.940 0.656 0.0869 0.0164 0
18 22002 C4. ~ 1 6.33 5.75 5.84e-1 0 0 0 0 0 0
19 22002 C4. ~ 6 7.9 6.96 9.22e-1 0.00667 0.00806 0.00306 0 0 0
20 22002 C4. ~ 7 8.35 7.36 9.33e-1 0.0364 0.0208 0.00472 0 0 0
# ... with 2 more variables: WalkSlow <dbl>, WalkFast <dbl>

I think this answers your first question about wanting a 'small code'. I still don't understand your second question: "I would expect to get a better code for the sum of different activities on Saturday." Does this mean that you want to sum across the different activities (lie, sit, etc.) for Saturday only? Or do you want to sum across the different types (A2, C0, etc.) of activities?

df %>% group_by(LbNr,Type,Weekday) %>% summarise_all(.,sum) %>% 
filter(Weekday==6)

# A tibble: 3 x 14
# Groups: LbNr, Type [3]
LbNr Type Weekday Time lie sit stand move walk run stairs cycle WalkSlow
<dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22002 A1. ~ 6 0.433 0.0169 0.302 0.0589 0.0378 0.0175 0 0 0 0.00417
2 22002 C0. ~ 6 15.7 0.245 9.78 2.34 2.45 0.800 0.00194 0.0581 0 0.14
3 22002 C4. ~ 6 7.9 6.96 0.922 0.00667 0.00806 0.00306 0 0 0 0
# ... with 1 more variable: WalkFast <dbl>

# summarise across different activities, for each column, on Saturday only
df %>% group_by(LbNr,Type,Weekday) %>% summarise_all(.,sum) %>%
filter(Weekday==6) %>% group_by(LbNr) %>% select(-Type,-Weekday) %>%
summarise_all(.,sum)

# A tibble: 1 x 12
LbNr Time lie sit stand move walk run stairs cycle WalkSlow WalkFast
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 22002 24 7.22 11.0 2.41 2.49 0.820 0.00194 0.0581 0 0.144 0.670

How do I summarise all columns except one(s) I specify?

Edit:

Modified versions of the two methods below for dplyr version >= 1.0.0, since summarise_at is superseded:

df %>% 
summarise(across(where(is.numeric) & !Registered, sum))

df %>%
summarise(across(-Registered, sum))

Original Answer:

I would use summarise_at, and just make a logical vector which is FALSE for non-numeric columns and Registered and TRUE otherwise, i.e.

df %>% 
summarise_at(which(sapply(df, is.numeric) & names(df) != 'Registered'), sum)

If you wanted to just summarise all but one column you could do

df %>% 
summarise_at(vars(-Registered), sum)

but in this case you have to check if it's numeric also.

Notes:

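  • factors are stored internally as integer codes, but is.numeric() returns FALSE for them, so sapply(df, is.numeric) already excludes factor columns; if you want to make that exclusion explicit, replace sapply(df, is.numeric) with sapply(df, function(x) is.numeric(x) & !is.factor(x))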

  • If your data is big I think it is faster to use sapply(df[1,], is.numeric) instead of sapply(df, is.numeric). (Someone please correct me if I'm wrong)

Group by two column and summarize multiple columns

We can use summarise with across from dplyr version >= 1.0.0:

library(dplyr)
df %>%
group_by(State, Date) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
# A tibble: 6 x 4
# State Date Female Male
# <chr> <chr> <int> <int>
#1 Cali 05/06/2005 3 2
#2 Cali 10/06/2005 4 3
#3 NY 11/06/2005 10 5
#4 NY 12/06/2005 11 6
#5 Texas 01/01/2004 5 3
#6 Texas 02/01/2004 5 4

Or using aggregate from base R

aggregate(.~ State + Date, df, sum, na.rm = TRUE)

data

df <-  structure(list(State = c("Texas", "Texas", "Texas", "Cali", "Cali", 
"Cali", "Cali", "NY", "NY"), Female = c(2L, 3L, 5L, 1L, 2L, 3L,
1L, 10L, 11L), Male = c(2L, 1L, 4L, 1L, 1L, 1L, 2L, 5L, 6L),
Date = c("01/01/2004", "01/01/2004", "02/01/2004", "05/06/2005",
"05/06/2005", "10/06/2005", "10/06/2005", "11/06/2005", "12/06/2005"
)), class = "data.frame", row.names = c(NA, -9L))

Summarise multiple columns using dplyr R

Try this:

df %>%
group_by(County) %>%
summarise(across(c(Submissions, Population), sum))

tidyverse summarize multiple columns but show result as rows

You can skip the pivot_wider step by using ".value" in names_to.

library(dplyr)

dat %>%
summarise_all(list(mean = mean,sum = sum)) %>%
tidyr::pivot_longer(cols = everything(),
names_sep = "_",
names_to = c("variable", ".value"))

# A tibble: 5 x 3
# variable mean sum
# <chr> <dbl> <int>
#1 V1 10.5 210
#2 V2 30.5 610
#3 V3 50.5 1010
#4 V4 70.5 1410
#5 V5 90.5 1810

Summarise multiple columns that have to be grouped tidyverse

Get the data in long format and count:

library(dplyr)
library(tidyr)

df %>% pivot_longer(cols = one:three) %>% count(group1, group2, value)

# group1 group2 value n
# <chr> <chr> <chr> <int>
#1 High female no 1
#2 High female yes 2
#3 High male no 3
#4 High male yes 3
#5 Low female no 2
#6 Low female yes 4
#7 Low male no 1
#8 Low male yes 2


Related Topics



Leave a reply



Submit