Summarizing Multiple Columns With Dplyr

Summarizing multiple columns with dplyr?

In dplyr (>=1.00) you may use across(everything() in summarise to apply a function to all variables:

library(dplyr)

df %>% group_by(grp) %>% summarise(across(everything(), list(mean)))
#> # A tibble: 3 x 5
#> grp a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 3.08 2.98 2.98 2.91
#> 2 2 3.03 3.04 2.97 2.87
#> 3 3 2.85 2.95 2.95 3.06

Alternatively, the purrrlyr package provides the same functionality:

library(purrrlyr)
df %>% slice_rows("grp") %>% dmap(mean)
#> # A tibble: 3 x 5
#> grp a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 3.08 2.98 2.98 2.91
#> 2 2 3.03 3.04 2.97 2.87
#> 3 3 2.85 2.95 2.95 3.06

Also don't forget about data.table (use keyby to sort sort groups):

library(data.table)
setDT(df)[, lapply(.SD, mean), keyby = grp]
#> grp a b c d
#> 1: 1 3.079412 2.979412 2.979412 2.914706
#> 2: 2 3.029126 3.038835 2.967638 2.873786
#> 3: 3 2.854701 2.948718 2.951567 3.062678

Let's try to compare performance.

library(dplyr)
library(purrrlyr)
library(data.table)
library(bench)
set.seed(123)
n <- 10000
df <- data.frame(
a = sample(1:5, n, replace = TRUE),
b = sample(1:5, n, replace = TRUE),
c = sample(1:5, n, replace = TRUE),
d = sample(1:5, n, replace = TRUE),
grp = sample(1:3, n, replace = TRUE)
)
dt <- setDT(df)
mark(
dplyr = df %>% group_by(grp) %>% summarise(across(everything(), list(mean))),
purrrlyr = df %>% slice_rows("grp") %>% dmap(mean),
data.table = dt[, lapply(.SD, mean), keyby = grp],
check = FALSE
)
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 2.81ms 2.85ms 328. NA 17.3
#> 2 purrrlyr 7.96ms 8.04ms 123. NA 24.5
#> 3 data.table 596.33µs 707.91µs 1409. NA 10.3

Summarise multiple columns using dplyr R

Try this:

df %>%
group_by(County) %>%
summarise(across(c(Submissions, Population), sum))

Group by two column and summarize multiple columns

We can use summarise with across from dplyr version > = 1.00

library(dplyr)
df %>%
group_by(State, Date) %>%
summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
# A tibble: 6 x 4
# State Date Female Male
# <chr> <chr> <int> <int>
#1 Cali 05/06/2005 3 2
#2 Cali 10/06/2005 4 3
#3 NY 11/06/2005 10 5
#4 NY 12/06/2005 11 6
#5 Texas 01/01/2004 5 3
#6 Texas 02/01/2004 5 4

Or using aggregate from base R

aggregate(.~ State + Date, df, sum, na.rm = TRUE)

data

df <-  structure(list(State = c("Texas", "Texas", "Texas", "Cali", "Cali", 
"Cali", "Cali", "NY", "NY"), Female = c(2L, 3L, 5L, 1L, 2L, 3L,
1L, 10L, 11L), Male = c(2L, 1L, 4L, 1L, 1L, 1L, 2L, 5L, 6L),
Date = c("01/01/2004", "01/01/2004", "02/01/2004", "05/06/2005",
"05/06/2005", "10/06/2005", "10/06/2005", "11/06/2005", "12/06/2005"
)), class = "data.frame", row.names = c(NA, -9L))

Summarise multiple columns in R using `case_when` and %in%

Try this:

func <- function(..., values) {
mtx <- do.call(cbind, list(...))
mtx <- array(mtx %in% values, dim = dim(mtx))
rowSums(mtx) > 0
}

data %>%
mutate(dogs = case_when(
func(labrador, beagle, corgi, values = c("Daily", "Weekly", "Monthly")) ~ "Regularly",
func(labrador, beagle, corgi, values = c("Rarely")) ~ "Rarely",
func(labrador, beagle, corgi, values = c("Never")) ~ "Never" ),
birds = case_when(
func(pigeon, sparrow, robin, values = c("Daily", "Weekly", "Monthly")) ~ "Regularly",
func(pigeon, sparrow, robin, values = c("Rarely")) ~ "Rarely",
func(pigeon, sparrow, robin, values = c("Never")) ~ "Never" )
)
# # A tibble: 3 x 8
# labrador beagle corgi pigeon sparrow robin dogs birds
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 Weekly Rarely Never Rarely Never Rarely Regularly Rarely
# 2 Never Never Monthly Never Never Never Regularly Never
# 3 Rarely Never Never Weekly Never Daily Rarely Regularly

An alternative that does not require func, instead double-pivoting.

library(tidyr) # pivot_*
data <- mutate(data, rn = row_number())
data %>%
pivot_longer(-rn) %>%
mutate(species = case_when(name %in% c("labrador", "beagle", "corgi") ~ "dogs", name %in% c("pigeon", "sparrow", "robin") ~ "birds", TRUE ~ "other")) %>%
group_by(rn, species) %>%
summarize(total = case_when(any(value %in% c("Daily", "Weekly", "Monthly")) ~ "Regularly", any(value %in% c("Rarely")) ~ "Rarely", any(value %in% c("Never")) ~ "Never", TRUE ~ "unk")) %>%
ungroup() %>%
pivot_wider(rn, names_from = species, values_from = total) %>%
left_join(data, ., by = "rn")
# # A tibble: 3 x 9
# labrador beagle corgi pigeon sparrow robin rn birds dogs
# <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr> <chr>
# 1 Weekly Rarely Never Rarely Never Rarely 1 Rarely Regularly
# 2 Never Never Monthly Never Never Never 2 Never Regularly
# 3 Rarely Never Never Weekly Never Daily 3 Regularly Rarely

Using dplyr summarize with different operations for multiple columns

As other people have mentioned, this is normally done by calling summarize_each / summarize_at / summarize_if for every group of columns that you want to apply the summarizing function to. As far as I know, you would have to create a custom function that performs summarizations to each subset. You can for example set the colnames in such way that you can use the select helpers (e.g. contains()) to filter just the columns that you want to apply the function to. If not, then you can set the specific column numbers that you want to summarize.

For the example you mentioned, you could try the following:

summarizer <- function(tb, colsone, colstwo, colsthree, 
funsone, funstwo, funsthree, group_name) {

return(bind_cols(
summarize_all(select(tb, colsone), .funs = funsone),
summarize_all(select(tb, colstwo), .funs = funstwo) %>%
ungroup() %>% select(-matches(group_name)),
summarize_all(select(tb, colsthree), .funs = funsthree) %>%
ungroup() %>% select(-matches(group_name))
))

}

#With colnames
iris %>% as.tibble() %>%
group_by(Species) %>%
summarizer(colsone = contains("Sepal"),
colstwo = matches("Petal.Length"),
colsthree = c(-contains("Sepal"), -matches("Petal.Length")),
funsone = "sum",
funstwo = "mean",
funsthree = "first",
group_name = "Species")

#With indexes
iris %>% as.tibble() %>%
group_by(Species) %>%
summarizer(colsone = 1:2,
colstwo = 3,
colsthree = 4,
funsone = "sum",
funstwo = "mean",
funsthree = "first",
group_name = "Species")

Summarize multiple fields in R and suppressing values less than x

We may reshape into 'long' format with pivot_longer and then do a group by summarise to get the count of 1s and 0s

library(dplyr)
library(tidyr)
library(tibble)
df %>%
pivot_longer(cols = -ID) %>%
group_by(name) %>%
summarise(Interested = sum(value), NotInterested = n() - Interested) %>%
column_to_rownames('name')

-output

            Interested NotInterested
Resource1 3 2
Resource2 1 4
Resource3 2 3
Resource4 3 2

Or using base R

v1 <- colSums(df[-1])
cbind(Interested = v1, NotInterested = nrow(df) - v1)

-output

          Interested NotInterested
Resource1 3 2
Resource2 1 4
Resource3 2 3
Resource4 3 2

data

df <- structure(list(ID = 1:5, Resource1 = c(1L, 0L, 1L, 0L, 1L),
Resource2 = c(0L,
0L, 0L, 0L, 1L), Resource3 = c(1L, 0L, 0L, 0L, 1L), Resource4 = c(1L,
1L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -5L
))

How to summarize across multiple columns with condition on another (grouped) column with dplyr?

Use another across to get corresponding values in column a:c where j is minimum.

library(dplyr)

myDF %>%
group_by(i) %>%
summarize(across(where(is.numeric), median, .names="med_{col}"),
across(a:c, ~.[which.min(j)],.names = 'best_{col}'))

# i med_j med_a med_b med_c best_a best_b best_c
#* <int> <dbl> <int> <int> <int> <int> <int> <int>
#1 1 0.217 4 7 4 7 7 4
#2 2 0.689 6 6 6 8 6 8
#3 3 -0.213 5 2 7 9 1 7

To do it in the same across statement :

myDF %>% 
group_by(i) %>%
summarize(across(where(is.numeric), list(med = median,
best = ~.[which.min(j)]),
.names="{fn}_{col}"))

How to summarize based on multiple columns in R?

Expanding on @Bloxx's answer and incorporating my comment:

# Set up example data frame:
df = data.frame(year=c(rep.int(2004,2),rep.int(2005,4)),
month=((0:5%%4)-2)%%12+1,
Rainfall=seq(.5,by=0.15,length.out=6))

Now use mutate to create year2 variable:

df %>% mutate(year2 = year - (month<3)*1) # or similar depending on the problem specs

And now apply the groupby/summarise action:

df %>% mutate(year2 = year - (month<3)*1) %>% 
group_by(year2) %>%
summarise(Rainfall = mean(Rainfall))


Related Topics



Leave a reply



Submit