Summarizing Multiple Columns With Dplyr

Summarizing multiple columns with dplyr?

In dplyr (>=1.00) you may use across(everything() in summarise to apply a function to all variables:

library(dplyr)

df %>% group_by(grp) %>% summarise(across(everything(), list(mean)))
#> # A tibble: 3 x 5
#>     grp     a     b     c     d
#>   <int> <dbl> <dbl> <dbl> <dbl>
#> 1     1  3.08  2.98  2.98  2.91
#> 2     2  3.03  3.04  2.97  2.87
#> 3     3  2.85  2.95  2.95  3.06

Alternatively, the purrrlyr package provides the same functionality:

library(purrrlyr)
df %>% slice_rows("grp") %>% dmap(mean)
#> # A tibble: 3 x 5
#>     grp     a     b     c     d
#>   <int> <dbl> <dbl> <dbl> <dbl>
#> 1     1  3.08  2.98  2.98  2.91
#> 2     2  3.03  3.04  2.97  2.87
#> 3     3  2.85  2.95  2.95  3.06

Also don't forget about data.table (use keyby to sort sort groups):

library(data.table)
setDT(df)[, lapply(.SD, mean), keyby = grp]
#>    grp        a        b        c        d
#> 1:   1 3.079412 2.979412 2.979412 2.914706
#> 2:   2 3.029126 3.038835 2.967638 2.873786
#> 3:   3 2.854701 2.948718 2.951567 3.062678

Let's try to compare performance.

library(dplyr)
library(purrrlyr)
library(data.table)
library(bench)
set.seed(123)
n <- 10000
df <- data.frame(
  a = sample(1:5, n, replace = TRUE), 
  b = sample(1:5, n, replace = TRUE), 
  c = sample(1:5, n, replace = TRUE), 
  d = sample(1:5, n, replace = TRUE), 
  grp = sample(1:3, n, replace = TRUE)
)
dt <- setDT(df)
mark(
  dplyr = df %>% group_by(grp) %>% summarise(across(everything(), list(mean))),
  purrrlyr = df %>% slice_rows("grp") %>% dmap(mean),
  data.table = dt[, lapply(.SD, mean), keyby = grp],
  check = FALSE
)
#> # A tibble: 3 x 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 dplyr        2.81ms   2.85ms      328.        NA     17.3
#> 2 purrrlyr     7.96ms   8.04ms      123.        NA     24.5
#> 3 data.table 596.33µs 707.91µs     1409.        NA     10.3

Summarise multiple columns using dplyr R

Try this:

df %>%
  group_by(County) %>%
  summarise(across(c(Submissions, Population), sum))

Group by two column and summarize multiple columns

We can use summarise with across from dplyr version > = 1.00

library(dplyr)
df %>%
   group_by(State, Date) %>%
   summarise(across(everything(), sum, na.rm = TRUE), .groups = 'drop')
# A tibble: 6 x 4
#  State Date       Female  Male
#  <chr> <chr>       <int> <int>
#1 Cali  05/06/2005      3     2
#2 Cali  10/06/2005      4     3
#3 NY    11/06/2005     10     5
#4 NY    12/06/2005     11     6
#5 Texas 01/01/2004      5     3
#6 Texas 02/01/2004      5     4

Or using aggregate from base R

aggregate(.~ State + Date, df, sum, na.rm = TRUE)

data

df <-  structure(list(State = c("Texas", "Texas", "Texas", "Cali", "Cali", 
"Cali", "Cali", "NY", "NY"), Female = c(2L, 3L, 5L, 1L, 2L, 3L, 
1L, 10L, 11L), Male = c(2L, 1L, 4L, 1L, 1L, 1L, 2L, 5L, 6L), 
    Date = c("01/01/2004", "01/01/2004", "02/01/2004", "05/06/2005", 
    "05/06/2005", "10/06/2005", "10/06/2005", "11/06/2005", "12/06/2005"
    )), class = "data.frame", row.names = c(NA, -9L))

Summarise multiple columns in R using `case_when` and %in%

Try this:

func <- function(..., values) {
  mtx <- do.call(cbind, list(...))
  mtx <- array(mtx %in% values, dim = dim(mtx))
  rowSums(mtx) > 0
}

data %>%
  mutate(dogs = case_when(
    func(labrador, beagle, corgi, values = c("Daily", "Weekly", "Monthly")) ~ "Regularly", 
    func(labrador, beagle, corgi, values = c("Rarely")) ~ "Rarely", 
    func(labrador, beagle, corgi, values = c("Never")) ~ "Never" ),
  birds = case_when(
    func(pigeon, sparrow, robin, values = c("Daily", "Weekly", "Monthly")) ~ "Regularly", 
    func(pigeon, sparrow, robin, values = c("Rarely")) ~ "Rarely", 
    func(pigeon, sparrow, robin, values = c("Never")) ~ "Never" )
  )
# # A tibble: 3 x 8
#   labrador beagle corgi   pigeon sparrow robin  dogs      birds    
#   <chr>    <chr>  <chr>   <chr>  <chr>   <chr>  <chr>     <chr>    
# 1 Weekly   Rarely Never   Rarely Never   Rarely Regularly Rarely   
# 2 Never    Never  Monthly Never  Never   Never  Regularly Never    
# 3 Rarely   Never  Never   Weekly Never   Daily  Rarely    Regularly

An alternative that does not require func, instead double-pivoting.

library(tidyr) # pivot_*
data <- mutate(data, rn = row_number())
data %>%
  pivot_longer(-rn) %>%
  mutate(species = case_when(name %in% c("labrador", "beagle", "corgi") ~ "dogs", name %in% c("pigeon", "sparrow", "robin") ~ "birds", TRUE ~ "other")) %>%
  group_by(rn, species) %>%
  summarize(total = case_when(any(value %in% c("Daily", "Weekly", "Monthly")) ~ "Regularly", any(value %in% c("Rarely")) ~ "Rarely", any(value %in% c("Never")) ~ "Never", TRUE ~ "unk")) %>%
  ungroup() %>%
  pivot_wider(rn, names_from = species, values_from = total) %>%
  left_join(data, ., by = "rn")
# # A tibble: 3 x 9
#   labrador beagle corgi   pigeon sparrow robin     rn birds     dogs     
#   <chr>    <chr>  <chr>   <chr>  <chr>   <chr>  <int> <chr>     <chr>    
# 1 Weekly   Rarely Never   Rarely Never   Rarely     1 Rarely    Regularly
# 2 Never    Never  Monthly Never  Never   Never      2 Never     Regularly
# 3 Rarely   Never  Never   Weekly Never   Daily      3 Regularly Rarely

Using dplyr summarize with different operations for multiple columns

As other people have mentioned, this is normally done by calling summarize_each / summarize_at / summarize_if for every group of columns that you want to apply the summarizing function to. As far as I know, you would have to create a custom function that performs summarizations to each subset. You can for example set the colnames in such way that you can use the select helpers (e.g. contains()) to filter just the columns that you want to apply the function to. If not, then you can set the specific column numbers that you want to summarize.

For the example you mentioned, you could try the following:

summarizer <- function(tb, colsone, colstwo, colsthree, 
                       funsone, funstwo, funsthree, group_name) {

return(bind_cols(
    summarize_all(select(tb, colsone), .funs = funsone),
    summarize_all(select(tb, colstwo), .funs = funstwo) %>% 
      ungroup() %>% select(-matches(group_name)),
    summarize_all(select(tb, colsthree), .funs = funsthree) %>% 
      ungroup() %>% select(-matches(group_name)) 
))

}

#With colnames
iris %>% as.tibble() %>% 
  group_by(Species) %>% 
  summarizer(colsone = contains("Sepal"), 
         colstwo = matches("Petal.Length"), 
         colsthree = c(-contains("Sepal"), -matches("Petal.Length")),
         funsone = "sum", 
         funstwo = "mean",
         funsthree = "first",
         group_name = "Species")

#With indexes
iris %>% as.tibble() %>% 
 group_by(Species) %>% 
 summarizer(colsone = 1:2, 
         colstwo = 3, 
         colsthree = 4,
         funsone = "sum", 
         funstwo = "mean",
         funsthree = "first",
         group_name = "Species")

Summarize multiple fields in R and suppressing values less than x

We may reshape into 'long' format with pivot_longer and then do a group by summarise to get the count of 1s and 0s

library(dplyr)
library(tidyr)
library(tibble)
df %>% 
   pivot_longer(cols = -ID) %>%
   group_by(name) %>%
   summarise(Interested = sum(value), NotInterested = n() - Interested) %>%
   column_to_rownames('name')

-output

            Interested NotInterested
Resource1          3             2
Resource2          1             4
Resource3          2             3
Resource4          3             2

Or using base R

v1 <- colSums(df[-1])
cbind(Interested = v1, NotInterested = nrow(df) - v1)

-output

          Interested NotInterested
Resource1          3             2
Resource2          1             4
Resource3          2             3
Resource4          3             2

data

df <- structure(list(ID = 1:5, Resource1 = c(1L, 0L, 1L, 0L, 1L),
 Resource2 = c(0L, 
0L, 0L, 0L, 1L), Resource3 = c(1L, 0L, 0L, 0L, 1L), Resource4 = c(1L, 
1L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -5L
))

How to summarize across multiple columns with condition on another (grouped) column with dplyr?

Use another across to get corresponding values in column a:c where j is minimum.

library(dplyr)

myDF %>% 
  group_by(i) %>% 
  summarize(across(where(is.numeric), median, .names="med_{col}"),
            across(a:c,  ~.[which.min(j)],.names = 'best_{col}'))

#      i  med_j med_a med_b med_c best_a best_b best_c
#* <int>  <dbl> <int> <int> <int>  <int>  <int>  <int>
#1     1  0.217     4     7     4      7      7      4
#2     2  0.689     6     6     6      8      6      8
#3     3 -0.213     5     2     7      9      1      7

To do it in the same across statement :

myDF %>% 
  group_by(i) %>% 
  summarize(across(where(is.numeric), list(med = median, 
                                           best = ~.[which.min(j)]), 
                                      .names="{fn}_{col}"))

How to summarize based on multiple columns in R?

Expanding on @Bloxx's answer and incorporating my comment:

# Set up example data frame:
df = data.frame(year=c(rep.int(2004,2),rep.int(2005,4)), 
                month=((0:5%%4)-2)%%12+1,
                Rainfall=seq(.5,by=0.15,length.out=6))

Now use mutate to create year2 variable:

df %>% mutate(year2 = year - (month<3)*1) # or similar depending on the problem specs

And now apply the groupby/summarise action:

df %>% mutate(year2 = year - (month<3)*1) %>% 
       group_by(year2) %>% 
       summarise(Rainfall = mean(Rainfall))

Summarizing Multiple Columns With Dplyr