Summarise_At Using Different Functions for Different Variables

summarise_at using different functions for different variables

Here is one idea.

library(tidyverse)

df_mean <- df %>%
  group_by(category) %>%
  summarize_at(vars(x), funs(mean(.)))

df_median <- df %>%
  group_by(category) %>%
  summarize_at(vars(y), funs(median(.)))

df_first <- df %>%
  group_by(category) %>%
  summarize_at(vars(z), funs(first(.)))

df_summary <- reduce(list(df_mean, df_median, df_first), 
                     left_join, by = "category")

Like you said, there is no need to use summarise_at for this example. However, if you have a lot of columns need to be summarized by different functions, this strategy may work. You will need to specify the columns in the vars(...) for each summarize_at. The rule is the same as the dplyr::select function.

Update

Here is another idea. Define a function which modifies the summarise_at function, and then use map2 to apply this function with a look-up list showing variables and associated functions to apply. In this example, I applied mean to x and y column and median to z.

# Define a function
summarise_at_fun <- function(variable, func, data){
  data2 <- data %>%
    summarise_at(vars(variable), funs(get(func)(.)))
  return(data2)
}

# Group the data
df2 <- df %>% group_by(category)

# Create a look-up list with function names and variable to apply
look_list <- list(mean = c("x", "y"),
                  median = "z")

# Apply the summarise_at_fun
map2(look_list, names(look_list), summarise_at_fun, data = df2) %>%
  reduce(left_join, by = "category")

# A tibble: 3 x 4
  category     x     y     z
     <chr> <dbl> <dbl> <dbl>
1        a     6     6     0
2        b     5     3     8
3        c     2     6     1

Efficient summarise of different columns with different functions with dplyr

Here is the tidyverse approach adapted from @MichaelDewar's answer, it's tidier but I don't think there is any real improvement in the efficiency. Besides, a dataframe with 100k rows is not that big a deal in my opinion. I think a tidyverse solution is just fine.

library(dplyr)

data %>% 
  group_by(id, date) %>% 
  summarise(
    across(c(var1, var2), ~paste(unique(.), collapse = " AND ")), 
    across(var3, max, na.rm = T), 
    count = n(), .groups = "keep"
  )

But if you really want to improve the efficiency, perhaps try this data.table solution

library(data.table)

setDT(data)[, c(
  lapply(c(var1 = "var1", var2 = "var2"), function(x) paste(unique(.SD[[x]]), collapse = " AND ")), 
  list(var3 = max(var3, na.rm = T), count = .N)
), by = c("id", "date")]

Benchmark

set.seed(2020)
data2 <- data[sample.int(nrow(data), 1e5, T), ]
data22 <- data.table::copy(data2)

f1 <- 
  . %>% 
  group_by(id, date) %>%
  {data.frame(
    summarise_at(., vars(var1, var2), list(~ paste(unique(.), collapse = " AND "))), # return character string
    summarise_at(., vars(var3), list(~ max(., na.rm = T))),#, # return max in group
    summarise(., count = n(), .groups = "keep") # return count of cases in group
  )} %>% 
  select(-matches("[.]1$|[.]2$|[.]3$")) %>% # remove unwanted columns
  as_tibble()

f2 <- 
  . %>% 
  group_by(id, date) %>% 
  summarise(
    across(c(var1, var2), ~paste(unique(.), collapse = " AND ")), 
    across(var3, max, na.rm = T), 
    count = n(), 
    .groups = "keep"
  )

f3 <- function(dt) {
  setDT(dt)[, c(
    lapply(c(var1 = "var1", var2 = "var2"), function(x) paste(unique(.SD[[x]]), collapse = " AND ")), 
    list(var3 = max(var3, na.rm = T), count = .N)
  ), by = c("id", "date")]
}

microbenchmark::microbenchmark(f1(data2), f2(data2), f3(data22))

Result

Unit: milliseconds
       expr     min       lq      mean   median       uq     max neval cld
  f1(data2) 19.6730 20.27990 20.841344 20.50850 20.85045 29.2799   100   c
  f2(data2) 13.5455 14.09240 14.705967 14.34585 14.64625 20.5914   100  b 
 f3(data22)  6.9186  7.80615  8.598227  8.32035  8.68040 15.8358   100 a

Can I use summarise_at for existing variables while adding other variables at the same time?

The only way I can think of (at the moment) is the store the data immediately before your first summary, then run two summary verbs, and join them on the grouped variable. For instance:

library(dplyr)

grouped_data <- group_by(mtcars, cyl)
left_join(
  summarize(grouped_data, blah = mean(disp)),
  summarize_at(grouped_data, vars(vs:carb), sum),
  by = "cyl")
# # A tibble: 3 x 6
#     cyl  blah    vs    am  gear  carb
#   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1     4  105.    10     8    45    17
# 2     6  183.     4     3    27    24
# 3     8  353.     0     2    46    49

Summing and calculating mean for different columns in dplyr

Here's a way that is not covered in my linked duplicate in comments. This is a horrible way but could be okay if data is small and/or your summary functions are fast. Basically I am applying all functions to all desired variables and then selecting the needed combinations.

Personally, I don't think this worth it but just putting it out here -

mtcars %>% 
  group_by(cyl) %>% 
  summarize_at(c(3,4,5,6), list(mean = ~mean(.), sum = ~sum(.), median = ~median(.))) %>% 
  select(cyl, hp_mean, drat_sum, wt_median, qsec_median) 
  # select can be automated based on patterns

# A tibble: 3 x 5
    cyl hp_mean drat_sum wt_median qsec_median
  <dbl>   <dbl>    <dbl>     <dbl>       <dbl>
1     4    82.6     44.8      2.2         18.9
2     6   122.      25.1      3.22        18.3
3     8   209.      45.2      3.76        17.2

Keeping excluded variables in summarise_at

In the devel version of dplyr, we can use summarise with across, but still not sure what values we want for 'Depth', 'cps', so it is converted to a list

library(dplyr)
df %>%
    summarise(across(Al:Fe, mean), across(Depth:cps, list))
# A tibble: 1 x 5
#     Al     Si     Fe Depth      cps       
#   <dbl>  <dbl>  <dbl> <list>     <list>    
#1 -0.438 -0.118 -0.590 <dbl [11]> <dbl [11]>

Or to get the first row

df %>%
    summarise(across(Al:Fe, mean), across(Depth:cps, first))
# A tibble: 1 x 5
#      Al     Si     Fe Depth   cps
#  <dbl>  <dbl>  <dbl> <dbl> <dbl>
#1 -0.438 -0.118 -0.590     0 51432

Or to subset the median element of 'Depth'

df %>% 
    summarise(across(Al:Fe, mean), across(Depth:cps, ~ .[Depth == median(Depth)]))
# A tibble: 1 x 5
#      Al     Si     Fe Depth   cps
#   <dbl>  <dbl>  <dbl> <dbl> <dbl>
#1 -0.438 -0.118 -0.590     1 51753

If we need the first row, then mutate and slice the first row

df %>%
    mutate_at(vars(-c(Depth, cps)), mean) %>%
    slice(1)
# A tibble: 1 x 5
#  Depth   cps     Al     Si     Fe
#  <dbl> <dbl>  <dbl>  <dbl>  <dbl>
#1     0 51432 -0.438 -0.118 -0.590

Or if it needs to be the median row

df %>%
  mutate_at(vars(-c(Depth, cps)), mean) %>%
  filter(Depth == median(Depth))
# A tibble: 1 x 5
# Depth   cps     Al     Si     Fe
#   <dbl> <dbl>  <dbl>  <dbl>  <dbl>
#1     1 51753 -0.438 -0.118 -0.590

R Summary table in percentage with summarise_at or _all using 3 different functions and reduce inner join

You ca use across() to apply a summarizing function to a set of columns

df_ask %>%
  mutate(Yes.Response = Response %in% c("yes", "yes, change request"),
         No.Response = Response %in% c( "no", "declined","no response"),
         Is.Public = Private == "public",
         Is.Private = Private == "private") %>%
  group_by(Member, Year) %>%
  summarise(Total.Name = sum( !is.na( Name )),
            across( where(is.logical), ~paste0( round( 100 * sum(.) / Total.Name), "%")))

# A tibble: 5 x 7
# Groups:   Member [2]
  Member  Year Total.Name Yes.Response No.Response Is.Public Is.Private
  <chr>  <dbl>      <int> <chr>        <chr>       <chr>     <chr>     
1 API     2017          3 33%          67%         33%       33%       
2 API     2018          2 50%          50%         50%       50%       
3 KARA    2017          3 33%          67%         33%       33%       
4 KARA    2018          3 67%          33%         67%       33%       
5 KARA    2019          1 0%           100%        0%        100%

Summarise_At Using Different Functions for Different Variables