summarise_at using different functions for different variables
Here is one idea.
library(tidyverse)
df_mean <- df %>%
group_by(category) %>%
summarize_at(vars(x), funs(mean(.)))
df_median <- df %>%
group_by(category) %>%
summarize_at(vars(y), funs(median(.)))
df_first <- df %>%
group_by(category) %>%
summarize_at(vars(z), funs(first(.)))
df_summary <- reduce(list(df_mean, df_median, df_first),
left_join, by = "category")
Like you said, there is no need to use summarise_at
for this example. However, if you have a lot of columns need to be summarized by different functions, this strategy may work. You will need to specify the columns in the vars(...)
for each summarize_at
. The rule is the same as the dplyr::select
function.
Update
Here is another idea. Define a function which modifies the summarise_at
function, and then use map2
to apply this function with a look-up list showing variables and associated functions to apply. In this example, I applied mean
to x
and y
column and median
to z
.
# Define a function
summarise_at_fun <- function(variable, func, data){
data2 <- data %>%
summarise_at(vars(variable), funs(get(func)(.)))
return(data2)
}
# Group the data
df2 <- df %>% group_by(category)
# Create a look-up list with function names and variable to apply
look_list <- list(mean = c("x", "y"),
median = "z")
# Apply the summarise_at_fun
map2(look_list, names(look_list), summarise_at_fun, data = df2) %>%
reduce(left_join, by = "category")
# A tibble: 3 x 4
category x y z
<chr> <dbl> <dbl> <dbl>
1 a 6 6 0
2 b 5 3 8
3 c 2 6 1
Efficient summarise of different columns with different functions with dplyr
Here is the tidyverse
approach adapted from @MichaelDewar's answer, it's tidier but I don't think there is any real improvement in the efficiency. Besides, a dataframe with 100k rows is not that big a deal in my opinion. I think a tidyverse
solution is just fine.
library(dplyr)
data %>%
group_by(id, date) %>%
summarise(
across(c(var1, var2), ~paste(unique(.), collapse = " AND ")),
across(var3, max, na.rm = T),
count = n(), .groups = "keep"
)
But if you really want to improve the efficiency, perhaps try this data.table solution
library(data.table)
setDT(data)[, c(
lapply(c(var1 = "var1", var2 = "var2"), function(x) paste(unique(.SD[[x]]), collapse = " AND ")),
list(var3 = max(var3, na.rm = T), count = .N)
), by = c("id", "date")]
Benchmark
set.seed(2020)
data2 <- data[sample.int(nrow(data), 1e5, T), ]
data22 <- data.table::copy(data2)
f1 <-
. %>%
group_by(id, date) %>%
{data.frame(
summarise_at(., vars(var1, var2), list(~ paste(unique(.), collapse = " AND "))), # return character string
summarise_at(., vars(var3), list(~ max(., na.rm = T))),#, # return max in group
summarise(., count = n(), .groups = "keep") # return count of cases in group
)} %>%
select(-matches("[.]1$|[.]2$|[.]3$")) %>% # remove unwanted columns
as_tibble()
f2 <-
. %>%
group_by(id, date) %>%
summarise(
across(c(var1, var2), ~paste(unique(.), collapse = " AND ")),
across(var3, max, na.rm = T),
count = n(),
.groups = "keep"
)
f3 <- function(dt) {
setDT(dt)[, c(
lapply(c(var1 = "var1", var2 = "var2"), function(x) paste(unique(.SD[[x]]), collapse = " AND ")),
list(var3 = max(var3, na.rm = T), count = .N)
), by = c("id", "date")]
}
microbenchmark::microbenchmark(f1(data2), f2(data2), f3(data22))
Result
Unit: milliseconds
expr min lq mean median uq max neval cld
f1(data2) 19.6730 20.27990 20.841344 20.50850 20.85045 29.2799 100 c
f2(data2) 13.5455 14.09240 14.705967 14.34585 14.64625 20.5914 100 b
f3(data22) 6.9186 7.80615 8.598227 8.32035 8.68040 15.8358 100 a
Can I use summarise_at for existing variables while adding other variables at the same time?
The only way I can think of (at the moment) is the store the data immediately before your first summary, then run two summary verbs, and join them on the grouped variable. For instance:
library(dplyr)
grouped_data <- group_by(mtcars, cyl)
left_join(
summarize(grouped_data, blah = mean(disp)),
summarize_at(grouped_data, vars(vs:carb), sum),
by = "cyl")
# # A tibble: 3 x 6
# cyl blah vs am gear carb
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 4 105. 10 8 45 17
# 2 6 183. 4 3 27 24
# 3 8 353. 0 2 46 49
Summing and calculating mean for different columns in dplyr
Here's a way that is not covered in my linked duplicate in comments. This is a horrible way but could be okay if data is small and/or your summary functions are fast. Basically I am applying all functions to all desired variables and then selecting the needed combinations.
Personally, I don't think this worth it but just putting it out here -
mtcars %>%
group_by(cyl) %>%
summarize_at(c(3,4,5,6), list(mean = ~mean(.), sum = ~sum(.), median = ~median(.))) %>%
select(cyl, hp_mean, drat_sum, wt_median, qsec_median)
# select can be automated based on patterns
# A tibble: 3 x 5
cyl hp_mean drat_sum wt_median qsec_median
<dbl> <dbl> <dbl> <dbl> <dbl>
1 4 82.6 44.8 2.2 18.9
2 6 122. 25.1 3.22 18.3
3 8 209. 45.2 3.76 17.2
Keeping excluded variables in summarise_at
In the devel
version of dplyr
, we can use summarise
with across
, but still not sure what values we want for 'Depth', 'cps', so it is converted to a list
library(dplyr)
df %>%
summarise(across(Al:Fe, mean), across(Depth:cps, list))
# A tibble: 1 x 5
# Al Si Fe Depth cps
# <dbl> <dbl> <dbl> <list> <list>
#1 -0.438 -0.118 -0.590 <dbl [11]> <dbl [11]>
Or to get the first
row
df %>%
summarise(across(Al:Fe, mean), across(Depth:cps, first))
# A tibble: 1 x 5
# Al Si Fe Depth cps
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 -0.438 -0.118 -0.590 0 51432
Or to subset the median
element of 'Depth'
df %>%
summarise(across(Al:Fe, mean), across(Depth:cps, ~ .[Depth == median(Depth)]))
# A tibble: 1 x 5
# Al Si Fe Depth cps
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 -0.438 -0.118 -0.590 1 51753
If we need the first row, then mutate
and slice
the first row
df %>%
mutate_at(vars(-c(Depth, cps)), mean) %>%
slice(1)
# A tibble: 1 x 5
# Depth cps Al Si Fe
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 51432 -0.438 -0.118 -0.590
Or if it needs to be the median
row
df %>%
mutate_at(vars(-c(Depth, cps)), mean) %>%
filter(Depth == median(Depth))
# A tibble: 1 x 5
# Depth cps Al Si Fe
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 51753 -0.438 -0.118 -0.590
R Summary table in percentage with summarise_at or _all using 3 different functions and reduce inner join
You ca use across()
to apply a summarizing function to a set of columns
df_ask %>%
mutate(Yes.Response = Response %in% c("yes", "yes, change request"),
No.Response = Response %in% c( "no", "declined","no response"),
Is.Public = Private == "public",
Is.Private = Private == "private") %>%
group_by(Member, Year) %>%
summarise(Total.Name = sum( !is.na( Name )),
across( where(is.logical), ~paste0( round( 100 * sum(.) / Total.Name), "%")))
# A tibble: 5 x 7
# Groups: Member [2]
Member Year Total.Name Yes.Response No.Response Is.Public Is.Private
<chr> <dbl> <int> <chr> <chr> <chr> <chr>
1 API 2017 3 33% 67% 33% 33%
2 API 2018 2 50% 50% 50% 50%
3 KARA 2017 3 33% 67% 33% 33%
4 KARA 2018 3 67% 33% 67% 33%
5 KARA 2019 1 0% 100% 0% 100%
Related Topics
R-Project Filepath from Concatenation
Using Lapply and Read.CSV on Multiple Files (In R)
Control Number of Decimal Places on Xtable Output in R
Error in Install.Packages:Cannot Remove Prior Installation of Package 'Dbi'
Hide Certain Columns in a Responsive Data Table Using Dt Package
How to Add a Prefix to Several Variable Names Using Dplyr
When Using Ggplot in R, How to Remove Margins Surrounding the Plot Area
How to Create Base R Plot 'Type = B' Equivalent in Ggplot2
Hollow Histogram or Binning for Geom_Step
Justification of Multiple Legends in Ggmap/Ggplot2
Replacing All Occurrences of a Pattern in a String
R Shiny Conditionalpanel Output Value
How to Read \" Double-Quote Escaped Values with Read.Table in R