Summarizing multiple columns with dplyr?
In dplyr (>= 1.0.0) you can use across(everything()) inside summarise() to apply a function to all variables:
library(dplyr)
# Pass the function itself rather than list(mean): with an unnamed list,
# across() names the results a_1, b_1, ... instead of keeping a, b, c, d.
df %>% group_by(grp) %>% summarise(across(everything(), mean))
#> # A tibble: 3 x 5
#> grp a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 3.08 2.98 2.98 2.91
#> 2 2 3.03 3.04 2.97 2.87
#> 3 3 2.85 2.95 2.95 3.06
Alternatively, the purrrlyr
package provides the same functionality:
library(purrrlyr)
# Split the rows by grp, then apply mean to each remaining column per group.
df %>% slice_rows("grp") %>% dmap(mean)
#> # A tibble: 3 x 5
#> grp a b c d
#> <int> <dbl> <dbl> <dbl> <dbl>
#> 1 1 3.08 2.98 2.98 2.91
#> 2 2 3.03 3.04 2.97 2.87
#> 3 3 2.85 2.95 2.95 3.06
Also don't forget about data.table (use keyby to sort the groups):
library(data.table)
# setDT() converts df to a data.table by reference (df itself is changed).
# .SD stands for the non-grouping columns, so each one is averaged per grp.
setDT(df)[, lapply(.SD, mean), keyby = grp]
#> grp a b c d
#> 1: 1 3.079412 2.979412 2.979412 2.914706
#> 2: 2 3.029126 3.038835 2.967638 2.873786
#> 3: 3 2.854701 2.948718 2.951567 3.062678
Let's try to compare performance.
library(dplyr)
library(purrrlyr)
library(data.table)
library(bench)

# Reproducible example data: four value columns (1-5) and a 3-level group.
set.seed(123)
n <- 10000
df <- data.frame(
  a = sample(1:5, n, replace = TRUE),
  b = sample(1:5, n, replace = TRUE),
  c = sample(1:5, n, replace = TRUE),
  d = sample(1:5, n, replace = TRUE),
  grp = sample(1:3, n, replace = TRUE)
)

# as.data.table() returns a copy. setDT(df) would convert df itself by
# reference, so the dplyr and purrrlyr entries below would silently be timed
# on a data.table instead of a plain data.frame.
dt <- as.data.table(df)

# check = FALSE: the three approaches return different classes, so the
# results are not compared for equality.
mark(
  dplyr = df %>% group_by(grp) %>% summarise(across(everything(), list(mean))),
  purrrlyr = df %>% slice_rows("grp") %>% dmap(mean),
  data.table = dt[, lapply(.SD, mean), keyby = grp],
  check = FALSE
)
#> # A tibble: 3 x 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 dplyr 2.81ms 2.85ms 328. NA 17.3
#> 2 purrrlyr 7.96ms 8.04ms 123. NA 24.5
#> 3 data.table 596.33µs 707.91µs 1409. NA 10.3
Summarise multiple columns using dplyr R
Try this:
# Per-County totals of the Submissions and Population columns.
df %>%
group_by(County) %>%
summarise(across(c(Submissions, Population), sum))
Group by two column and summarize multiple columns
We can use summarise with across from dplyr version >= 1.0.0
library(dplyr)
df %>%
  group_by(State, Date) %>%
  # Wrap sum() in a lambda: forwarding extra arguments such as na.rm through
  # across()'s `...` is deprecated as of dplyr 1.1.0.
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)), .groups = "drop")
# A tibble: 6 x 4
# State Date Female Male
# <chr> <chr> <int> <int>
#1 Cali 05/06/2005 3 2
#2 Cali 10/06/2005 4 3
#3 NY 11/06/2005 10 5
#4 NY 12/06/2005 11 6
#5 Texas 01/01/2004 5 3
#6 Texas 02/01/2004 5 4
Or using aggregate
from base R
aggregate(.~ State + Date, df, sum, na.rm = TRUE)
data
df <- structure(list(State = c("Texas", "Texas", "Texas", "Cali", "Cali",
"Cali", "Cali", "NY", "NY"), Female = c(2L, 3L, 5L, 1L, 2L, 3L,
1L, 10L, 11L), Male = c(2L, 1L, 4L, 1L, 1L, 1L, 2L, 5L, 6L),
Date = c("01/01/2004", "01/01/2004", "02/01/2004", "05/06/2005",
"05/06/2005", "10/06/2005", "10/06/2005", "11/06/2005", "12/06/2005"
)), class = "data.frame", row.names = c(NA, -9L))
Summarise multiple columns in R using `case_when` and %in%
Try this:
# Row-wise membership test across several equal-length vectors.
#
# ...    : vectors (one per column) to inspect, bound side by side.
# values : the set of values to look for.
# Returns a logical vector with one element per row: TRUE when at least one
# of the supplied vectors holds a member of `values` at that position.
func <- function(..., values) {
  columns <- cbind(...)
  is_match <- columns %in% values
  # %in% returns a flat vector; restore the row/column layout for rowSums().
  dim(is_match) <- dim(columns)
  rowSums(is_match) > 0
}
# case_when() keeps the first branch that matches, so a row is "Regularly" if
# any of its columns is Daily/Weekly/Monthly, otherwise "Rarely" if any column
# is Rarely, otherwise "Never"; rows matching no branch become NA.
data %>%
mutate(dogs = case_when(
func(labrador, beagle, corgi, values = c("Daily", "Weekly", "Monthly")) ~ "Regularly",
func(labrador, beagle, corgi, values = c("Rarely")) ~ "Rarely",
func(labrador, beagle, corgi, values = c("Never")) ~ "Never" ),
birds = case_when(
func(pigeon, sparrow, robin, values = c("Daily", "Weekly", "Monthly")) ~ "Regularly",
func(pigeon, sparrow, robin, values = c("Rarely")) ~ "Rarely",
func(pigeon, sparrow, robin, values = c("Never")) ~ "Never" )
)
# # A tibble: 3 x 8
# labrador beagle corgi pigeon sparrow robin dogs birds
# <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 Weekly Rarely Never Rarely Never Rarely Regularly Rarely
# 2 Never Never Monthly Never Never Never Regularly Never
# 3 Rarely Never Never Weekly Never Daily Rarely Regularly
An alternative that does not require func, using double pivoting instead.
library(tidyr) # pivot_*

# Row id so the long-format summary can be joined back to the original rows.
data <- mutate(data, rn = row_number())

data %>%
  pivot_longer(-rn) %>%
  # Tag each original column with the species group it belongs to.
  mutate(species = case_when(
    name %in% c("labrador", "beagle", "corgi") ~ "dogs",
    name %in% c("pigeon", "sparrow", "robin") ~ "birds",
    TRUE ~ "other"
  )) %>%
  group_by(rn, species) %>%
  # One label per row/species; the first matching branch wins.
  # .groups = "drop" replaces the separate ungroup() step and silences the
  # regrouping message from summarize().
  summarize(
    total = case_when(
      any(value %in% c("Daily", "Weekly", "Monthly")) ~ "Regularly",
      any(value %in% c("Rarely")) ~ "Rarely",
      any(value %in% c("Never")) ~ "Never",
      TRUE ~ "unk"
    ),
    .groups = "drop"
  ) %>%
  # Name id_cols explicitly: passing it by position is deprecated in tidyr 1.3.0.
  pivot_wider(id_cols = rn, names_from = species, values_from = total) %>%
  left_join(data, ., by = "rn")
# # A tibble: 3 x 9
# labrador beagle corgi pigeon sparrow robin rn birds dogs
# <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr> <chr>
# 1 Weekly Rarely Never Rarely Never Rarely 1 Rarely Regularly
# 2 Never Never Monthly Never Never Never 2 Never Regularly
# 3 Rarely Never Never Weekly Never Daily 3 Regularly Rarely
Using dplyr summarize with different operations for multiple columns
As other people have mentioned, this is normally done by calling summarize_each / summarize_at / summarize_if for every group of columns that you want to apply the summarizing function to. As far as I know, you would have to create a custom function that performs the summarization on each subset. You can, for example, set the colnames in such a way that you can use the select helpers (e.g. contains()) to filter just the columns that you want to apply the function to. If not, then you can set the specific column numbers that you want to summarize.
For the example you mentioned, you could try the following:
# Summarise three column subsets of a table, each with its own function(s),
# and bind the three results side by side.
#
# tb                          : data frame / tibble; the examples pass one
#                               already grouped with group_by().
# colsone, colstwo, colsthree : column selections handed to select()
#                               (select helpers or column positions).
# funsone, funstwo, funsthree : passed as `.funs` to summarize_all() for the
#                               matching selection.
# group_name                  : pattern given to matches(); columns matching
#                               it are dropped from the 2nd and 3rd pieces so
#                               the grouping column appears only once.
summarizer <- function(tb, colsone, colstwo, colsthree,
                       funsone, funstwo, funsthree, group_name) {
  # {{ }} forwards each selection to select() exactly as the caller wrote it.
  # A bare `colsone` would be looked up as a column of `tb` first and, for
  # numeric indexes, triggers tidyselect's external-vector deprecation.
  part_one <- summarize_all(select(tb, {{ colsone }}), .funs = funsone)
  part_two <- summarize_all(select(tb, {{ colstwo }}), .funs = funstwo) %>%
    ungroup() %>%
    select(-matches(group_name))
  part_three <- summarize_all(select(tb, {{ colsthree }}), .funs = funsthree) %>%
    ungroup() %>%
    select(-matches(group_name))
  bind_cols(part_one, part_two, part_three)
}
# With colnames
# as_tibble() replaces as.tibble(), which is deprecated in tibble 2.0.0.
iris %>% as_tibble() %>%
  group_by(Species) %>%
  summarizer(colsone = contains("Sepal"),
             colstwo = matches("Petal.Length"),
             colsthree = c(-contains("Sepal"), -matches("Petal.Length")),
             funsone = "sum",
             funstwo = "mean",
             funsthree = "first",
             group_name = "Species")
# With indexes
# as_tibble() replaces as.tibble(), which is deprecated in tibble 2.0.0.
iris %>% as_tibble() %>%
  group_by(Species) %>%
  summarizer(colsone = 1:2,
             colstwo = 3,
             colsthree = 4,
             funsone = "sum",
             funstwo = "mean",
             funsthree = "first",
             group_name = "Species")
Summarize multiple fields in R and suppressing values less than x
We may reshape into 'long' format with pivot_longer
and then do a group by summarise
to get the count of 1s and 0s
library(dplyr)
library(tidyr)
library(tibble)
df %>%
# Stack every column except ID into name/value pairs (one row per ID and resource).
pivot_longer(cols = -ID) %>%
group_by(name) %>%
# value is 0/1, so its sum counts the 1s; the remaining rows of the group are the 0s.
summarise(Interested = sum(value), NotInterested = n() - Interested) %>%
column_to_rownames('name')
-output
Interested NotInterested
Resource1 3 2
Resource2 1 4
Resource3 2 3
Resource4 3 2
Or using base R
v1 <- colSums(df[-1])
cbind(Interested = v1, NotInterested = nrow(df) - v1)
-output
Interested NotInterested
Resource1 3 2
Resource2 1 4
Resource3 2 3
Resource4 3 2
data
df <- structure(list(ID = 1:5, Resource1 = c(1L, 0L, 1L, 0L, 1L),
Resource2 = c(0L,
0L, 0L, 0L, 1L), Resource3 = c(1L, 0L, 0L, 0L, 1L), Resource4 = c(1L,
1L, 0L, 0L, 1L)), class = "data.frame", row.names = c(NA, -5L
))
How to summarize across multiple columns with condition on another (grouped) column with dplyr?
Use another across to get the corresponding values in columns a:c where j is at its minimum.
library(dplyr)
myDF %>%
group_by(i) %>%
# First across(): per-group median of every numeric column, named med_<column>.
# Second across(): for a, b and c, the value from the row where j is smallest
# within the group, named best_<column>.
summarize(across(where(is.numeric), median, .names="med_{col}"),
across(a:c, ~.[which.min(j)],.names = 'best_{col}'))
# i med_j med_a med_b med_c best_a best_b best_c
#* <int> <dbl> <int> <int> <int> <int> <int> <int>
#1 1 0.217 4 7 4 7 7 4
#2 2 0.689 6 6 6 8 6 8
#3 3 -0.213 5 2 7 9 1 7
To do it in the same across
statement :
myDF %>%
group_by(i) %>%
# A named list of functions gives one output per function/column pair:
# {fn} is the list name (med, best) and {col} the column name. Because j is
# itself numeric, this version also returns best_j alongside med_j.
summarize(across(where(is.numeric), list(med = median,
best = ~.[which.min(j)]),
.names="{fn}_{col}"))
How to summarize based on multiple columns in R?
Expanding on @Bloxx's answer and incorporating my comment:
# Example data: two rows for 2004 and four for 2005, months 11, 12, 1, 2, 11,
# 12, and rainfall rising from 0.5 in steps of 0.15.
df <- data.frame(
  year = rep.int(c(2004, 2005), times = c(2, 4)),
  month = ((0:5 %% 4) - 2) %% 12 + 1,
  Rainfall = seq(0.5, by = 0.15, length.out = 6)
)
Now use mutate
to create year2
variable:
df %>% mutate(year2 = year - (month<3)*1) # or similar depending on the problem specs
And now apply the groupby/summarise action:
df %>% mutate(year2 = year - (month<3)*1) %>%
group_by(year2) %>%
summarise(Rainfall = mean(Rainfall))
Related Topics
How to Select Variables in an R Dataframe Whose Names Contain a Particular String
Splitting a Large Data Frame into Smaller Segments
How to Test When Condition Returns Numeric(0) in R
Deleting Rows in R Based on Values Over Multiple Columns
Add X and Y Axis to All Facet_Wrap
R: Error in Usemethod("Group_By_"):Applied to an Object of Class
How to Convert Only Some Positive Numbers to Negative Numbers (Conditional Recoding)
How to Replace Negative Values in a Dataframe Column With a Different Value
Error in Confusion Matrix:The Data and Reference Factors Must Have the Same Number of Levels
Too Much White Space Between Caption and Figure Produced by Tikzdevice and Ggplot2 in Latex
Delete Rows With Negative Values
How to Get to the Next Line in the R Command Prompt Without Executing
Append Data Frames Together in a for Loop
Concatenating Two Text Columns in Dplyr
How to Convert a Data Frame Column to Numeric Type
Combing a Categorical Variable to Create a New Categorical Variable in R