R Group by Aggregate

How to sum a variable by group

Using aggregate:

aggregate(x$Frequency, by=list(Category=x$Category), FUN=sum)
Category x
1 First 30
2 Second 5
3 Third 34

In the example above, multiple dimensions can be specified in the list. Multiple aggregated metrics of the same data type can be incorporated via cbind:

aggregate(cbind(x$Frequency, x$Metric2, x$Metric3) ...

(embedding @thelatemail comment), aggregate has a formula interface too

aggregate(Frequency ~ Category, x, sum)

Or if you want to aggregate multiple columns, you could use the . notation (works for one column too)

aggregate(. ~ Category, x, sum)

or tapply:

tapply(x$Frequency, x$Category, FUN=sum)
First Second Third
30 5 34

Using this data:

x <- data.frame(Category=factor(c("First", "First", "First", "Second",
"Third", "Third", "Second")),
Frequency=c(10,15,5,2,14,20,3))

Using aggregate/group_by in R to group data and give a count for each factor variable?

With dpylr::count and tidyr::pivot_wider you could do:

library(dplyr)
library(tidyr)

telangiectasia_tumour_data %>%
count(Telangiectasia_time, grade) %>%
pivot_wider(names_from = grade, values_from = n, names_prefix = "grade", values_fill = 0)
#> # A tibble: 4 × 3
#> Telangiectasia_time grade0 grade1
#> <chr> <int> <int>
#> 1 telangiectasia_tumour_0 1 1
#> 2 telangiectasia_tumour_1 1 1
#> 3 telangiectasia_tumour_12 1 0
#> 4 telangiectasia_tumour_24 1 0

DATA

telangiectasia_tumour_data <- structure(list(Telangiectasia_time = c(
"telangiectasia_tumour_0",
"telangiectasia_tumour_1", "telangiectasia_tumour_12", "telangiectasia_tumour_24",
"telangiectasia_tumour_0", "telangiectasia_tumour_1"
), grade = c(
0L,
0L, 0L, 0L, 1L, 1L
)), class = "data.frame", row.names = c(
"1",
"2", "3", "4", "5", "6"
))

R group by aggregate

Here's my solution using aggregate.

First, load the data:

df <- read.table(text = 
"SessionID Price
'1' '624.99'
'1' '697.99'
'1' '649.00'
'7' '779.00'
'7' '710.00'
'7' '2679.50'", header = TRUE)

Then aggregate and match it back to the original data.frame:

tmp <- aggregate(Price ~ SessionID, df, function(x) c(Min = min(x), Max = max(x)))
df <- cbind(df, tmp[match(df$SessionID, tmp$SessionID), 2])
print(df)
# SessionID Price Min Max
#1 1 624.99 624.99 697.99
#2 1 697.99 624.99 697.99
#3 1 649.00 624.99 697.99
#4 7 779.00 710.00 2679.50
#5 7 710.00 710.00 2679.50
#6 7 2679.50 710.00 2679.50

EDIT: As per the comment below, you might wonder why this works. It indeed is somewhat weird. But remember that a data.frame just is a fancy list. Try to call str(tmp), and you'll see that the Price column itself is 2 by 2 numeric matrix. It gets confusing as the print.data.frame knows how to handle this and so print(tmp) looks like there are 3 columns. Anyway, tmp[2] simply access the second column/entry of the data.frame/list and returns that 1 column data.frame while tmp[,2] access the second column and return the data type stored.

R sum a variable by two groups

You can group_by ID and Year then use sum within summarise

library(dplyr)

txt <- "ID Year Amount
3 2000 45
3 2000 55
3 2002 10
3 2002 10
3 2004 30
4 2000 25
4 2002 40
4 2002 15
4 2004 45
4 2004 50"

df <- read.table(text = txt, header = TRUE)

df %>%
group_by(ID, Year) %>%
summarise(Total = sum(Amount, na.rm = TRUE))
#> # A tibble: 6 x 3
#> # Groups: ID [?]
#> ID Year Total
#> <int> <int> <int>
#> 1 3 2000 100
#> 2 3 2002 20
#> 3 3 2004 30
#> 4 4 2000 25
#> 5 4 2002 55
#> 6 4 2004 95

If you have more than one Amount column & want to apply more than one function, you can use either summarise_if or summarise_all

df %>% 
group_by(ID, Year) %>%
summarise_if(is.numeric, funs(sum, mean))
#> # A tibble: 6 x 4
#> # Groups: ID [?]
#> ID Year sum mean
#> <int> <int> <int> <dbl>
#> 1 3 2000 100 50
#> 2 3 2002 20 10
#> 3 3 2004 30 30
#> 4 4 2000 25 25
#> 5 4 2002 55 27.5
#> 6 4 2004 95 47.5

df %>%
group_by(ID, Year) %>%
summarise_all(funs(sum, mean, max, min))
#> # A tibble: 6 x 6
#> # Groups: ID [?]
#> ID Year sum mean max min
#> <int> <int> <int> <dbl> <dbl> <dbl>
#> 1 3 2000 100 50 55 45
#> 2 3 2002 20 10 10 10
#> 3 3 2004 30 30 30 30
#> 4 4 2000 25 25 25 25
#> 5 4 2002 55 27.5 40 15
#> 6 4 2004 95 47.5 50 45

Created on 2018-09-19 by the reprex package (v0.2.1.9000)

Conditional aggregation based on groups in a data frame R

We could use map to loop over the values used in comparison, then grouped by 'Col1', create the two columns within each loop, by taking the sum of 'Col7' which is less than or equal to the value looped, and the sum of corresponding values of 'Col4', where 'Col7' is less than or equal to the value

library(purrr)
library(dplyr)
map_dfc(c(1, 2, 5, 10), ~ Data_Frame %>%
group_by(Col1) %>%
transmute(!! sprintf("Last%dCol7", .x) := sum(Col7 <= .x),
!! sprintf("Last%dSumCol4Col7", .x) := sum(Col4[Col7<= .x])) %>%
ungroup %>%
select(-Col1)) %>%
bind_cols(Data_Frame, .)

-output

#Col1       Col2       Col3 Col4 Col5      Col6 Col7 Last1Col7 Last1SumCol4Col7 Last2Col7 Last2SumCol4Col7 Last5Col7 Last5SumCol4Col7 Last10Col7
#1 A1 2011-03-11 2018-10-22 4 7 9.7917808 10 0 0 0 0 1 2 3
#2 A1 2014-08-21 2019-05-24 2 6 6.3452055 10 0 0 0 0 1 2 3
#3 A1 2016-01-17 2020-12-25 2 3 4.9371585 5 0 0 0 0 1 2 3
#4 A2 2017-06-30 2018-10-12 1 1 3.4712329 5 0 0 0 0 3 9 3
#5 A2 2018-07-11 2019-09-24 4 3 2.4410959 5 0 0 0 0 3 9 3
#6 A2 2018-11-28 2020-12-19 4 2 2.0575342 5 0 0 0 0 3 9 3
#7 A3 2019-09-04 2018-10-22 4 5 1.2931507 2 2 8 3 12 3 12 3
#8 A3 2020-02-29 2019-06-14 4 1 0.8060109 1 2 8 3 12 3 12 3
#9 A3 2020-07-12 2020-12-20 4 2 0.4410959 1 2 8 3 12 3 12 3
# Last10SumCol4Col7
#1 8
#2 8
#3 8
#4 9
#5 9
#6 9
#7 12
#8 12
#9 12

The issue in OP's code giving wrong sum is because Data_Frame[Data_Frame$Col7 <=2, ] is breaking the group and is getting the whole column subset instead of those within the group. Within tidyverse, we don't need to Data_Frame$, if we need to specify the data, use . or cur_data(). Also, here we just need Col7 <=2

How to aggregate characters strings by group in R?

An option is to group by 'DocID', fill the columns 'ElementA', 'ElementB' with adjacent non-NA elements and get the distinct rows

library(dplyr)
library(tidyr)
df1 %>%
group_by(DocID) %>%
fill(ElementA, ElementB, .direction = "downup") %>%
ungroup %>%
distinct

-output

# A tibble: 3 x 3
# DocID ElementA ElementB
# <int> <chr> <chr>
#1 1 A1 B1
#2 2 A2 B2
#3 3 A3 B3

data

df1 <- structure(list(DocID = c(1L, 1L, 2L, 2L, 3L, 3L), ElementA = c("A1", 
NA, "A2", NA, "A3", NA), ElementB = c(NA, "B1", NA, "B2", NA,
"B3")), class = "data.frame", row.names = c(NA, -6L))



Aggregate and count identical rows in r

In SQL terms, you can count rows grouping by all columns and join the result with the initial data.frame.

I recommend using data.table package.

df=data.frame(a=c(1,1,2,3,4,4,4),b=c("a","a","b","b","e","e","f"))

library(data.table)

# convert df to data.table
df=as.data.table(df)

# aggregate df grouping by all columns
clmns=colnames(df)
row_multiplicity=df[,.N,by=clmns]

#join/merge with initial data.frame
new_df=merge(df,row_multiplicity)

Removing variables with group by or aggregate functions

Your use of dat_o[[..]] within the function(x) is always using the whole frame, not just the subset/group you are intending to do. Also, there is no need to use a for loop, we can use .SDcols. I'll demonstrate with mtcars:

library(data.table)
MT <- as.data.table(mtcars)
cols <- c("hp", "wt", "qsec")
MT[, (cols) := lapply(.SD, function(z) fifelse(z %in% boxplot.stats(z)$out, z[NA], z)),
.SDcols = cols][]
# mpg cyl disp hp drat wt qsec vs am gear carb
# <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
# 1: 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
# 2: 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
# 3: 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
# 4: 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
# 5: 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
# 6: 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
# 7: 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
# 8: 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
# 9: 22.8 4 140.8 95 3.92 3.150 NA 1 0 4 2
# 10: 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# 11: 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# 12: 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
# 13: 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
# 14: 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
# 15: 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
# 16: 10.4 8 460.0 215 3.00 NA 17.82 0 0 3 4
# 17: 14.7 8 440.0 230 3.23 NA 17.42 0 0 3 4
# 18: 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
# 19: 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
# 20: 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
# 21: 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
# 22: 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
# 23: 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
# 24: 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
# 25: 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
# 26: 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
# 27: 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
# 28: 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
# 29: 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
# 30: 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# 31: 15.0 8 301.0 NA 3.54 3.570 14.60 0 1 5 8
# 32: 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# mpg cyl disp hp drat wt qsec vs am gear carb

FYI: I used z[NA] instead of just NA because fifelse enforces that both the yes= and no= arguments must be strictly of the same class; an isolated NA is technically of class logical (there are at least six types of NA, fyi), but z[NA] will always return the appropriate class of NA needed to satisfy fifelse. (dplyr::if_else is the same way. I consider base::ifelse a little sloppy ... perhaps more forgiving ... for not enforcing this, though it can lead to surprises if you are not expecting or prepared for it.)

(This methodology can be applied to base or dplyr methods as well.)

Aggregate with adjacent group if value falls below a threshold

I think your example is a bit too minimal for this really challenging question. I added some challenges to your data which I think the approaches of the other answers can't tackle yet. My approach is quite verbose. Essentially, it checks every logical combination / direction in which age buckets could be merged and then recursively merges the age buckets until the threshold is met or until there are no other age buckets left to merge together. With a bit more work we could turn this into a more general function.

library(tidyverse)

demo_data <- as_tibble(VADeaths) %>%
mutate(age_bucket = row.names(VADeaths)) %>%
pivot_longer(-age_bucket) %>%
arrange(name) %>%
# lets add more challenges to the data
mutate(value = case_when(
age_bucket == "55-59" & name == "Rural Female" ~ 2,
age_bucket == "70-74" & name == "Rural Male" ~ 13,
age_bucket == "65-69" & name == "Urban Female" ~ 8,
age_bucket == "70-74" & name == "Urban Male" ~ 3,
T ~ value))

# function that implements merging age buckets
merge_impl <- function(x) {

if(any(x$first)) {
e <- filter(x, first == 1)

if (e$id & !is.na(e$age_max_lead)) {
out <- mutate(x,
age_max = if_else(first,
age_max_lead,
age_max),
value = if_else(first,
value + value_lead,
value))
out <- filter(out, !lag(first, default = FALSE))


} else if (e$id & is.na(e$age_max_lead & !is.na(e$age_min_lag))) {
out <- mutate(x,
age_min = if_else(first,
age_min_lag,
age_min),
value = if_else(first,
value + value_lag,
value))
out <- filter(out, !lead(first, default = FALSE))

} else if (e$id & is.na(e$age_max_lead & is.na(e$age_min_lag))) {
out <- x
} else if (!e$id & !is.na(e$age_min_lag)) {
out <- mutate(x,
age_min = if_else(first,
age_min_lag,
age_min),
value = if_else(first,
value + value_lag,
value))
out <- filter(out, !lead(first, default = FALSE))

} else if (!e$id & is.na(e$age_min_lag) & !is.na(e$age_max_lead)) {
out <- mutate(x,
age_max = if_else(first,
age_max_lead,
age_max),
value = if_else(first,
value + value_lead,
value)) %>%
out <- filter(out, !lag(first, default = FALSE))

} else if (!e$id & is.na(e$age_min_lag) & is.na(e$age_max_lead)) {
out <- x
}
} else {
out <- x
}

select(out,
-contains("lead"), -contains("lag"),
-first, -id)
}

merge_age_buckets <- function(x, threshold) {

# initialize
data_ls <-
x %>%
separate(age_bucket,
c("age_min", "age_max"),
convert = TRUE) %>%
group_by(name) %>%
mutate(across(c(age_min, age_max, value),
list(lead = ~ lead(.x),
lag = ~ lag(.x))
)
) %>%
mutate(id = age_min %% 10 == 0,
first = value < threshold & cumsum(value < threshold) == 1) %>%
group_split

# check & proceed
if(any(map_lgl(data_ls, ~ any(.x$first & nrow(.x) > 1)))) {
res <- map_dfr(data_ls, merge_impl) %>%
mutate(age_bucket = paste0(age_min, "-", age_max)) %>%
select(- c(age_min, age_max))
# if result still needs adjustment repeat
if(any(res$value < threshold)) {
merge_age_buckets(res, threshold = threshold)
} else {
return(res)
}
} else {
out <- reduce(data_ls, bind_rows) %>%
mutate(age_buckets = paste0(age_min, "-", age_max)) %>%
select(- c(age_min, age_max))
return(out)
}
}

merge_age_buckets(demo_data, 15)
#> # A tibble: 13 x 3
#> name value age_bucket
#> <chr> <dbl> <chr>
#> 1 Rural Female 31 50-64
#> 2 Rural Female 30.9 65-69
#> 3 Rural Female 54.3 70-74
#> 4 Rural Male 29.8 50-59
#> 5 Rural Male 26.9 60-64
#> 6 Rural Male 54 65-74
#> 7 Urban Female 22 50-59
#> 8 Urban Female 27.3 60-69
#> 9 Urban Female 50 70-74
#> 10 Urban Male 15.4 50-54
#> 11 Urban Male 24.3 55-59
#> 12 Urban Male 37 60-64
#> 13 Urban Male 57.6 65-74

Created on 2020-06-23 by the reprex package (v0.3.0)



Related Topics



Leave a reply



Submit