R Group by Aggregate

How to sum a variable by group

Using aggregate:

aggregate(x$Frequency, by=list(Category=x$Category), FUN=sum)
  Category  x
1    First 30
2   Second  5
3    Third 34

In the example above, multiple dimensions can be specified in the list. Multiple aggregated metrics of the same data type can be incorporated via cbind:

aggregate(cbind(x$Frequency, x$Metric2, x$Metric3) ...

(embedding @thelatemail comment), aggregate has a formula interface too

aggregate(Frequency ~ Category, x, sum)

Or if you want to aggregate multiple columns, you could use the . notation (works for one column too)

aggregate(. ~ Category, x, sum)

or tapply:

tapply(x$Frequency, x$Category, FUN=sum)
 First Second  Third 
    30      5     34

Using this data:

x <- data.frame(Category=factor(c("First", "First", "First", "Second",
                                      "Third", "Third", "Second")), 
                    Frequency=c(10,15,5,2,14,20,3))

Using aggregate/group_by in R to group data and give a count for each factor variable?

With dpylr::count and tidyr::pivot_wider you could do:

library(dplyr)
library(tidyr)

telangiectasia_tumour_data %>% 
  count(Telangiectasia_time, grade) %>% 
  pivot_wider(names_from = grade, values_from = n, names_prefix = "grade", values_fill = 0)
#> # A tibble: 4 × 3
#>   Telangiectasia_time      grade0 grade1
#>   <chr>                     <int>  <int>
#> 1 telangiectasia_tumour_0       1      1
#> 2 telangiectasia_tumour_1       1      1
#> 3 telangiectasia_tumour_12      1      0
#> 4 telangiectasia_tumour_24      1      0

DATA

telangiectasia_tumour_data <- structure(list(Telangiectasia_time = c(
  "telangiectasia_tumour_0",
  "telangiectasia_tumour_1", "telangiectasia_tumour_12", "telangiectasia_tumour_24",
  "telangiectasia_tumour_0", "telangiectasia_tumour_1"
), grade = c(
  0L,
  0L, 0L, 0L, 1L, 1L
)), class = "data.frame", row.names = c(
  "1",
  "2", "3", "4", "5", "6"
))

R group by aggregate

Here's my solution using aggregate.

First, load the data:

df <- read.table(text = 
"SessionID   Price
'1'       '624.99'
'1'       '697.99'
'1'       '649.00'
'7'       '779.00'
'7'       '710.00'
'7'       '2679.50'", header = TRUE)

Then aggregate and match it back to the original data.frame:

tmp <- aggregate(Price ~ SessionID, df, function(x) c(Min = min(x), Max = max(x)))
df <- cbind(df, tmp[match(df$SessionID, tmp$SessionID), 2])
print(df)
#  SessionID   Price    Min     Max
#1         1  624.99 624.99  697.99
#2         1  697.99 624.99  697.99
#3         1  649.00 624.99  697.99
#4         7  779.00 710.00 2679.50
#5         7  710.00 710.00 2679.50
#6         7 2679.50 710.00 2679.50

EDIT: As per the comment below, you might wonder why this works. It indeed is somewhat weird. But remember that a data.frame just is a fancy list. Try to call str(tmp), and you'll see that the Price column itself is 2 by 2 numeric matrix. It gets confusing as the print.data.frame knows how to handle this and so print(tmp) looks like there are 3 columns. Anyway, tmp[2] simply access the second column/entry of the data.frame/list and returns that 1 column data.frame while tmp[,2] access the second column and return the data type stored.

R sum a variable by two groups

You can group_by ID and Year then use sum within summarise

library(dplyr)

txt <- "ID Year Amount
3 2000 45
3 2000 55
3 2002 10
3 2002 10
3 2004 30
4 2000 25
4 2002 40
4 2002 15
4 2004 45
4 2004 50"

df <- read.table(text = txt, header = TRUE)

df %>% 
  group_by(ID, Year) %>% 
  summarise(Total = sum(Amount, na.rm = TRUE))
#> # A tibble: 6 x 3
#> # Groups:   ID [?]
#>      ID  Year Total
#>   <int> <int> <int>
#> 1     3  2000   100
#> 2     3  2002    20
#> 3     3  2004    30
#> 4     4  2000    25
#> 5     4  2002    55
#> 6     4  2004    95

If you have more than one Amount column & want to apply more than one function, you can use either summarise_if or summarise_all

df %>% 
  group_by(ID, Year) %>% 
  summarise_if(is.numeric, funs(sum, mean))
#> # A tibble: 6 x 4
#> # Groups:   ID [?]
#>      ID  Year   sum  mean
#>   <int> <int> <int> <dbl>
#> 1     3  2000   100  50  
#> 2     3  2002    20  10  
#> 3     3  2004    30  30  
#> 4     4  2000    25  25  
#> 5     4  2002    55  27.5
#> 6     4  2004    95  47.5

df %>% 
  group_by(ID, Year) %>% 
  summarise_all(funs(sum, mean, max, min))
#> # A tibble: 6 x 6
#> # Groups:   ID [?]
#>      ID  Year   sum  mean   max   min
#>   <int> <int> <int> <dbl> <dbl> <dbl>
#> 1     3  2000   100  50      55    45
#> 2     3  2002    20  10      10    10
#> 3     3  2004    30  30      30    30
#> 4     4  2000    25  25      25    25
#> 5     4  2002    55  27.5    40    15
#> 6     4  2004    95  47.5    50    45

^{Created on 2018-09-19 by the reprex package (v0.2.1.9000)}

Conditional aggregation based on groups in a data frame R

We could use map to loop over the values used in comparison, then grouped by 'Col1', create the two columns within each loop, by taking the sum of 'Col7' which is less than or equal to the value looped, and the sum of corresponding values of 'Col4', where 'Col7' is less than or equal to the value

library(purrr)
library(dplyr)
map_dfc(c(1, 2, 5, 10), ~ Data_Frame %>%
     group_by(Col1) %>% 
     transmute(!! sprintf("Last%dCol7", .x) := sum(Col7 <= .x),
               !! sprintf("Last%dSumCol4Col7", .x) := sum(Col4[Col7<= .x])) %>% 
     ungroup %>%
     select(-Col1)) %>% 
 bind_cols(Data_Frame, .)

-output

#Col1       Col2       Col3 Col4 Col5      Col6 Col7 Last1Col7 Last1SumCol4Col7 Last2Col7 Last2SumCol4Col7 Last5Col7 Last5SumCol4Col7 Last10Col7
#1   A1 2011-03-11 2018-10-22    4    7 9.7917808   10         0                0         0                0         1                2          3
#2   A1 2014-08-21 2019-05-24    2    6 6.3452055   10         0                0         0                0         1                2          3
#3   A1 2016-01-17 2020-12-25    2    3 4.9371585    5         0                0         0                0         1                2          3
#4   A2 2017-06-30 2018-10-12    1    1 3.4712329    5         0                0         0                0         3                9          3
#5   A2 2018-07-11 2019-09-24    4    3 2.4410959    5         0                0         0                0         3                9          3
#6   A2 2018-11-28 2020-12-19    4    2 2.0575342    5         0                0         0                0         3                9          3
#7   A3 2019-09-04 2018-10-22    4    5 1.2931507    2         2                8         3               12         3               12          3
#8   A3 2020-02-29 2019-06-14    4    1 0.8060109    1         2                8         3               12         3               12          3
#9   A3 2020-07-12 2020-12-20    4    2 0.4410959    1         2                8         3               12         3               12          3
#  Last10SumCol4Col7
#1                 8
#2                 8
#3                 8
#4                 9
#5                 9
#6                 9
#7                12
#8                12
#9                12

The issue in OP's code giving wrong sum is because Data_Frame[Data_Frame$Col7 <=2, ] is breaking the group and is getting the whole column subset instead of those within the group. Within tidyverse, we don't need to Data_Frame$, if we need to specify the data, use . or cur_data(). Also, here we just need Col7 <=2

How to aggregate characters strings by group in R?

An option is to group by 'DocID', fill the columns 'ElementA', 'ElementB' with adjacent non-NA elements and get the distinct rows

library(dplyr)
library(tidyr)
df1 %>%
   group_by(DocID) %>%
   fill(ElementA, ElementB, .direction = "downup") %>%
   ungroup %>%
   distinct

-output

# A tibble: 3 x 3
#  DocID ElementA ElementB
#  <int> <chr>    <chr>   
#1     1 A1       B1      
#2     2 A2       B2      
#3     3 A3       B3

data

df1 <- structure(list(DocID = c(1L, 1L, 2L, 2L, 3L, 3L), ElementA = c("A1", 
NA, "A2", NA, "A3", NA), ElementB = c(NA, "B1", NA, "B2", NA, 
"B3")), class = "data.frame", row.names = c(NA, -6L))

Aggregate and count identical rows in r

In SQL terms, you can count rows grouping by all columns and join the result with the initial data.frame.

I recommend using data.table package.

df=data.frame(a=c(1,1,2,3,4,4,4),b=c("a","a","b","b","e","e","f"))

library(data.table)

# convert df to data.table
df=as.data.table(df)

# aggregate df grouping by all columns
clmns=colnames(df)
row_multiplicity=df[,.N,by=clmns]

#join/merge with initial data.frame
new_df=merge(df,row_multiplicity)

Removing variables with group by or aggregate functions

Your use of dat_o[[..]] within the function(x) is always using the whole frame, not just the subset/group you are intending to do. Also, there is no need to use a for loop, we can use .SDcols. I'll demonstrate with mtcars:

library(data.table)
MT <- as.data.table(mtcars)
cols <- c("hp", "wt", "qsec")
MT[, (cols) := lapply(.SD, function(z) fifelse(z %in% boxplot.stats(z)$out, z[NA], z)),
    .SDcols = cols][]
#       mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb
#     <num> <num> <num> <num> <num> <num> <num> <num> <num> <num> <num>
#  1:  21.0     6 160.0   110  3.90 2.620 16.46     0     1     4     4
#  2:  21.0     6 160.0   110  3.90 2.875 17.02     0     1     4     4
#  3:  22.8     4 108.0    93  3.85 2.320 18.61     1     1     4     1
#  4:  21.4     6 258.0   110  3.08 3.215 19.44     1     0     3     1
#  5:  18.7     8 360.0   175  3.15 3.440 17.02     0     0     3     2
#  6:  18.1     6 225.0   105  2.76 3.460 20.22     1     0     3     1
#  7:  14.3     8 360.0   245  3.21 3.570 15.84     0     0     3     4
#  8:  24.4     4 146.7    62  3.69 3.190 20.00     1     0     4     2
#  9:  22.8     4 140.8    95  3.92 3.150    NA     1     0     4     2
# 10:  19.2     6 167.6   123  3.92 3.440 18.30     1     0     4     4
# 11:  17.8     6 167.6   123  3.92 3.440 18.90     1     0     4     4
# 12:  16.4     8 275.8   180  3.07 4.070 17.40     0     0     3     3
# 13:  17.3     8 275.8   180  3.07 3.730 17.60     0     0     3     3
# 14:  15.2     8 275.8   180  3.07 3.780 18.00     0     0     3     3
# 15:  10.4     8 472.0   205  2.93 5.250 17.98     0     0     3     4
# 16:  10.4     8 460.0   215  3.00    NA 17.82     0     0     3     4
# 17:  14.7     8 440.0   230  3.23    NA 17.42     0     0     3     4
# 18:  32.4     4  78.7    66  4.08 2.200 19.47     1     1     4     1
# 19:  30.4     4  75.7    52  4.93 1.615 18.52     1     1     4     2
# 20:  33.9     4  71.1    65  4.22 1.835 19.90     1     1     4     1
# 21:  21.5     4 120.1    97  3.70 2.465 20.01     1     0     3     1
# 22:  15.5     8 318.0   150  2.76 3.520 16.87     0     0     3     2
# 23:  15.2     8 304.0   150  3.15 3.435 17.30     0     0     3     2
# 24:  13.3     8 350.0   245  3.73 3.840 15.41     0     0     3     4
# 25:  19.2     8 400.0   175  3.08 3.845 17.05     0     0     3     2
# 26:  27.3     4  79.0    66  4.08 1.935 18.90     1     1     4     1
# 27:  26.0     4 120.3    91  4.43 2.140 16.70     0     1     5     2
# 28:  30.4     4  95.1   113  3.77 1.513 16.90     1     1     5     2
# 29:  15.8     8 351.0   264  4.22 3.170 14.50     0     1     5     4
# 30:  19.7     6 145.0   175  3.62 2.770 15.50     0     1     5     6
# 31:  15.0     8 301.0    NA  3.54 3.570 14.60     0     1     5     8
# 32:  21.4     4 121.0   109  4.11 2.780 18.60     1     1     4     2
#       mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear  carb

FYI: I used z[NA] instead of just NA because fifelse enforces that both the yes= and no= arguments must be strictly of the same class; an isolated NA is technically of class logical (there are at least six types of NA, fyi), but z[NA] will always return the appropriate class of NA needed to satisfy fifelse. (dplyr::if_else is the same way. I consider base::ifelse a little sloppy ... perhaps more forgiving ... for not enforcing this, though it can lead to surprises if you are not expecting or prepared for it.)

(This methodology can be applied to base or dplyr methods as well.)

Aggregate with adjacent group if value falls below a threshold

I think your example is a bit too minimal for this really challenging question. I added some challenges to your data which I think the approaches of the other answers can't tackle yet. My approach is quite verbose. Essentially, it checks every logical combination / direction in which age buckets could be merged and then recursively merges the age buckets until the threshold is met or until there are no other age buckets left to merge together. With a bit more work we could turn this into a more general function.

library(tidyverse)

demo_data <- as_tibble(VADeaths) %>% 
  mutate(age_bucket = row.names(VADeaths)) %>% 
  pivot_longer(-age_bucket) %>% 
  arrange(name) %>% 
  # lets add more challenges to the data
  mutate(value = case_when(
    age_bucket == "55-59" & name == "Rural Female" ~ 2,
    age_bucket == "70-74" & name == "Rural Male" ~ 13,
    age_bucket == "65-69" & name == "Urban Female" ~ 8,
    age_bucket == "70-74" & name == "Urban Male" ~ 3,
    T ~ value))

# function that implements merging age buckets
merge_impl <- function(x) {
  
  if(any(x$first)) {
    e <- filter(x, first == 1)
    
    if (e$id & !is.na(e$age_max_lead)) {
      out <- mutate(x,
                     age_max = if_else(first,
                                       age_max_lead,
                                       age_max),
                     value = if_else(first,
                                     value + value_lead,
                                     value)) 
      out <- filter(out, !lag(first, default = FALSE))

      
    } else if (e$id & is.na(e$age_max_lead & !is.na(e$age_min_lag))) {
      out <- mutate(x,
                     age_min = if_else(first,
                                       age_min_lag,
                                       age_min),
                     value = if_else(first,
                                     value + value_lag,
                                     value))
      out <- filter(out, !lead(first, default = FALSE))
      
    } else if (e$id & is.na(e$age_max_lead & is.na(e$age_min_lag))) {
      out <- x
    } else if (!e$id & !is.na(e$age_min_lag)) {
      out <- mutate(x,
                     age_min = if_else(first,
                                       age_min_lag,
                                       age_min),
                     value = if_else(first,
                                     value + value_lag,
                                     value)) 
      out <- filter(out, !lead(first, default = FALSE))

    } else if (!e$id & is.na(e$age_min_lag) & !is.na(e$age_max_lead)) {
      out <- mutate(x,
                     age_max = if_else(first,
                                       age_max_lead,
                                       age_max),
                     value = if_else(first,
                                     value + value_lead,
                                     value)) %>% 
        out <- filter(out, !lag(first, default = FALSE))

    } else if (!e$id & is.na(e$age_min_lag) & is.na(e$age_max_lead)) {
      out <- x
    }
  } else { 
    out <- x
  }

  select(out,
         -contains("lead"), -contains("lag"),
         -first, -id)
}

merge_age_buckets <- function(x, threshold) {
  
  # initialize
  data_ls <-
    x %>% 
    separate(age_bucket,
             c("age_min", "age_max"),
             convert = TRUE) %>% 
    group_by(name) %>% 
    mutate(across(c(age_min, age_max, value),
                    list(lead = ~ lead(.x),
                         lag  = ~ lag(.x))
                   )
    ) %>% 
    mutate(id = age_min %% 10 == 0,
           first = value < threshold & cumsum(value < threshold) == 1) %>% 
    group_split 

   # check & proceed
   if(any(map_lgl(data_ls, ~ any(.x$first & nrow(.x) > 1)))) {
     res <- map_dfr(data_ls, merge_impl) %>% 
       mutate(age_bucket = paste0(age_min, "-", age_max)) %>% 
       select(- c(age_min, age_max))
     # if result still needs adjustment repeat
     if(any(res$value < threshold)) {
       merge_age_buckets(res, threshold = threshold)
     } else {
       return(res)
       }
   } else {
     out <- reduce(data_ls, bind_rows) %>% 
       mutate(age_buckets = paste0(age_min, "-", age_max)) %>% 
       select(- c(age_min, age_max))
     return(out)
   }
}
 
merge_age_buckets(demo_data, 15)
#> # A tibble: 13 x 3
#>    name         value age_bucket
#>    <chr>        <dbl> <chr>     
#>  1 Rural Female  31   50-64     
#>  2 Rural Female  30.9 65-69     
#>  3 Rural Female  54.3 70-74     
#>  4 Rural Male    29.8 50-59     
#>  5 Rural Male    26.9 60-64     
#>  6 Rural Male    54   65-74     
#>  7 Urban Female  22   50-59     
#>  8 Urban Female  27.3 60-69     
#>  9 Urban Female  50   70-74     
#> 10 Urban Male    15.4 50-54     
#> 11 Urban Male    24.3 55-59     
#> 12 Urban Male    37   60-64     
#> 13 Urban Male    57.6 65-74

^{Created on 2020-06-23 by the reprex package (v0.3.0)}