R Aggregate Data in One Column Based on 2 Other Columns

Add a new column based on aggregation from other two columns

In base R, we can use ave to apply a function to every group while keeping the number of rows the same.

d$y <- with(d, x - ave(x, F))
#Explicitly mentioning the function name
#d$y <- with(d, x - ave(x, F, FUN = mean))
d

#   x E F          y
#1 22 C A  -6.666667
#2  2 C B  -3.333333
#3 14 C A -14.666667
#4  3 D B  -2.333333
#5 50 D A  21.333333
#6 11 D B   5.666667

The default FUN argument of ave is mean, so it does not need to be specified explicitly.

Aggregate by multiple columns, sum one column and keep other columns? Create new column based on aggregated values?

In data.table:

library(data.table)

setDT(df)[, .(Amount = sum(Amount, na.rm = TRUE),
              UniqueStores = uniqueN(Store, na.rm = TRUE)), 
          by = .(ProductID, Day, Product)
          ]

Output:

   ProductID       Day Product Amount UniqueStores
1:         1    Monday    Food     10            1
2:         1   Tuesday    Food     10            2
3:         2 Wednesday    Toys     15            2
4:         2    Friday    Toys      7            1

R aggregate data in one column based on 2 other columns

library(plyr)

#I am using cut function with 50 breaks for both v1 and v2 and ddply from plyr package for computing the mean

newdata<-ddply(df,.(cut(v1,50),cut(v2,50)),summarise,mean.v3=mean(v3))
    > head(newdata)
        cut(v1, 50)   cut(v2, 50) mean.v3
    1 (-49.4,-47.5] (-34.7,-32.7]  18.123
    2 (-49.4,-47.5] (-0.576,1.43]  20.887
    3 (-49.4,-47.5]   (15.5,17.5]  20.887
    4 (-47.5,-45.5] (-52.7,-50.7]   9.918
    5 (-47.5,-45.5] (-44.7,-42.7]  14.477
    6 (-47.5,-45.5] (-34.7,-32.7]  16.314

Updated as per the comments: if you want the lower, upper and mid-points of each interval, you can compute them with with() as detailed below (you need to use the sub function to deal with the ( and ] in the interval labels):

    df$newv1<-with(df,cut(v1,50)) 
    df$newv2<-with(df,cut(v2,50))
    df$lowerv1<-with(df,as.numeric( sub("\\((.+),.*", "\\1", newv1))) #lower value
    df$upperv1<-with(df,as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", newv1))) # upper value
    df$midv1<-with(df,(lowerv1+upperv1)/2) #mid value
    df$lowerv2<-with(df,as.numeric( sub("\\((.+),.*", "\\1",newv2))) #lower value
    df$upperv2<-with(df,as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", newv2))) # upper value
    df$midv2<-with(df,(lowerv2+upperv2)/2)#mid value
    newdata<-ddply(df,.(newv1,newv2),transform,mean.v3=mean(v3))

   > head(newdata)
       v1      v2     v3         newv1         newv2 lowerv1 upperv1  midv1 lowerv2 upperv2   midv2 mean.v3
1 -47.456 -32.714 18.123 (-49.4,-47.5] (-34.7,-32.7]   -49.4   -47.5 -48.45 -34.700  -32.70 -33.700  18.123
2 -49.329  -0.465 20.887 (-49.4,-47.5] (-0.576,1.43]   -49.4   -47.5 -48.45  -0.576    1.43   0.427  20.887
3 -48.652  16.558 20.800 (-49.4,-47.5]   (15.5,17.5]   -49.4   -47.5 -48.45  15.500   17.50  16.500  20.887
4 -48.323  17.153 20.974 (-49.4,-47.5]   (15.5,17.5]   -49.4   -47.5 -48.45  15.500   17.50  16.500  20.887
5 -45.713 -52.599  9.918 (-47.5,-45.5] (-52.7,-50.7]   -47.5   -45.5 -46.50 -52.700  -50.70 -51.700   9.918
6 -45.805 -43.071 14.477 (-47.5,-45.5] (-44.7,-42.7]   -47.5   -45.5 -46.50 -44.700  -42.70 -43.700  14.477

Aggregate multiple columns at once

We can use the formula method of aggregate. The variables on the 'rhs' of ~ are the grouping variables, while the . represents all other variables in 'df1' (from the example, we assume that we need the mean for all the columns except the grouping columns). We then specify the dataset and the function (mean).

aggregate(.~id1+id2, df1, mean)

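Or we can use summarise_each from dplyr after grouping (group_by). Note that summarise_each and funs are deprecated in current versions of dplyr; the across version shown further below is the modern equivalent.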

library(dplyr)
df1 %>%
    group_by(id1, id2) %>% 
    summarise_each(funs(mean))

Or using summarise with across (introduced in the dplyr devel version ‘0.8.99.9000’ and released in dplyr 1.0.0)

df1 %>% 
    group_by(id1, id2) %>%
    summarise(across(starts_with('val'), mean))

Or another option is data.table. We convert the 'data.frame' to a 'data.table' (setDT(df1)), group by 'id1' and 'id2', loop through the Subset of Data.table (.SD) with lapply, and get the mean of each column.

library(data.table)
setDT(df1)[, lapply(.SD, mean), by = .(id1, id2)]

data

df1 <- structure(list(id1 = c("a", "a", "a", "a", "b", "b", 
"b", "b"
), id2 = c("x", "x", "y", "y", "x", "y", "x", "y"), 
val1 = c(1L, 
2L, 3L, 4L, 1L, 4L, 3L, 2L), val2 = c(9L, 4L, 5L, 9L, 7L, 4L, 
9L, 8L)), .Names = c("id1", "id2", "val1", "val2"), 
class = "data.frame", row.names = c("1", 
"2", "3", "4", "5", "6", "7", "8"))

Aggregate rows into new column based on common value in another column in R

Before this, a small piece of advice: never give your columns purely numeric names, as it can cause many glitches (such names have to be quoted with backticks, as below).

library(tidyverse)

df1 %>% left_join(df2, by = 'NOC') %>%
  group_by(GROUP) %>%
  mutate(across(c(`2007`, `2008`), ~sum(.), .names = 's.{.col}' ))

# A tibble: 6 x 6
# Groups:   GROUP [3]
  NOC   `2007` `2008` GROUP s.2007 s.2008
  <chr>  <int>  <int> <chr>  <int>  <int>
1 A        100      5 aa       300     15
2 B        100      5 aa       300     15
3 C        100      5 aa       300     15
4 D         20      2 bb        30     14
5 E         10     12 bb        30     14
6 F          2      1 cc         2      1

aggregate multiple columns in a data frame at once calculating different statistics on different columns - R

We could use dplyr for flexibility

library(dplyr)
df1 %>%
     group_by(name) %>% 
     summarise(v1 = mean(v1, na.rm = TRUE),
    v2 = sd(v2, na.rm = TRUE), v3 = max(v3, na.rm = TRUE),
            v4 = sum(v4, na.rm = TRUE))

If there are multiple blocks of columns that need different functions, use across

df1 %>%
    group_by(name) %>%
    summarise(across(c(v1, v2), mean, na.rm = TRUE),
               v3 = sd(v3, na.rm = TRUE),
               across(c(v4, v5), sum, na.rm = TRUE))

Or use collap from collapse

library(collapse)
collap(df1, ~ name, custom = list(fmean = c("v1", "v2"),
      fsd = "v3", fsum = c("v4", "v5")))

Aggregate data in one column based on values/factors in four another columns

Use the dplyr package in R.

species.prop %>% group_by(species, area, month, year) %>% summarise(catch.p = sum(catch.p))

With the data given, the outcome looks like

Source: local data frame [6 x 5]
Groups: species, area, month [?]

  species   area month  year catch.p
   (fctr) (fctr) (dbl) (dbl)   (dbl)
1     hom     IV     4  1998   27.60
2     hom     VI     4  1998   17.50
3     hom     VI     4  2000   40.25
4     pil     VI     4  1998    8.05
5     pil    VII     1  2000   46.00
6     pil    VII     5  1998   17.50

But to show you how this works, I changed row 6 in the original data submitted to Area = VII and month = 5, and it looks like

Source: local data frame [5 x 5]
Groups: species, area, month [?]

  species   area month  year catch.p
   (fctr) (fctr) (dbl) (dbl)   (dbl)
1     hom     IV     4  1998   27.60
2     hom     VI     4  1998   17.50
3     hom     VI     4  2000   40.25
4     pil    VII     1  2000   46.00
5     pil    VII     5  1998   25.55

How can you aggregate a single column based on all the other columns?

If you want to do it without creating the Freq=1 variable, you can create that "on the fly":

> head(data)
  ONE TWO THREE
1   D   D     C
2   A   B     C
3   C   B     D
4   A   A     D
5   A   B     A

> aggregate(rep(1,nrow(data)) ~ ., data=data, sum)

   ONE TWO THREE rep(1, nrow(data))
1    B   A     A                  1
2    C   A     A                  3
3    D   A     A                  2
4    A   B     A                  1
5    D   B     A                  1

But it does give the count column an awkward name. Wrap the call in setNames to rename it:

> setNames(aggregate(rep(1,nrow(data)) ~ ., data=data, sum), c(names(data),"Freq"))
   ONE TWO THREE Freq
1    B   A     A    1
2    C   A     A    3
3    D   A     A    2
4    A   B     A    1
5    D   B     A    1