Add a new column based on aggregation from other two columns
In base R, we can use `ave` to apply a function to every group while keeping the number of rows the same.
d$y <- with(d, x - ave(x, F))
#Explicitly mentioning the function name
#d$y <- with(d, x - ave(x, F, FUN = mean))
d
# x E F y
#1 22 C A -6.666667
#2 2 C B -3.333333
#3 14 C A -14.666667
#4 3 D B -2.333333
#5 50 D A 21.333333
#6 11 D B 5.666667
`ave` uses `mean` as the default value of its `FUN` argument.
Aggregate by multiple columns, sum one column and keep other columns? Create new column based on aggregated values?
In `data.table`:
library(data.table)
setDT(df)[, .(Amount = sum(Amount, na.rm = TRUE),
UniqueStores = uniqueN(Store, na.rm = TRUE)),
by = .(ProductID, Day, Product)
]
Output:
ProductID Day Product Amount UniqueStores
1: 1 Monday Food 10 1
2: 1 Tuesday Food 10 2
3: 2 Wednesday Toys 15 2
4: 2 Friday Toys 7 1
R aggregate data in one column based on 2 other columns
library(plyr)
# I am using the cut function with 50 breaks for both v1 and v2, and ddply from the plyr package for computing the mean
newdata<-ddply(df,.(cut(v1,50),cut(v2,50)),summarise,mean.v3=mean(v3))
> head(newdata)
cut(v1, 50) cut(v2, 50) mean.v3
1 (-49.4,-47.5] (-34.7,-32.7] 18.123
2 (-49.4,-47.5] (-0.576,1.43] 20.887
3 (-49.4,-47.5] (15.5,17.5] 20.887
4 (-47.5,-45.5] (-52.7,-50.7] 9.918
5 (-47.5,-45.5] (-44.7,-42.7] 14.477
6 (-47.5,-45.5] (-34.7,-32.7] 16.314
Updated as per the comments: If you want the lower, upper and mid-points of each interval, you can compute them as follows (you need to use the `sub` function to deal with the `(` and `]` in the interval labels):
df$newv1<-with(df,cut(v1,50))
df$newv2<-with(df,cut(v2,50))
df$lowerv1<-with(df,as.numeric( sub("\\((.+),.*", "\\1", newv1))) #lower value
df$upperv1<-with(df,as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", newv1))) # upper value
df$midv1<-with(df,(lowerv1+upperv1)/2) #mid value
df$lowerv2<-with(df,as.numeric( sub("\\((.+),.*", "\\1",newv2))) #lower value
df$upperv2<-with(df,as.numeric( sub("[^,]*,([^]]*)\\]", "\\1", newv2))) # upper value
df$midv2<-with(df,(lowerv2+upperv2)/2)#mid value
newdata<-ddply(df,.(newv1,newv2),transform,mean.v3=mean(v3))
> head(newdata)
v1 v2 v3 newv1 newv2 lowerv1 upperv1 midv1 lowerv2 upperv2 midv2 mean.v3
1 -47.456 -32.714 18.123 (-49.4,-47.5] (-34.7,-32.7] -49.4 -47.5 -48.45 -34.700 -32.70 -33.700 18.123
2 -49.329 -0.465 20.887 (-49.4,-47.5] (-0.576,1.43] -49.4 -47.5 -48.45 -0.576 1.43 0.427 20.887
3 -48.652 16.558 20.800 (-49.4,-47.5] (15.5,17.5] -49.4 -47.5 -48.45 15.500 17.50 16.500 20.887
4 -48.323 17.153 20.974 (-49.4,-47.5] (15.5,17.5] -49.4 -47.5 -48.45 15.500 17.50 16.500 20.887
5 -45.713 -52.599 9.918 (-47.5,-45.5] (-52.7,-50.7] -47.5 -45.5 -46.50 -52.700 -50.70 -51.700 9.918
6 -45.805 -43.071 14.477 (-47.5,-45.5] (-44.7,-42.7] -47.5 -45.5 -46.50 -44.700 -42.70 -43.700 14.477
Aggregate multiple columns at once
We can use the formula method of `aggregate`. The variables on the 'rhs' of `~` are the grouping variables, while the `.` represents all other variables in 'df1' (from the example, we assume that we need the `mean` for all the columns except the grouping ones). We then specify the dataset and the function (`mean`).
aggregate(.~id1+id2, df1, mean)
Or we can use `summarise_each` from `dplyr` after grouping (`group_by`):
library(dplyr)
df1 %>%
group_by(id1, id2) %>%
summarise_each(funs(mean))
Or using `summarise` with `across` (`dplyr` devel version - ‘0.8.99.9000’):
df1 %>%
group_by(id1, id2) %>%
summarise(across(starts_with('val'), mean))
Another option is `data.table`. We convert the 'data.frame' to a 'data.table' (`setDT(df1)`) and then, grouped by 'id1' and 'id2', we loop through the subset of the data.table (`.SD`) and get the `mean`.
library(data.table)
setDT(df1)[, lapply(.SD, mean), by = .(id1, id2)]
data
df1 <- structure(list(id1 = c("a", "a", "a", "a", "b", "b",
"b", "b"
), id2 = c("x", "x", "y", "y", "x", "y", "x", "y"),
val1 = c(1L,
2L, 3L, 4L, 1L, 4L, 3L, 2L), val2 = c(9L, 4L, 5L, 9L, 7L, 4L,
9L, 8L)), .Names = c("id1", "id2", "val1", "val2"),
class = "data.frame", row.names = c("1",
"2", "3", "4", "5", "6", "7", "8"))
Aggregate rows into new column based on common value in another column in R
Before this, a small piece of advice: never give your columns purely numeric names, as this can cause many glitches (such names must always be quoted with backticks).
library(tidyverse)
df1 %>% left_join(df2, by = 'NOC') %>%
group_by(GROUP) %>%
mutate(across(c(`2007`, `2008`), ~sum(.), .names = 's.{.col}' ))
# A tibble: 6 x 6
# Groups: GROUP [3]
NOC `2007` `2008` GROUP s.2007 s.2008
<chr> <int> <int> <chr> <int> <int>
1 A 100 5 aa 300 15
2 B 100 5 aa 300 15
3 C 100 5 aa 300 15
4 D 20 2 bb 30 14
5 E 10 12 bb 30 14
6 F 2 1 cc 2 1
aggregate multiple columns in a data frame at once calculating different statistics on different columns - R
We could use `dplyr` for flexibility:
library(dplyr)
df1 %>%
group_by(name) %>%
summarise(v1 = mean(v1, na.rm = TRUE),
v2 = sd(v2, na.rm = TRUE), v3 = max(v3, na.rm = TRUE),
v4 = sum(v4, na.rm = TRUE))
If several columns should be grouped together under the same function, with different functions for different groups of columns, use `across`:
df1 %>%
group_by(name) %>%
summarise(across(c(v1, v2), mean, na.rm = TRUE),
v3 = sd(v3, na.rm = TRUE),
across(c(v4, v5), sum, na.rm = TRUE))
Or use `collap` from `collapse`:
library(collapse)
collap(df1, ~ name, custom = list(fmean = c("v1", "v2"),
fsd = "v3", fsum = c("v4", "v5")))
Aggregate data in one column based on values/factors in four another columns
Use the dplyr package in R.
species.prop %>% group_by(species, area, month, year) %>% summarise(catch.p = sum(catch.p))
With the data given, the outcome looks like
Source: local data frame [6 x 5]
Groups: species, area, month [?]
species area month year catch.p
(fctr) (fctr) (dbl) (dbl) (dbl)
1 hom IV 4 1998 27.60
2 hom VI 4 1998 17.50
3 hom VI 4 2000 40.25
4 pil VI 4 1998 8.05
5 pil VII 1 2000 46.00
6 pil VII 5 1998 17.50
But to show you how this works, I changed row 6 in the original data submitted to Area = VII and month = 5, and it looks like
Source: local data frame [5 x 5]
Groups: species, area, month [?]
species area month year catch.p
(fctr) (fctr) (dbl) (dbl) (dbl)
1 hom IV 4 1998 27.60
2 hom VI 4 1998 17.50
3 hom VI 4 2000 40.25
4 pil VII 1 2000 46.00
5 pil VII 5 1998 25.55
How can you aggregate a single column based on all the other columns?
If you want to do it without creating the Freq=1 variable, you can create that "on the fly":
> head(data)
ONE TWO THREE
1 D D C
2 A B C
3 C B D
4 A A D
5 A B A
> aggregate(rep(1,nrow(data)) ~ ., data=data, sum)
ONE TWO THREE rep(1, nrow(data))
1 B A A 1
2 C A A 3
3 D A A 2
4 A B A 1
5 D B A 1
But it does give an awkward name to the count column. Wrap the call in `setNames` to rename it:
> setNames(aggregate(rep(1,nrow(data)) ~ ., data=data, sum), c(names(data),"Freq"))
ONE TWO THREE Freq
1 B A A 1
2 C A A 3
3 D A A 2
4 A B A 1
5 D B A 1
Related Topics
R Column Check If Contains Value from Another Column
Gathering Wide Columns into Multiple Long Columns Using Pivot_Longer
Circular Heatmap That Looks Like a Donut
Comparison Between Dplyr::Do/Purrr::Map, What Advantages
How to Convert Utm Coordinates to Lat and Long in R
R - Common Title and Legend for Combined Plots
Aggregating Sub Totals and Grand Totals with Data.Table
Dplyr Join Warning: Joining Factors with Different Levels
R Aggregate Data in One Column Based on 2 Other Columns
How to Calculate the Average of a Variable Between Two Date Ranges Using a Loop or Apply Function
How and When Should I Use On.Exit
Piecewise Regression with R: Plotting the Segments
Download Attachment from an Outlook Email Using R
How to Add a Scale Bar (For Linear Distances) to Ggmap