Compute Mean and Standard Deviation by Group For Multiple Variables in a Data.Frame

How to calculate means and standard deviations for multiple grouped variables?

If there are only three sets of columns, you can use across() with rowMeans() or rowSds():

library(dplyr)
library(matrixStats)
# Row-wise mean and standard deviation for each family of measurement
# columns, selected by name prefix. as.matrix() turns the selected columns
# into the matrix that rowSds() is called on.
# na.rm = TRUE is passed to rowSds() as well as rowMeans(), so a row with a
# missing reading gets both statistics from the same non-missing values
# instead of a mean next to an NA standard deviation.
df %>%
  mutate(
    AirTempMean = rowMeans(across(starts_with("AirTemp")), na.rm = TRUE),
    AirTempSD = rowSds(
      as.matrix(across(starts_with("AirTemp"))),
      na.rm = TRUE
    ),
    AirHumidityMean = rowMeans(
      across(starts_with("AirHumidity")),
      na.rm = TRUE
    ),
    AirHumiditySD = rowSds(
      as.matrix(across(starts_with("AirHumidity"))),
      na.rm = TRUE
    ),
    PrecipitationMean = rowMeans(
      across(starts_with("Precipitation")),
      na.rm = TRUE
    ),
    PrecipitationSD = rowSds(
      as.matrix(across(starts_with("Precipitation"))),
      na.rm = TRUE
    )
  )

If there are many sets of columns, an option is to reshape to 'long' format, compute the mean/sd by row, and then bind the result to the original dataset:

library(tidyr)
# Version for any number of column families named <Variable>P<k>:
#   1. drop the identifier column and tag every row with its position (rn);
#   2. pivot longer so the text before the P<k> suffix becomes the column
#      name (".value") and the suffix itself goes to "pval";
#   3. for each original row, take the mean and sd of every numeric column;
#   4. drop the row tag and attach the statistics to the original data.
df %>%
  select(-SiteID) %>%
  mutate(rn = row_number()) %>%
  pivot_longer(
    cols = -rn,
    names_to = c(".value", "pval"),
    names_pattern = "(.*)(P\\d+$)"
  ) %>%
  group_by(rn) %>%
  summarise(
    across(
      where(is.numeric),
      list(
        Mean = function(x) mean(x, na.rm = TRUE),
        SD = function(x) sd(x, na.rm = TRUE)
      )
    )
  ) %>%
  select(-rn) %>%
  bind_cols(df, .)

-output

  SiteID AirTempP1 AirTempP2 AirTempP3 AirHumidityP1 AirHumidityP2 AirHumidityP3 PrecipitationP1 PrecipitationP2
1 KIIXB6808G 21.73691 24.96523 10.93523 12.84111 92.93506 82.79740 64.92664 70.736212
2 KIIXB6808G 29.42160 25.14421 16.28761 45.63732 56.82373 78.49595 92.42202 58.547199
3 KIIXB6808G 14.43153 17.56969 13.03869 33.29144 90.66400 43.75959 32.32953 96.171349
4 KIIXB6808G 24.35017 17.85779 11.20442 93.95450 18.58702 39.93221 26.69628 81.723180
5 KIIXB6808G 14.80084 29.38776 29.19315 94.70336 95.89065 25.89645 26.95639 28.048125
6 KIIXB6808G 27.88281 14.29717 10.24926 54.96979 83.53267 78.09418 76.69248 72.712109
7 KIIXB6808G 12.77562 22.11161 28.05708 33.02382 54.44677 20.95251 72.94213 93.959692
8 KIIXB6808G 14.85165 20.22299 10.78721 66.59833 31.77392 26.85253 95.13469 54.235009
9 KIIXB6808G 14.42898 27.83384 17.09562 53.95661 52.25697 71.31224 85.97124 23.399866
10 KIIXB6808G 12.87398 18.36380 20.59257 12.67498 53.06563 17.63772 50.60992 6.751882
PrecipitationP3 AirTemp_Mean AirTemp_SD AirHumidity_Mean AirHumidity_SD Precipitation_Mean Precipitation_SD
1 88.71080 19.21246 7.347780 62.85786 43.61134 74.79122 12.39975
2 81.26882 23.61781 6.698725 60.31900 16.70584 77.41268 17.26350
3 83.31252 15.01330 2.320849 55.90501 30.55382 70.60447 33.76486
4 31.51854 17.80413 6.573037 50.82457 38.84645 46.64600 30.47327
5 60.96926 24.46059 8.366151 72.16349 40.07283 38.65792 19.32989
6 42.94284 17.47641 9.236680 72.19888 15.16659 64.11581 18.44402
7 19.50466 20.98143 7.703164 36.14103 16.96332 62.13549 38.38587
8 34.00365 15.28728 4.732951 41.74159 21.66675 61.12445 31.14241
9 52.38518 19.78615 7.095897 59.17527 10.54522 53.91876 31.31386
10 91.18074 17.27678 3.972451 27.79278 22.02714 49.51418 42.22509

mean and standard deviation by group for multiple variables

The function you will likely want to apply to your dataframe is aggregate() with either mean or sd as the function parameter.

Getting mean and standard deviation from groups in a data.frame

Assuming your data is in a data.frame called DF:

# Apply mean() to HR separately within each level of Group. The "DF$Group:"
# label in the printed result below matches the second argument as it is
# written in this call.
by(DF$HR,DF$Group,mean)

# DF$Group: 1AI
# [1] 276
# -------------------------------------------------------------------------------------------------------------------------------------------------------------
# DF$Group: 1AS
# [1] 246.7692
# -------------------------------------------------------------------------------------------------------------------------------------------------------------
# DF$Group: 1CI
# [1] 217.625
# -------------------------------------------------------------------------------------------------------------------------------------------------------------
# DF$Group: 1CS
# [1] 227.25

# Apply sd() to HR separately within each level of Group. The "DF$Group:"
# label in the printed result below matches the second argument as it is
# written in this call.
by(DF$HR,DF$Group,sd)

# DF$Group: 1AI
# [1] 30.93946
# -------------------------------------------------------------------------------------------------------------------------------------------------------------
# DF$Group: 1AS
# [1] 36.48551
# -------------------------------------------------------------------------------------------------------------------------------------------------------------
# DF$Group: 1CI
# [1] 23.25595
# -------------------------------------------------------------------------------------------------------------------------------------------------------------
# DF$Group: 1CS
# [1] 25.77236

Mean and standard deviation with multiple dataframes

Use concat to combine the DataFrames, remove D with DataFrame.query, and aggregate with GroupBy.agg using named aggregations:

# Stack the three frames, filter out ID "D" up front, then compute the mean
# and standard deviation of Amount for each remaining ID. The keyword names
# given to agg() become the output column names.
df = (
    pd.concat([df1, df2, df3])
      .query('ID != "D"')
      .groupby('ID')
      .agg(
          avg=('Amount', 'mean'),
          std=('Amount', 'std'),
      )
)
print(df)
avg std
ID
A 5 3.605551
B 1 1.000000
C 2 1.000000

Or remove D in last step by DataFrame.drop:

# Same aggregation, but every ID is summarised first and the row for "D" is
# dropped from the result afterwards.
df = (
    pd.concat([df1, df2, df3])
      .groupby('ID')
      .agg(
          avg=('Amount', 'mean'),
          std=('Amount', 'std'),
      )
      .drop('D')
)

Calculate standard deviation for groups of values using Python

You can use groupby(['name']) on the full data frame first, and only apply the agg on the columns of interest:

# Sample data: several observations per name.
data = pd.DataFrame(
    {
        'name': ['AAA', 'AAA', 'BBB', 'BBB', 'CCC', 'CCC', 'CCC'],
        'number': [10, 20, 1, 2, 5, 10, 10.5],
        'difference': [0, 10, 0, 1, 0, 5, 0.5],
    }
)
# Group the whole frame by name, then aggregate only the 'difference' column.
data.groupby(['name'])['difference'].agg(['mean', 'std'])

Mean and standard deviation NOT by group for multiple variables

Assuming that you want the mean and sd for columns v1:v4.
Using base R

 # Return c(mean, sd) of a numeric vector.
 #
 # x:   numeric vector.
 # ...: further arguments for mean() (e.g. na.rm = TRUE, trim = 0.1).
 #
 # All of `...` goes to mean(). sd() accepts only `na.rm`, so just that
 # setting is picked out of `...` and passed on. Forwarding it matters:
 # otherwise a call with na.rm = TRUE returns a mean computed without the
 # NAs next to an NA standard deviation.
 f1 <- function(x, ...) {
   extra_args <- list(...)
   drop_na <- isTRUE(extra_args[["na.rm"]])
   c(mean(x, ...), sd(x, na.rm = drop_na))
 }
# Column-wise summary with apply(): the first column is dropped, f1 runs on
# each remaining column, and na.rm = TRUE is handed on to f1.
apply(df[, -1], MARGIN = 2, FUN = f1, na.rm = TRUE)
# v1 v2 v3 v4
#[1,] 16.500000 23.00000 25.50000 12.000000
#[2,] 9.192388 15.55635 19.09188 2.828427

# Same table by iterating over the columns with sapply().
sapply(df[, -1], FUN = f1)
# v1 v2 v3 v4
#[1,] 16.500000 23.00000 25.50000 12.000000
#[2,] 9.192388 15.55635 19.09188 2.828427

# One-row result: the formula `. ~ 1` has no grouping variable on its
# right-hand side, so f1 is applied to each column over all rows.
aggregate(. ~ 1, data = df[, -1], FUN = f1, na.rm = TRUE, na.action = NULL)
# v1.1 v1.2 v2.1 v2.2 v3.1 v3.2 v4.1 v4.2
#1 16.500000 9.192388 23.00000 15.55635 25.50000 19.09188 12.000000 2.828427

Or

 library(dplyr)
# Mean and sd of every column whose name starts with "v", as a single row.
# summarise_each() and funs() are deprecated in dplyr; across() with a named
# list of functions replaces them and names each result <column>_<function>.
# na.rm = TRUE is applied to sd as well as mean so both statistics are
# computed from the same observations.
summarise(
  df,
  across(
    starts_with("v"),
    list(
      mean = ~ mean(.x, na.rm = TRUE),
      sd = ~ sd(.x, na.rm = TRUE)
    )
  )
)
# v1_mean v1_sd v2_mean v2_sd v3_mean v3_sd v4_mean v4_sd
#1 16.5 9.192388 23 15.55635 25.5 19.09188 12 2.828427

Or using data.table

library(data.table)
# setDT() converts df to a data.table in place; .SDcols limits .SD to v1..v4.
# Each column yields c(mean, sd), so the result has two rows (mean, then sd).
# na.rm = TRUE is given to sd() as well as mean() so a column containing NA
# does not produce a mean paired with an NA standard deviation.
setDT(df)[,
  lapply(.SD, function(x) c(mean(x, na.rm = TRUE), sd(x, na.rm = TRUE))),
  .SDcols = paste0('v', 1:4)
]
# v1 v2 v3 v4
#1: 16.500000 23.00000 25.50000 12.000000
#2: 9.192388 15.55635 19.09188 2.828427

data

# Example data: an id column plus four integer measurement columns (v1-v4),
# two rows, automatic row names.
df <- data.frame(
  id = 1:2,
  v1 = c(23L, 10L),
  v2 = c(34L, 12L),
  v3 = c(12L, 39L),
  v4 = c(10L, 14L)
)


Related Topics



Leave a reply



Submit