For Each Group Summarise Means for All Variables in Dataframe (Ddply? Split)

for each group summarise means for all variables in dataframe (ddply? split?)

Given the format you want for the result, the reshape package will be more efficient than plyr.

# Simulated example data: five normally distributed variables whose means run
# from 0 to 4, a grouping letter (a-j) and a year (2007 or 2009).
# TRUE is spelled out because the alias T can be reassigned.
test_data <- data.frame(
  var0 = rnorm(100),
  var1 = rnorm(100, mean = 1),
  var2 = rnorm(100, mean = 2),
  var3 = rnorm(100, mean = 3),
  var4 = rnorm(100, mean = 4),
  group = sample(letters[1:10], 100, replace = TRUE),
  year = sample(c(2007, 2009), 100, replace = TRUE)
)

library(reshape)

# Long form: one row per (group, year, variable) value.
Molten <- melt(test_data, id.vars = c("group", "year"))

# Back to wide form with one column per year, averaging the values that fall
# in each group/variable cell. The aggregation argument is written out in full
# (fun.aggregate) instead of relying on partial matching of `fun`.
cast(Molten, group + variable ~ year, fun.aggregate = mean)

The result looks like this

   group variable         2007         2009
1 a var0 0.003767891 0.340989068
2 a var1 2.009026385 1.162786943
3 a var2 1.861061882 2.676524736
4 a var3 2.998011426 3.311250399
5 a var4 3.979255971 4.165715967
6 b var0 -0.112883844 -0.179762343
7 b var1 1.342447279 1.199554144
8 b var2 2.486088196 1.767431740
9 b var3 3.261451449 2.934903824
10 b var4 3.489147597 3.076779626
11 c var0 0.493591055 -0.113469315
12 c var1 0.157424796 -0.186590644
13 c var2 2.366594176 2.458204041
14 c var3 3.485808031 2.817153628
15 c var4 3.681576886 3.057915666
16 d var0 0.360188789 1.205875725
17 d var1 1.271541181 0.898973536
18 d var2 1.824468264 1.944708165
19 d var3 2.323315162 3.550719308
20 d var4 3.852223640 4.647498956
21 e var0 -0.556751465 0.273865769
22 e var1 1.173899189 0.719520372
23 e var2 1.935402724 2.046313047
24 e var3 3.318669590 2.871462470
25 e var4 4.374478734 4.522511874
26 f var0 -0.258956555 -0.007729091
27 f var1 1.424479454 1.175242755
28 f var2 1.797948551 2.411030282
29 f var3 3.083169793 3.324584667
30 f var4 4.160641429 3.546527820
31 g var0 0.189038036 -0.683028110
32 g var1 0.429915866 0.827761101
33 g var2 1.839982321 1.513104866
34 g var3 3.106414330 2.755975622
35 g var4 4.599340239 3.691478466
36 h var0 0.015557352 -0.707257185
37 h var1 0.933199148 1.037655156
38 h var2 1.927442457 2.521369108
39 h var3 3.246734239 3.703213646
40 h var4 4.242387776 4.407960355
41 i var0 0.885226638 -0.288221276
42 i var1 1.216012653 1.502514588
43 i var2 2.302815441 1.905731471
44 i var3 2.026631277 2.836508446
45 i var4 4.800676814 4.772964668
46 j var0 -0.435661855 0.192703997
47 j var1 0.836814185 0.394505861
48 j var2 1.663523873 2.377640369
49 j var3 3.489536343 3.457597835
50 j var4 4.146020948 4.281599816

Using cast() or ddply() to summarise the mean for two continuous variables in one dataframe

It is not a ddply() or a cast() solution, but using tidyverse and reshape2 you can do:

# Mean Independent_Value per Date and Independent_Variable, spread into one
# "<variable>_IV" column per variable, with rows ordered by month.name.
df %>%
  group_by(Date, Independent_Variable) %>%
  summarise(Independent_Value = mean(Independent_Value)) %>%
  mutate(Independent_Variable = paste0(Independent_Variable, "_IV")) %>%
  dcast(Date ~ Independent_Variable, value.var = "Independent_Value") %>%
  arrange(factor(Date, levels = month.name)) %>%
  left_join(
    # Same three steps for Sapflow, producing "<variable>_Sapflow" columns.
    df %>%
      group_by(Date, Independent_Variable) %>%
      summarise(Sapflow = mean(Sapflow)) %>%
      mutate(Independent_Variable = paste0(Independent_Variable, "_Sapflow")) %>%
      dcast(Date ~ Independent_Variable, value.var = "Sapflow") %>%
      arrange(factor(Date, levels = month.name)),
    by = "Date"
  )

Date Humidity_IV Radiation_IV Temperature_IV Humidity_Sapflow
1 June 17.60733 263.6733 70.56133 16.067000
2 July 21.80065 270.9065 61.33065 23.356774
3 August 18.38968 178.9806 71.73355 22.941613
4 September 14.82200 152.2333 72.21367 19.309333
5 October 11.34867 93.6000 81.74300 6.700667
Radiation_Sapflow Temperature_Sapflow
1 16.067000 16.067000
2 23.356774 23.356774
3 22.941613 22.941613
4 19.309333 19.309333
5 6.700667 6.700667

First, it groups by "Date" and "Independent_Variable" and summarises "Independent_Value". Second, it appends "_IV" to the values in Independent_Variable. Third, it reshapes the data and arranges the rows according to the calendar order of the months. Fourth, it repeats the first three steps for "Sapflow". Finally, it joins the two results.

Or by using just tidyverse:

df %>%
  group_by(Date, Independent_Variable) %>% # Grouping
  # Summarise every non-grouping variable; the list name adds the "_mean"
  # suffix to the new variables. A named list replaces funs(), which has been
  # deprecated since dplyr 0.8.0.
  summarise_all(list(mean = mean)) %>%
  arrange(factor(Date, levels = month.name)) # Order rows by calendar month

Date Independent_Variable Independent_Value_mean Sapflow_mean
<fct> <fct> <dbl> <dbl>
1 June Humidity 17.6 16.1
2 June Radiation 264. 16.1
3 June Temperature 70.6 16.1
4 July Humidity 21.8 23.4
5 July Radiation 271. 23.4
6 July Temperature 61.3 23.4

Creating multiple subsets all in one data.frame (possibly with ddply)

You could try:

# Within each level of x, keep only the rows whose rnorm.100. value exceeds
# that group's 0.8 quantile.
ddply(df, .(x), subset, rnorm.100. > quantile(rnorm.100., 0.8))

And off topic: you could use df <- data.frame(x,y=rnorm(100)) to name a column on-the-fly.

ddply to split and add rows to each group

I think this will do what you want:

# Expand one group's rows so they cover every non-zero integer step from the
# group's smallest `numbers` value to 12.
#
# df: data frame with the columns `noms` and `numbers`.
# Returns a data frame with the columns `noms`, `numbers` (the original values
# followed by NA padding) and `new_numbers` (the expanded sequence).
AddRows <- function(df) {
  full_range <- seq(from = min(df$numbers), to = 12)
  target <- full_range[full_range != 0] # zero is skipped
  n_target <- length(target)
  n_pad <- n_target - length(df$numbers)

  # Columns are named explicitly so the output keeps the original names.
  data.frame(
    noms = rep(unique(df$noms), times = n_target),
    numbers = c(df$numbers, rep(NA, times = n_pad)),
    new_numbers = target
  )
}

# Run AddRows on each `noms` group and combine the results into one data frame.
ddply(df, .(noms), AddRows)

How to use a for loop to use ddply on multiple columns?

The OP asked for a simple for-loop for this transformation. I understand that there are many other, more optimized ways to solve this, but to respect the OP's wish I have used a for-loop based solution. I have used dplyr, as plyr is old now.

library(dplyr)
Subject <- c(rep(1, times = 6), rep(2, times = 6))
GroupOfInterest <- c(letters[rep(1:3, times = 4)])
Feature1 <- sample(1:20, 12, replace = TRUE)
Feature2 <- sample(400:500, 12, replace = TRUE)
Feature3 <- sample(1:5, 12, replace = TRUE)
# small change in the way data.frame is created
df.main <- data.frame(Subject, GroupOfInterest, Feature1, Feature2,
                      Feature3, stringsAsFactors = FALSE)

# Names of the feature columns to summarise (columns 3 to 5)
Feat <- colnames(df.main)[3:5]

# Ready with Key columns on which grouping is done
resultdf <- unique(select(df.main, Subject, GroupOfInterest))
#> resultdf
# Subject GroupOfInterest
#1 1 a
#2 1 b
#3 1 c
#7 2 a
#8 2 b
#9 2 c

# For loop for each column
for (q in Feat) {
  # Mean of column q per Subject/GroupOfInterest. The column is looked up by
  # name with the .data pronoun and the result is named after it with `:=`,
  # replacing the deprecated string-evaluating summarise_().
  df_sum <- df.main %>%
    group_by(Subject, GroupOfInterest) %>%
    summarise(!!q := mean(.data[[q]]))
  # merge the new mean column into resultdf
  resultdf <- merge(resultdf, df_sum, by = c("Subject", "GroupOfInterest"))
}

# Final result
#> resultdf
# Subject GroupOfInterest Feature1 Feature2 Feature3
#1 1 a 6.5 473.0 3.5
#2 1 b 4.5 437.0 2.0
#3 1 c 12.0 415.5 3.5
#4 2 a 10.0 437.5 3.0
#5 2 b 3.0 447.0 4.5
#6 2 c 6.0 462.0 2.5

How to use ddply to get weighted-mean of class in dataframe?

You might find what you want in the ?summarise function. I can replicate your code with summarise as follows:

library(plyr)
set.seed(123)

# Simulated data: a class label and three standard-normal columns (x, x2 and
# the weights). sample() draws five labels here, which data.frame() recycles
# across the 20 rows.
frame <- data.frame(
  class = sample(LETTERS[1:5], replace = TRUE),
  x = rnorm(20),
  x2 = rnorm(20),
  weights = rnorm(20)
)

# Weighted mean of x2 within each class.
ddply(
  .data = frame,
  .variables = .(class),
  .fun = summarise,
  x2 = weighted.mean(x = x2, w = weights)
)

To do this for x as well, just add that line to be passed into the summarise function:

# Weighted means of both x and x2 within each class.
ddply(
  .data = frame,
  .variables = .(class),
  .fun = summarise,
  x = weighted.mean(x = x, w = weights),
  x2 = weighted.mean(x = x2, w = weights)
)

Edit: If you want to do an operation over many columns, use colwise or numcolwise instead of summarise, or do summarise on a melted data frame with the reshape2 package, then cast back to original form. Here's an example using colwise:

# Columns for which a weighted mean is wanted.
wmean.vars <- c("x", "x2")

# For each class, build a column-wise weighted.mean that uses that group's
# weights, then apply it to the selected columns.
ddply(frame, .(class), function(piece) {
  weighted_means <- colwise(weighted.mean, w = piece$weights)
  weighted_means(piece[wmean.vars])
})

Finally, if you don't like having to specify wmean.vars, you can also do:

# For each class, apply a weighted.mean (using that group's weights) to every
# numeric column except the weights column itself.
ddply(frame, .(class), function(piece) {
  keep <- !colnames(piece) %in% "weights"
  weighted_means <- numcolwise(weighted.mean, w = piece$weights)
  weighted_means(piece[keep])
})

which will compute a weighted-average for every numerical field, excluding the weights themselves.

ddply type functionality on multiple dataframes

In the case of the sample data, you could merge the two data sets like this (by specifying all.y = TRUE you can make sure that all rows of dfb are kept and, in this case, corresponding entries of dfa are repeated accordingly)

# Join dfa and dfb on id and month; all.y = TRUE keeps every row of dfb.
dfall <- merge(dfa, dfb, by = c("id", "month"), all.y=TRUE)

# id month sqft traf price frequency day
#1 1027 1 1 31 6 188 1
#2 1027 1 1 31 1 198 15
#3 1027 1 1 31 2 123 30
#4 1027 2 2 31 4 185 1
#5 1027 2 2 31 5 122 15
#6 1030 1 16 35 8 196 1
#7 1030 1 16 35 9 101 15
#8 1030 1 16 35 10 156 30
#9 1030 2 15 32 3 137 1
#10 1030 2 15 32 7 190 15

Then, you can use ddply as usual:

# Within each (id, month) group, add newcol = the group's mean price divided by sqft.
ddply(dfall, .(id, month), mutate, newcol = mean(price)/sqft)
# id month sqft traf price frequency day newcol
#1 1027 1 1 31 6 188 1 3.0000000
#2 1027 1 1 31 1 198 15 3.0000000
#3 1027 1 1 31 2 123 30 3.0000000
#4 1027 2 2 31 4 185 1 2.2500000
#5 1027 2 2 31 5 122 15 2.2500000
#6 1030 1 16 35 8 196 1 0.5625000
#7 1030 1 16 35 9 101 15 0.5625000
#8 1030 1 16 35 10 156 30 0.5625000
#9 1030 2 15 32 3 137 1 0.3333333
#10 1030 2 15 32 7 190 15 0.3333333

Edit: if you're looking for better performance, consider using dplyr instead of plyr. The equivalent dplyr code (including the merge) is:

library(dplyr)

# Keep every row of dfb, attach the matching dfa columns on id and month, then
# compute newcol within each (id, month) group.
dfall <- dfb %>%
  left_join(dfa, by = c("id", "month")) %>%
  group_by(id, month) %>%
  dplyr::mutate(newcol = mean(price) / sqft) %>% # dplyr:: avoids confusion with plyr::mutate
  ungroup() # drop the grouping so later verbs are not silently applied per group

Of course, you could also check out data.table which is also very efficient.

AFAIK ddply is not designed to be used with different data frames at the same time.

How does ddply handle factors as split variables?

Since you are reducing the number of rows, you need to use summarise:

> ddply(df1, .(Aa, Bb), summarise, mean_x =mean(x) )
Aa Bb mean_x
1 a b 0.3790675
2 a B 0.4242922
3 A b 0.5622329
4 A B 0.4574471

It's just as easy to use aggregate in this instance. Let's say you had two variables:

> aggregate(df1[-(1:2)], df1[1:2], mean)
Aa Bb x y
1 a b 0.4249121 0.4639192
2 A b 0.6127175 0.4639192
3 a B 0.4522292 0.4826715
4 A B 0.5201965 0.4826715


Related Topics



Leave a reply



Submit