for each group summarise means for all variables in dataframe (ddply? split?)
Given the format you want for the result, the reshape package will be more efficient than plyr.
# Example data: 100 rows with five normally distributed variables (means 0 to
# 4), a grouping letter drawn from a-j and a year drawn from 2007/2009.
test_data <- data.frame(
  var0 = rnorm(100),
  var1 = rnorm(100, 1),
  var2 = rnorm(100, 2),
  var3 = rnorm(100, 3),
  var4 = rnorm(100, 4),
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  group = sample(letters[1:10], 100, replace = TRUE),
  year = sample(c(2007, 2009), 100, replace = TRUE)
)
library(reshape)
# Long format: group and year stay as identifiers, every other column is
# stacked into variable/value pairs.
Molten <- melt(test_data, id.vars = c("group", "year"))
# One row per group/variable, one column per year, cells hold the mean value.
# fun.aggregate is spelled out instead of relying on partial matching of `fun`.
cast(Molten, group + variable ~ year, fun.aggregate = mean)
The result looks like this
group variable 2007 2009
1 a var0 0.003767891 0.340989068
2 a var1 2.009026385 1.162786943
3 a var2 1.861061882 2.676524736
4 a var3 2.998011426 3.311250399
5 a var4 3.979255971 4.165715967
6 b var0 -0.112883844 -0.179762343
7 b var1 1.342447279 1.199554144
8 b var2 2.486088196 1.767431740
9 b var3 3.261451449 2.934903824
10 b var4 3.489147597 3.076779626
11 c var0 0.493591055 -0.113469315
12 c var1 0.157424796 -0.186590644
13 c var2 2.366594176 2.458204041
14 c var3 3.485808031 2.817153628
15 c var4 3.681576886 3.057915666
16 d var0 0.360188789 1.205875725
17 d var1 1.271541181 0.898973536
18 d var2 1.824468264 1.944708165
19 d var3 2.323315162 3.550719308
20 d var4 3.852223640 4.647498956
21 e var0 -0.556751465 0.273865769
22 e var1 1.173899189 0.719520372
23 e var2 1.935402724 2.046313047
24 e var3 3.318669590 2.871462470
25 e var4 4.374478734 4.522511874
26 f var0 -0.258956555 -0.007729091
27 f var1 1.424479454 1.175242755
28 f var2 1.797948551 2.411030282
29 f var3 3.083169793 3.324584667
30 f var4 4.160641429 3.546527820
31 g var0 0.189038036 -0.683028110
32 g var1 0.429915866 0.827761101
33 g var2 1.839982321 1.513104866
34 g var3 3.106414330 2.755975622
35 g var4 4.599340239 3.691478466
36 h var0 0.015557352 -0.707257185
37 h var1 0.933199148 1.037655156
38 h var2 1.927442457 2.521369108
39 h var3 3.246734239 3.703213646
40 h var4 4.242387776 4.407960355
41 i var0 0.885226638 -0.288221276
42 i var1 1.216012653 1.502514588
43 i var2 2.302815441 1.905731471
44 i var3 2.026631277 2.836508446
45 i var4 4.800676814 4.772964668
46 j var0 -0.435661855 0.192703997
47 j var1 0.836814185 0.394505861
48 j var2 1.663523873 2.377640369
49 j var3 3.489536343 3.457597835
50 j var4 4.146020948 4.281599816
Using cast() or ddply() to summarise the mean for two continuous variables in one dataframe
This is not a ddply() or a cast() solution, but using tidyverse and reshape2 you can do:
# Part 1: mean Independent_Value per Date and Independent_Variable, with "_IV"
# appended to the variable names, reshaped to one column per variable and
# sorted by Date in month.name order.
df %>%
group_by(Date, Independent_Variable) %>%
summarise(Independent_Value = mean(Independent_Value)) %>%
mutate(Independent_Variable = paste(Independent_Variable, "IV", sep = "_")) %>%
dcast(Date~Independent_Variable, value.var = "Independent_Value") %>%
arrange(factor(Date, levels = month.name)) %>%
# Part 2: the same steps for Sapflow (suffix "_Sapflow"), joined onto part 1
# by Date.
left_join(df %>%
group_by(Date, Independent_Variable) %>%
summarise(Sapflow = mean(Sapflow)) %>%
mutate(Independent_Variable = paste(Independent_Variable, "Sapflow", sep = "_")) %>%
dcast(Date~Independent_Variable, value.var = "Sapflow") %>%
arrange(factor(Date, levels = month.name)),
by = c("Date" = "Date"))
Date Humidity_IV Radiation_IV Temperature_IV Humidity_Sapflow
1 June 17.60733 263.6733 70.56133 16.067000
2 July 21.80065 270.9065 61.33065 23.356774
3 August 18.38968 178.9806 71.73355 22.941613
4 September 14.82200 152.2333 72.21367 19.309333
5 October 11.34867 93.6000 81.74300 6.700667
Radiation_Sapflow Temperature_Sapflow
1 16.067000 16.067000
2 23.356774 23.356774
3 22.941613 22.941613
4 19.309333 19.309333
5 6.700667 6.700667
First, it groups by "Date" and "Independent_Variable" and summarises "Independent_Value". Second, it appends "_IV" to the values in Independent_Variable. Third, it reshapes the data to wide format and arranges the rows according to the calendar order of the months. Fourth, it repeats the first three steps for "Sapflow" (appending "_Sapflow" instead). Finally, it joins the two results by "Date".
Or by using just tidyverse:
# Mean of every non-grouping column per Date and Independent_Variable.
# across() replaces the deprecated summarise_all(funs(...)); the named list
# yields the same "<column>_mean" output names.
df %>%
  group_by(Date, Independent_Variable) %>%
  # "drop_last" keeps the result grouped by Date, as summarise_all() did.
  summarise(across(everything(), list(mean = mean)), .groups = "drop_last") %>%
  # Sort rows by the calendar order of the months rather than alphabetically.
  arrange(factor(Date, levels = month.name))
Date Independent_Variable Independent_Value_mean Sapflow_mean
<fct> <fct> <dbl> <dbl>
1 June Humidity 17.6 16.1
2 June Radiation 264. 16.1
3 June Temperature 70.6 16.1
4 July Humidity 21.8 23.4
5 July Radiation 271. 23.4
6 July Temperature 61.3 23.4
Creating multiple subsets all in one data.frame (possibly with ddply)
You could try:
# Within each x group, keep the rows whose rnorm.100. value is above that
# group's 80th percentile (subset() is applied to each piece separately).
ddply(df, .(x), subset, rnorm.100. > quantile(rnorm.100., 0.8))
Off topic: you could use df <- data.frame(x,y=rnorm(100)) to name a column on the fly.
ddply to split and add rows to each group
I think this will do what you want:
# Expand one group's rows so that it has one row for every value in the
# sequence from the group's smallest `numbers` value to 12, skipping 0.
#
# df: a data frame with columns `noms` and `numbers`.
# Returns a data frame with columns noms, numbers and new_numbers, where
# `numbers` holds the original values followed by NA padding.
AddRows <- function(df) {
  full_range <- seq(from = min(df$numbers), to = 12)
  new_numbers <- full_range[full_range != 0]
  n_out <- length(new_numbers)
  # Number of NA entries needed to bring `numbers` up to the output length.
  n_pad <- n_out - length(df$numbers)
  data.frame(
    noms = rep(unique(df$noms), n_out),
    numbers = c(df$numbers, rep(NA, n_pad)),
    new_numbers = new_numbers
  )
}
# Run AddRows on each noms group of df and combine the per-group results.
ddply(df, .(noms), AddRows)
How to use a for loop to use ddply on multiple columns?
The OP asked for a simple for-loop for this transformation of the data. I understand that there are many other, more optimized ways to solve this, but to respect the OP's wish I have used a for-loop-based solution. I have used dplyr because plyr is old now.
library(dplyr)
# Example data: 2 subjects x 6 rows, groups a/b/c cycling, and three randomly
# drawn feature columns.
Subject <- rep(c(1, 2), each = 6)
GroupOfInterest <- letters[rep(1:3, times = 4)]
# TRUE rather than T: T is an ordinary variable that can be reassigned.
Feature1 <- sample(1:20, 12, replace = TRUE)
Feature2 <- sample(400:500, 12, replace = TRUE)
Feature3 <- sample(1:5, 12, replace = TRUE)
# stringsAsFactors = FALSE keeps GroupOfInterest as character on any R version
df.main <- data.frame(
  Subject, GroupOfInterest, Feature1, Feature2, Feature3,
  stringsAsFactors = FALSE
)
# Names of the feature columns to summarise (columns 3 to 5).
Feat <- colnames(df.main)[3:5]
# Key columns on which the grouping is done: one row per unique combination.
resultdf <- unique(select(df.main, Subject, GroupOfInterest))
#> resultdf
# Subject GroupOfInterest
#1 1 a
#2 1 b
#3 1 c
#7 2 a
#8 2 b
#9 2 c
# For each feature column, compute the per-group mean and merge it into
# resultdf under the feature's own name.
for (q in Feat) {
  df_mean <- df.main %>%
    group_by(Subject, GroupOfInterest) %>%
    # .data[[q]] looks the column up by its string name and "{q}" := names the
    # output column; this replaces the deprecated summarise_(.dots = ...),
    # which evaluated pasted-together code strings.
    summarise("{q}" := mean(.data[[q]]), .groups = "drop")
  # Merge the new mean column into resultdf on the key columns.
  resultdf <- merge(resultdf, df_mean, by = c("Subject", "GroupOfInterest"))
}
# Final result
#> resultdf
# Subject GroupOfInterest Feature1 Feature2 Feature3
#1 1 a 6.5 473.0 3.5
#2 1 b 4.5 437.0 2.0
#3 1 c 12.0 415.5 3.5
#4 2 a 10.0 437.5 3.0
#5 2 b 3.0 447.0 4.5
#6 2 c 6.0 462.0 2.5
How to use ddply to get weighted-mean of class in dataframe?
You might find what you want in the ?summarise function. I can replicate your code with summarise as follows:
library(plyr)
# Reproducible example data: 20 rows with a class label, two value columns
# and a weights column.
set.seed(123)
frame <- data.frame(
  # size = 20 draws one label per row. Without it sample() returns only 5
  # labels, which data.frame() silently recycles into a repeating pattern.
  class = sample(LETTERS[1:5], size = 20, replace = TRUE),
  x = rnorm(20),
  x2 = rnorm(20),
  weights = rnorm(20)
)
# One row per class: the mean of x2 weighted by the weights column.
ddply(frame, .(class), summarise,
x2 = weighted.mean(x2, weights))
To do this for x as well, just add that line to the arguments passed into the summarise function:
# One row per class: the means of x and of x2, each weighted by the weights
# column.
ddply(frame, .(class), summarise,
x = weighted.mean(x, weights),
x2 = weighted.mean(x2, weights))
Edit: If you want to do an operation over many columns, use colwise or numcolwise instead of summarise, or do summarise on a melted data frame with the reshape2 package, then cast back to the original form. Here's an example using colwise:
# Columns for which a weighted mean is wanted.
wmean.vars <- c("x", "x2")
# For each class piece x, apply weighted.mean (using that piece's weights) to
# every column listed in wmean.vars.
ddply(frame, .(class), function(x)
colwise(weighted.mean, w = x$weights)(x[wmean.vars]))
Finally, if you don't like having to specify wmean.vars, you can also do:
# For each class piece x, apply weighted.mean (using that piece's weights) to
# every numeric column; the weights column is dropped first so it is not
# averaged itself.
ddply(frame, .(class), function(x)
numcolwise(weighted.mean, w = x$weights)(x[!colnames(x) %in% "weights"]))
which will compute a weighted-average for every numerical field, excluding the weights themselves.
ddply type functionality on multiple dataframes
In the case of the sample data, you could merge the two data sets like this (by specifying all.y = TRUE you make sure that all rows of dfb are kept and, in this case, the corresponding entries of dfa are repeated accordingly):
# Join dfa and dfb on id and month; all.y = TRUE keeps every row of dfb.
dfall <- merge(dfa, dfb, by = c("id", "month"), all.y=TRUE)
# id month sqft traf price frequency day
#1 1027 1 1 31 6 188 1
#2 1027 1 1 31 1 198 15
#3 1027 1 1 31 2 123 30
#4 1027 2 2 31 4 185 1
#5 1027 2 2 31 5 122 15
#6 1030 1 16 35 8 196 1
#7 1030 1 16 35 9 101 15
#8 1030 1 16 35 10 156 30
#9 1030 2 15 32 3 137 1
#10 1030 2 15 32 7 190 15
Then, you can use ddply as usual:
# Per (id, month) group, add newcol: the group's mean price divided by each
# row's sqft.
ddply(dfall, .(id, month), mutate, newcol = mean(price)/sqft)
# id month sqft traf price frequency day newcol
#1 1027 1 1 31 6 188 1 3.0000000
#2 1027 1 1 31 1 198 15 3.0000000
#3 1027 1 1 31 2 123 30 3.0000000
#4 1027 2 2 31 4 185 1 2.2500000
#5 1027 2 2 31 5 122 15 2.2500000
#6 1030 1 16 35 8 196 1 0.5625000
#7 1030 1 16 35 9 101 15 0.5625000
#8 1030 1 16 35 10 156 30 0.5625000
#9 1030 2 15 32 3 137 1 0.3333333
#10 1030 2 15 32 7 190 15 0.3333333
Edit: if you're looking for better performance, consider using dplyr instead of plyr. The equivalent dplyr code (including the merge) is:
library(dplyr)
# Keep every row of dfb and attach the matching dfa columns, then add newcol:
# the mean price of each (id, month) group divided by each row's sqft.
dfall <- dfb %>%
  left_join(dfa, by = c("id", "month")) %>%
  group_by(id, month) %>%
  # dplyr:: avoids confusion with plyr::mutate when plyr is also attached.
  dplyr::mutate(newcol = mean(price) / sqft) %>%
  # Drop the grouping so the result is ungrouped, like the ddply() output.
  ungroup()
Of course, you could also check out data.table, which is also very efficient.
AFAIK, ddply is not designed to be used with different data frames at the same time.
How does ddply handle factors as split variables?
Since you are reducing the number of rows, you need to use summarise:
> ddply(df1, .(Aa, Bb), summarise, mean_x =mean(x) )
Aa Bb mean_x
1 a b 0.3790675
2 a B 0.4242922
3 A b 0.5622329
4 A B 0.4574471
It's just as easy to use aggregate in this instance. Let's say you had two variables:
> aggregate(df1[-(1:2)], df1[1:2], mean)
Aa Bb x y
1 a b 0.4249121 0.4639192
2 A b 0.6127175 0.4639192
3 a B 0.4522292 0.4826715
4 A B 0.5201965 0.4826715
Related Topics
How to Install Multiple Packages
How to Remove "Rows" with a Na Value
Clustering List for Hclust Function
More Efficient Means of Creating a Corpus and Dtm with 4M Rows
Select Unique Values with 'Select' Function in 'Dplyr' Library
Extract Random Effect Variances from Lme4 Mer Model Object
R: Legend with Points and Lines Being Different Colors (For the Same Legend Item)
Remove Strip Background Keep Panel Border
How to Create Textarea as Input in a Shiny Webapp in R
Comparison Between Dplyr::Do/Purrr::Map, What Advantages
R Plot Color Combinations That Are Colorblind Accessible
R Sequence of Dates with Lubridate
Difference Between As.Data.Frame(X) and Data.Frame(X)
Weird As.Posixct Behavior Depending on Daylight Savings Time