Apply grouped model back onto data
Here is a dplyr
method of obtaining a similar answer, following the approach used by @Mike.Gahan :
library(dplyr)
iris.models <- iris %>%
group_by(Species) %>%
do(mod = lm(Sepal.Length ~ Sepal.Width, data = .))
iris %>%
tbl_df %>%
left_join(iris.models) %>%
rowwise %>%
mutate(Sepal.Length_pred = predict(mod,
newdata = list("Sepal.Width" = Sepal.Width)))
alternatively you can do it in one step if you create a predicting function:
m <- function(df) {
mod <- lm(Sepal.Length ~ Sepal.Width, data = df)
pred <- predict(mod,newdata = df["Sepal.Width"])
data.frame(df,pred)
}
iris %>%
group_by(Species) %>%
do(m(.))
Apply linear model formula to grouped data
Try lmList
. Note that the nlme package already comes with R.
library(nlme)
coef(lmList(Outcome ~ A | Participant, mydata))
giving:
(Intercept) A
1 8.122188 -0.079910741
2 2.111455 0.001547988
3 1.722062 0.304546146
4 -2.127148 0.164948454
5 -1.883623 0.076522166
6 2.463768 0.103024575
7 7.133361 -0.043622767
8 0.000000 0.000000000
9 1.370920 0.006923838
10 8.286374 0.081986143
11 -5.359477 0.283224401
12 -4.486884 0.143756558
13 -1.333333 0.034188034
14 0.000000 NA
Grouped application of function that return a data.frame (without a for loop)
One option is to use dplyr::group_split()
and purrr::map_dfr()
.
How this works: group_split()
will divide your data.frame df
into a list of data.frames based on the grouping variables you supply (e.g., g
). Next, map_dfr()
can be used to apply a function to each element of that list. Because your custom function ff()
returns a data.frame without your grouping variable g
, you'll want to add that information back to ff()
output - this can be accomplished with mutate()
as in the example below:
library(dplyr)
library(purrr)
# set seed so that example is reproducible
set.seed(1)
# your example data and function
df <- data.frame(start=1:10,end=21:30,g=sample(LETTERS[1:2],10,replace=TRUE))
ff <- function(start,end,... ) {
out <- data.frame(T1=c(start,rev(start)),T2=c(end,rev(end)))
return(out)
}
# use group_split & map_dfr
df %>%
# divide df into a list of data.frames based on supplied grouping variables
group_split(g) %>%
# for each element in the list, apply this function
map_dfr(function(df.x) {
with(df.x,
# get the data.frame your function returns
ff(start, end) %>%
# add your grouping variables back-in (stripped by ff)
mutate(g = g[1]))
})
# a short-hand version of the above can be written as:
df %>%
group_split(g) %>%
map_dfr(~ff(.x$start, .x$end) %>% mutate(g = .x$g[1]))
Using group by and tidy to run several models and extract results to dataframe
Here's a solution that first creates the formulas for each model you want to run and then calls the right variables from the dataset you want to analyse, instead of reshaping the dataset itself and apply the models:
library(tidyverse)
library(broom)
outcomes <- c("wt", "mpg", "hp", "disp")
exposures <- c("gear", "vs", "am")
covariates <- c("drat", "qsec")
expand.grid(outcomes, exposures, covariates) %>%
group_by(Var1, Var2) %>%
summarise(Var3 = paste0(Var3, collapse = "+")) %>%
rowwise() %>%
summarise(frm = paste0(Var1, "~factor(", Var2, ")+", Var3)) %>%
group_by(model_id = row_number(),
frm) %>%
do(tidy(lm(.$frm, data = mtcars))) %>%
ungroup()
# # A tibble: 52 x 7
# model_id frm term estimate std.error statistic p.value
# <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 1 wt~factor(gear)+drat+qsec (Intercept) 9.25 2.17 4.27 0.000218
# 2 1 wt~factor(gear)+drat+qsec factor(gear)4 -0.187 0.493 -0.378 0.708
# 3 1 wt~factor(gear)+drat+qsec factor(gear)5 -0.703 0.518 -1.36 0.186
# 4 1 wt~factor(gear)+drat+qsec drat -1.03 0.425 -2.42 0.0227
# 5 1 wt~factor(gear)+drat+qsec qsec -0.121 0.0912 -1.32 0.196
# 6 2 wt~factor(vs)+drat+qsec (Intercept) 4.35 2.28 1.91 0.0663
# 7 2 wt~factor(vs)+drat+qsec factor(vs)1 -1.04 0.416 -2.49 0.0189
# 8 2 wt~factor(vs)+drat+qsec drat -0.918 0.263 -3.49 0.00160
# 9 2 wt~factor(vs)+drat+qsec qsec 0.147 0.106 1.39 0.175
# 10 3 wt~factor(am)+drat+qsec (Intercept) 8.29 1.31 6.33 0.000000766
# # ... with 42 more rows
Very similar process in case you prefer to use map
from purrr
package instead of do
:
expand.grid(outcomes, exposures, covariates) %>%
group_by(Var1, Var2) %>%
summarise(Var3 = paste0(Var3, collapse = "+")) %>%
rowwise() %>%
summarise(frm = paste0(Var1, "~factor(", Var2, ")+", Var3)) %>%
group_by(model_id = row_number()) %>%
mutate(model = map(frm, ~tidy(lm(., data = mtcars)))) %>%
unnest() %>%
ungroup()
Remember that the key to this approach is creating the formulas.
So, the code will become simpler if you manage to specify your variables in a slightly different way and help creating the formulas with less code than before:
outcomes <- c("wt", "mpg", "hp", "disp")
exposures <- c("gear", "vs", "am")
covariate1 <- "drat"
covariate2 <- "qsec"
expand.grid(outcomes, exposures, covariate1, covariate2) %>%
transmute(frm = paste0(Var1, "~factor(", Var2, ")+", Var3, "+", Var4)) %>%
group_by(model_id = row_number()) %>%
mutate(model = map(frm, ~tidy(lm(., data = mtcars)))) %>%
unnest() %>%
ungroup()
Running linear models for groups within dataframe and storing outputs in dataframe in R
Use glance
instead of tidy
:
dt_lm <- dt %>%
group_by(group) %>%
do(glance(lm(y~x, data=.))) %>%
select(AIC)
which gives:
Adding missing grouping variables: `group`
# A tibble: 2 x 2
# Groups: group [2]
group AIC
<chr> <dbl>
1 a 119.
2 b 114.
If you not only want to store the AIC but other metrics just skip the select
part.
How can I apply grouped data to grouped models using broom and dplyr?
I've used map2
from package purrr for this sort of situation. map2
loops through the elements of two lists simultaneously. The lists must be the same length and be in the same order.
The elements of the lists are used as arguments for some function you want to apply (augment
, in your case). Here your two lists would be a list of models and a list of datasets (one list for each cyl
/am
combination).
Using map2_df
returns the results as a data.frame instead of a list.
library(purrr)
I made the list of data.frames to predict with using split
. The order of the factors to split on determined the list order, so I made sure it was in the same order as lm1
.
test_split = split(newdata, list(newdata$am, newdata$cyl)
map2_df(lm1$fit, test_split, ~augment(.x, newdata = .y))
To avoid worrying about order so much, you could nest
the prediction data by groups, join this to lm1
, and return the results of augment
as a list for unnesting.
newdata %>%
group_by(cyl, am) %>%
nest() %>%
inner_join(lm1, .) %>%
mutate(pred = list(augment(fit, newdata = data))) %>%
unnest(pred)
dplyr version of grouping a dataframe then creating regression model on each group
Returning a list from dplyr
is not possible yet. If you just need the intercept and slope @jazzurro 's answer is the way, but if you need the whole model you need to do something like
library(dplyr)
models <- df %>% group_by(country) %>% do(mod = lm(BirthRate ~ US., data = .))
Then if you want to perform ANOVA on each fitted model, you can do it using rowwise
models %>% rowwise %>% do(anova(.$mod))
but again the result is coerced to a data frame and is not quite the same as doing lapply(models$mod, anova)
.
For now (ie until the next version of dplyr
) if you need to store the whole result in a list, you can just use dlply
from plyr
, like plyr::dlply(df, "country", function(d) anova(lm(BirthRate ~ US., data = d)))
, or of course if you do not absolutely have to use dplyr
you can go for @SvenHohenstein 's answer which looks like a better way of doing this anyway.
Related Topics
Replace Na with 0 in a Data Frame Column
How to Attach a Simple Data.Frame to a Spatialpolygondataframe in R
How to Adjust Facet Size Manually
R - Store a Matrix into a Single Dataframe Cell
R-Shiny Using Reactive Renderui Value
Plot Mean and Sd of Dataset Per X Value Using Ggplot2
R: String Fuzzy Matching Using Jarowinkler
Adding S4 Dispatch to Base R S3 Generic
Dynamic Arguments to Expand.Grid
Shiny: How to Adjust the Width of the Tabsetpanel
Create Category Based on Range in R
How to Rotate an Image R Raster
Convert Column in Data.Frame to Date
Conditional Rolling Mean (Moving Average) on Irregular Time Series