Using R to Do a Regression with Multiple Dependent and Multiple Independent Variables

How to write a function that will run multiple regression models of the same type with different dependent variables and then store them as lists?

Consider reformulate to dynamically change model formulas using character values for lm calls:

# VECTOR OF COLUMN NAMES (NOT VALUES)
dep.vars <- c("dep.var1", "dep.var2")

# USER-DEFINED METHOD TO PROCESS DIFFERENT DEP VAR
run_model <- function(dep.var) {
fml <- reformulate(c("x1", "x2"), dep.var)
lm(fml, data=data)
}

# NAMED LIST OF MODELS
all_models <- sapply(dep.vars, run_model, simplify = FALSE)

# OUTPUT RESULTS
all_models$dep.var1
all_models$dep.var2
...

From there, you can run further extractions or processes across model objects:

# NAMED LIST OF MODEL SUMMARIES
all_summaries <- lapply(all_models, summary)

all_summaries$dep.var1
all_summaries$dep.var2
...

# NAMED LIST OF MODEL COEFFICIENTS
all_coefficients <- lapply(all_models, `[`, "coefficients")

all_coefficients$dep.var1
all_coefficients$dep.var2
...

Write a function to run multiple regression models with changing independent variables and changing dependent variables in R

You could achieve your desired result like so:

  1. The issue with your code is that y ~ ... will not work. Instead you could use reformulate (or as.formula) to dynamically create the formula for your regression model.
  2. To make this work loop directly over the character vector x or more more precisely setNames(x, x) instead of looping over dataset %>% select(all_of(x)).
library(dplyr)
library(purrr)
library(broom)

function_name <- function(x, y, dataset) {
map(setNames(x, x), ~ glm(reformulate(
termlabels = c(.x, "cyl", "disp", "splines::ns(wt, 2)", "hp"),
response = y
),
family = gaussian(link = "identity"),
data = dataset
)) %>%
map_dfr(tidy, conf.int = T, .id = "source") %>%
select(source, source, term, estimate, std.error, conf.low, conf.high, p.value)
}

var <- c("vs", "am")

function_name(x = var, y = "mpg", mtcars)
#> # A tibble: 14 × 7
#> source term estimate std.error conf.low conf.high p.value
#> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 vs (Intercept) 32.7 3.49 25.8 39.5 1.24e- 9
#> 2 vs vs 1.03 1.52 -1.95 4.01 5.05e- 1
#> 3 vs cyl -0.187 0.821 -1.80 1.42 8.21e- 1
#> 4 vs disp 0.000545 0.0119 -0.0228 0.0239 9.64e- 1
#> 5 vs splines::ns(wt, 2)1 -22.4 4.82 -31.9 -13.0 9.02e- 5
#> 6 vs splines::ns(wt, 2)2 -9.48 3.16 -15.7 -3.28 6.09e- 3
#> 7 vs hp -0.0202 0.0115 -0.0427 0.00226 9.02e- 2
#> 8 am (Intercept) 34.6 2.65 29.4 39.8 1.15e-12
#> 9 am am 0.0113 1.57 -3.06 3.08 9.94e- 1
#> 10 am cyl -0.470 0.714 -1.87 0.931 5.17e- 1
#> 11 am disp 0.000796 0.0125 -0.0236 0.0252 9.50e- 1
#> 12 am splines::ns(wt, 2)1 -21.5 5.86 -33.0 -10.0 1.14e- 3
#> 13 am splines::ns(wt, 2)2 -9.21 3.34 -15.8 -2.66 1.07e- 2
#> 14 am hp -0.0214 0.0136 -0.0480 0.00527 1.28e- 1

Fit regression model to multiple independent and dependent variables and obtain separate fits by grouping variable

Here I'm using 2 dependent variables (drat,mpg), 3 independent variables (disp,hp,wt) and 1 grouping variable with 2 levels/classes (am as 1/0).

library(dplyr)
library(tidyr)

# example dataset (picking useful columns)
dt <- data.frame(mtcars) %>% select(drat, mpg, disp, hp, wt, am)

# specify which columns we want as y (dependent) and x (independent)
# grouping variable is specified within the dependent variables
ynames <- c("drat","mpg","am")
xnames <- c("disp","hp","wt")

# create and reshape datasets
dt1 <- dt[,ynames]
dt1 <- gather(dt1, y, yvalue, -am)

dt2 <- dt[,xnames]
dt2 <- gather(dt2, x, xvalue)

dt1 %>%
group_by(y) %>%
do(data.frame(.,dt2)) %>%
group_by(y,x,am) %>%
do({ m1 <- nls( yvalue ~ B0*(xvalue^B1)*exp(B2*xvalue), data=., start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
RSS <- sum(residuals(m1)^2)
TSS <- sum((.$yvalue - mean(.$yvalue))^2)
data.frame(RSS,TSS) })

# y x am RSS TSS
# 1 drat disp 0 1.3090406 2.770242
# 2 drat disp 1 1.1155372 1.590400
# 3 drat hp 0 2.1707337 2.770242
# 4 drat hp 1 0.8342527 1.590400
# 5 drat wt 0 2.2100162 2.770242
# 6 drat wt 1 1.1885811 1.590400
# 7 mpg disp 0 98.4815286 264.587368
# 8 mpg disp 1 46.8674036 456.309231
# 9 mpg hp 0 74.9295161 264.587368
# 10 mpg hp 1 112.5548955 456.309231
# 11 mpg wt 0 104.2894519 264.587368
# 12 mpg wt 1 71.1402536 456.309231

As you can see the method above reshapes data and creates a bigger dataset with all y and x variable combinations needed. You might have problems if you end up having a huge dataset. Or maybe someone else who's having a similar problem needs to deal with variables with big lengths and creating that big data set creates issues.

It's better to create the formula we need for each model fit instead of creating combinations of variables. This approach is similar to what @BondedDust suggested below.

library(dplyr)

# example dataset (picking useful columns)
dt <- data.frame(mtcars) %>% select(drat, mpg, disp, hp, wt, am)

# specify which columns we want as y (dependent) and x (independent)
ynames <- c("drat","mpg")
xnames <- c("disp","hp","wt")

# get unique values of the grouping variable am
groupvalues = unique(dt$am)

expand.grid(ynames,xnames,groupvalues) %>%
data.frame() %>%
select(y=Var1, x=Var2, group=Var3) %>%
mutate(formula = paste0(y," ~ B0*(",x,"^B1)*exp(B2*",x,")")) %>%
group_by(y,x,group,formula) %>%
do({ m1 <- nls( .$formula, data=dt[dt$am==.$group,],
start=c(B0 = 45, B1 = 0.2, B2 = 0.0007))
RSS <- sum(residuals(m1)^2)
TSS <- sum((dt[dt$am==.$group,][,.$y]- mean(dt[dt$am==.$group,][,.$y]))^2)
data.frame(RSS,TSS) })

# y x group formula RSS TSS
# 1 drat disp 0 drat ~ B0*(disp^B1)*exp(B2*disp) 1.3090406 2.770242
# 2 drat disp 1 drat ~ B0*(disp^B1)*exp(B2*disp) 1.1155372 1.590400
# 3 drat hp 0 drat ~ B0*(hp^B1)*exp(B2*hp) 2.1707337 2.770242
# 4 drat hp 1 drat ~ B0*(hp^B1)*exp(B2*hp) 0.8342527 1.590400
# 5 drat wt 0 drat ~ B0*(wt^B1)*exp(B2*wt) 2.2100162 2.770242
# 6 drat wt 1 drat ~ B0*(wt^B1)*exp(B2*wt) 1.1885811 1.590400
# 7 mpg disp 0 mpg ~ B0*(disp^B1)*exp(B2*disp) 98.4815286 264.587368
# 8 mpg disp 1 mpg ~ B0*(disp^B1)*exp(B2*disp) 46.8674036 456.309231
# 9 mpg hp 0 mpg ~ B0*(hp^B1)*exp(B2*hp) 74.9295161 264.587368
# 10 mpg hp 1 mpg ~ B0*(hp^B1)*exp(B2*hp) 112.5548955 456.309231
# 11 mpg wt 0 mpg ~ B0*(wt^B1)*exp(B2*wt) 104.2894519 264.587368
# 12 mpg wt 1 mpg ~ B0*(wt^B1)*exp(B2*wt) 71.1402536 456.309231

Run all posible combination of linear regression with 2 independent variables

As mentioned in the comments under the question check whether you need y or Y. Having addressed that we can use any of these. There is no need to rename the columns. We use the built in mtcars data set as an example since no test data was provided in the question. (Please always provide that in the future.)

1) ExhaustiveSearch This runs quite fast so you might be able to try combinations higher than 2 as well.

library(ExhaustiveSearch)
ExhaustiveSearch(mpg ~., mtcars, combsUpTo = 2)

2) combn Use the lmfun function defined below with combn.

dep <- "mpg"  # name of dependent variable
nms <- setdiff(names(mtcars), dep) # names of indep variables
lmfun <- function(x, dep) do.call("lm", list(reformulate(x, dep), quote(mtcars)))

lms <- combn(nms, 2, lmfun, dep = dep, simplify = FALSE)
names(lms) <- lapply(lms, formula)

3) listcompr Using lmfun from above and listcompr we can use the following. Note that we need version 0.1.1 or later of listcompr which is not yet on CRAN so we get it from github.

# install.github("patrickroocks/listcompr")
library(listcompr)
packageVersion("listcompr") # need version 0.1.1 or later

dep <- "mpg" # name of dependent variable
nms <- setdiff(names(mtcars), dep) # names of indep variables

lms2 <- gen.named.list("{nm1}.{nm2}", lmfun(c(nm1, nm2), dep),
nm1 = nms, nm2 = nms, nm1 < nm2)

map/loop different regression models for multiple dependent and independent variables

Here is a possible solution

library(dplyr)
library(tidyr)
library(purrr)

models <- function(.x) {
linear <- lm(flux_value ~ expl_value, data = .x)
exponential <- lm(log(flux_value ) ~ expl_value, data = .x)
logarithmic <- lm(flux_value ~ log(expl_value), data = .x)
power <- lm(log(flux_value) ~ log(expl_value), data = .x)
list(linear = linear ,
exponential = exponential,
logarithmic = logarithmic,
power = power) |>
map_dfr(broom::glance, .id = "model")
}

dat |>
pivot_longer(values_to = "flux_value",
names_to = "flux",
c(flux1:flux3)) |>
group_by(flux) |>
mutate(flux_value = if(any(flux_value < 0))
{
flux_value - min(flux_value) + 1e-6
}
else
flux_value) |>
ungroup() |>
pivot_longer(values_to = "expl_value",
names_to = "expl",
c(expl1:expl3)) |>
group_by(expl, flux) |>
group_modify(~models(.x))

## A tibble: 36 × 15
## Groups: expl, flux [9]
# expl flux model r.squared adj.r.squared sigma statistic p.value df
# <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 expl1 flux1 linear 0.0572 -0.0153 15.7 0.789 0.391 1
# 2 expl1 flux1 exponenti… 0.0670 -0.00478 0.188 0.933 0.352 1
# 3 expl1 flux1 logarithm… 0.0365 -0.0376 15.9 0.492 0.495 1
# 4 expl1 flux1 power 0.0448 -0.0287 0.190 0.609 0.449 1
# 5 expl1 flux2 linear 0.0167 -0.0589 3.55 0.221 0.646 1
# 6 expl1 flux2 exponenti… 0.00333 -0.0733 4.08 0.0435 0.838 1
# 7 expl1 flux2 logarithm… 0.00764 -0.0687 3.57 0.100 0.757 1
# 8 expl1 flux2 power 0.000465 -0.0764 4.09 0.00605 0.939 1
# 9 expl1 flux3 linear 0.00756 -0.0688 26.9 0.0991 0.758 1
#10 expl1 flux3 exponenti… 0.000325 -0.0766 4.81 0.00422 0.949 1
## … with 26 more rows, and 6 more variables: logLik <dbl>, AIC <dbl>,
## BIC <dbl>, deviance <dbl>, df.residual <int>, nobs <int>


Related Topics



Leave a reply



Submit