Looping Over Combinations of Regression Model Terms

Looping over combinations of regression model terms

With this dummy data:

# Simulated example data: a Poisson-distributed response `y` together with
# five uniformly distributed columns (x1-x3 and z1-z2), 100 rows in total.
dat1 <- data.frame(
  y  = rpois(100, 5),
  x1 = runif(100),
  x2 = runif(100),
  x3 = runif(100),
  z1 = runif(100),
  z2 = runif(100)
)

You could get your list of two lm objects this way:

 # One fit per extra predictor column (z1 and z2, i.e. columns 5 and 6 of dat1);
 # the result is a list of lm objects named after those columns.
 lapply(dat1[c("z1", "z2")], function(x) {
   lm(dat1$y ~ dat1$x1 + dat1$x2 + dat1$x3 + x)
 })

This iterates over those two columns and substitutes each one, in turn, as the final term of the lm call.

As Alex notes below, it's preferable to pass the names through the formula, rather than the actual data columns as I have done here.

for loops for regression over multiple variables & outputting a subset

Really hard to give a definitive answer without knowing the structure of your data beforehand, but this might work. I'm assuming that your two data frames have the same number of rows (observations):

# Put the three covariate columns from AA next to the 86 outcome columns from BB
df <- cbind( AA[ , 2:4 ] , BB[ , 6:91 ] )
# Fit one model per outcome column (columns 4-89 of df) against the three
# covariates. lapply() visits each column as-is, whereas apply() would first
# coerce the whole data frame to a matrix.
mods <- lapply( as.data.frame( df[ , 4:89 ] ) , function(x){ lm( x ~ df[,1] + df[,2] + df[,3] ) } )

# The rows of this matrix will correspond to the intercept, gender, age, race, and the columns are the results for each of your 86 genetic positions
# (column 4 of the coefficient table holds the p-values)
pvals <- sapply( mods , function(x){ summary(x)$coefficients[,4] } )

As to whether or not that is the right thing to do I will trust to your judgement as a genetic epidemiologist!

Creating a loop through a list of variables for an LM model in R

You don't even have to use loops. Apply should work nicely.

# Toy data: the integers 1..64 in random order, laid out as an 8 x 8 data frame
# whose first column is the response and whose other columns are x1..x7.
training_data <- as.data.frame(matrix(sample(1:64), nrow = 8))
names(training_data) <- c("independent_variable", paste0("x", 1:7))

# Right-hand sides to try: x1 + x2 plus exactly one of x3..x7.
Vars <- as.list(paste0("x1+x2+x", 3:7))

# Turn every right-hand side into a formula, then fit each one on the same data.
allModelsList <- lapply(paste("independent_variable ~", Vars), as.formula)
allModelsResults <- lapply(
  allModelsList,
  function(x) lm(x, data = training_data)
)

If you need the model summaries, you can add:

# One summary object per fitted model, in the same order as allModelsResults.
allModelsSummaries <- lapply(allModelsResults, summary)

For example, you can access the coefficient of determination (R²) of the model lm(independent_variable ~ x1+x2+x3) like this:

# R-squared stored in the summary of the first model in the list.
allModelsSummaries[[1]][["r.squared"]]

I hope it helps.

Creating loop over columns to calculate regression and then compare best combination of variables

We could change the line that fits the model to

fit(as.formula(paste(colnames1[i], "~ .")), data = a)

-full function

# Regress each column of `a` on all of the other columns (formula "<col> ~ .")
# and collect one value per fit.
#
# a: a data frame; its column names are normalised with janitor::clean_names()
#    before any formula is built.
#
# Returns a list with one element per column of `a`. Each element is the
# 1 x 1 subset tidy(fit)[1, 2], i.e. the first row and second column of the
# tidied coefficient table (printed as `estimate` in the sample output).
quant <- function(a) {
  a <- janitor::clean_names(a)
  colnames1 <- colnames(a)

  # A single linear-regression specification reused for every fit.
  lm_model <- set_mode(set_engine(linear_reg(), 'lm'), 'regression')

  # Column i is the response; "." expands to every other column of `a`.
  lapply(seq_along(a), function(i) {
    lm_fit <- fit(lm_model, as.formula(paste(colnames1[i], "~ .")), data = a)
    tidy(lm_fit)[1, 2]
  })
}

-testing

> dat <- tibble(col1 = 1:5, col2 = 5:1)
> quant(dat)
[[1]]
# A tibble: 1 × 1
  estimate
     <dbl>
1        6

[[2]]
# A tibble: 1 × 1
  estimate
     <dbl>
1        6

Loop over combinations of column names with lavaan syntax

Here's one way of doing it:

# For every row of `permut`: pick the columns of `df` listed in that row,
# rename them to the row's element names, and fit `mod` to the result.
fits <- apply(permut, 1, function(perm) {
    relabelled <- setNames(df[perm], names(perm))
    sem(mod, relabelled, se="robust")
})

fits contains the SEM results for every 3-permutation in permut. To see the estimates of, e.g., the first fit, you can proceed as usual:

> parameterestimates(fits[[1]]) %>% filter(op != "~~")
  lhs op     rhs label         est         se          z     pvalue    ci.lower
1   M  ~       X     a -0.18393765 0.10977670 -1.6755618 0.09382406 -0.39909603
2   Y  ~       X     c  0.07314372 0.09891034  0.7394952 0.45960637 -0.12071699
3   Y  ~       M     b  0.01944518 0.08852450  0.2196587 0.82613697 -0.15405965
4 ind :=     a*b   ind -0.00357670 0.01600038 -0.2235385 0.82311644 -0.03493686
5 tot := c+(a*b)   tot  0.06956702 0.09816192  0.7086966 0.47851276 -0.12282680
    ci.upper
1 0.03122074
2 0.26700443
3 0.19295001
4 0.02778346
5 0.26196084

Linear Regression loop for each independent variable individually against dependent

Hi, try something like this:

# One formula per predictor: mpg regressed on each remaining mtcars column.
models <- lapply(paste0("mpg~", names(mtcars)[-1]), formula)
# Fit and summarise every model, labelling each summary with its formula text.
res.models <- setNames(
  lapply(models, function(x) summary(lm(formula = x, data = mtcars))),
  paste0("mpg~", names(mtcars)[-1])
)
res.models[["mpg~disp"]]

# Call:
# lm(formula = x, data = mtcars)

# Residuals:
#     Min      1Q  Median      3Q     Max 
# -4.8922 -2.2022 -0.9631  1.6272  7.2305 

# Coefficients:
#              Estimate Std. Error t value Pr(>|t|)    
# (Intercept) 29.599855   1.229720  24.070  < 2e-16 ***
# disp        -0.041215   0.004712  -8.747 9.38e-10 ***
# ---
# Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

# Residual standard error: 3.251 on 30 degrees of freedom
# Multiple R-squared:  0.7183,  Adjusted R-squared:  0.709 
# F-statistic: 76.51 on 1 and 30 DF,  p-value: 9.38e-10

Looping Regression Model grouped by two columns in R using lapply

library(dplyr)
library(tidyr) #nest
library(broom) #tidy
library(purrr) #map

# Fit COUNT ~ YEAR separately within every Region / Illness_Code group, tidy
# each fit into a coefficient table, and unnest those tables into rows.
df %>%
  group_by(Region, Illness_Code) %>%
  nest() %>%
  mutate(
    fit = map(data, ~ lm(COUNT ~ YEAR, data = .)),
    results = map(fit, tidy)
  ) %>%
  unnest(results)

# A tibble: 6 x 7
Region Illness_Code term         estimate std.error statistic p.value
<fct>  <fct>        <chr>           <dbl>     <dbl>     <dbl>   <dbl>
1 A      ABC          (Intercept)  111984.    51770.     2.16     0.276
2 A      ABC          YEAR            -55.5      25.7   -2.16     0.276
3 B      XYZ          (Intercept)  212804.  3494736.     0.0609   0.961
4 B      XYZ          YEAR           -105.     1734.    -0.0605   0.962
5 C      ABC          (Intercept)  211768.   122153.     1.73     0.333
6 C      ABC          YEAR           -105.       60.6   -1.73     0.333

Using lapply and split

# Split once, so that the indicator and the groups that are actually fitted
# are guaranteed to come from the same data frame (and the same ordering).
groups <- split(df1, list(df1$Region, df1$Illness_Code))

# Identify list elements with nrow greater than one
# (vapply always yields a logical vector, even when `groups` is empty)
Ind <- vapply(groups, function(x) nrow(x) > 1, logical(1))

lapply(
  # Loop only through list elements with nrow > 1
  groups[Ind],
  function(x) {
    m <- lm(formula = COUNT ~ YEAR, data = x)
    # One row per group: the fitted coefficients next to the COUNT value(s)
    # recorded for YEAR == 2016
    as.data.frame(cbind(t(coef(m)), 'Year_2016' = x[x$YEAR == 2016, 'COUNT']))
  })

By default, split(df1, list(df1$Region,df1$Illness_Code)) generates a list containing every interaction between the levels of Region and Illness_Code. Some of these interactions have nrow=0 (e.g. $B.ABC and $A.XYZ), which will cause problems later, so we need to remove them using an indicator.

Looping Over Combinations of Regression Model Terms