Datatype for Linear Model in R

Try running a <- as.numeric(as.character(a)) or a <- as.numeric(levels(a))[a] before the regression. Right now a is stored as a factor, so the regression treats it so that each level of a is assigned its own coefficient, giving you a stepwise response instead of a straight line.
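
For illustration, a minimal sketch of the idea (the data frame below is invented; only the conversion idiom comes from the answer above):

# hypothetical example: a numeric variable that was read in as a factor
df <- data.frame(a = factor(c("1.2", "2.5", "3.1", "4.8", "6.0")),
                 y = c(2.1, 4.9, 6.2, 9.7, 12.1))

# with a left as a factor, lm(y ~ a) would estimate one coefficient per level;
# converting it to numeric first gives a single slope instead
df$a <- as.numeric(as.character(df$a))
# equivalent, and a bit faster for large factors (applied to the original factor):
# df$a <- as.numeric(levels(df$a))[df$a]

fit <- lm(y ~ a, data = df)
coef(fit)   # an intercept and one slope for a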

Solving a linear model for a known value of y in R

Since you have fitted a low-order polynomial in raw (non-orthogonal) form (raw = TRUE), you can use polyroot to find x directly for a given y.

## pc: polynomial coefficients in increasing order
solvePC <- function (pc, y) {
  pc[1] <- pc[1] - y
  ## all roots, including complex ones
  roots <- polyroot(pc)
  ## keep real roots
  Re(roots)[abs(Im(roots)) / Mod(roots) < 1e-10]
}
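
The question's x, y and model are not shown; purely as a hedged sketch, a raw (non-orthogonal) polynomial fit of the kind the snippets below assume could be set up like this (the quoted numbers further down come from the original data and will not be reproduced exactly):

## hypothetical data and a raw cubic fit standing in for the question's model
set.seed(1)
x <- seq(1, 50, length.out = 200)
y <- 5 + 0.02 * x^2 + 0.0004 * x^3 + rnorm(200, sd = 1)
model <- lm(y ~ poly(x, 3, raw = TRUE))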

y0 <- 38.9 ## example y-value
x0 <- solvePC(coef(model), y0)
#[1] 34.28348

plot(x, y, col = 8)
lines(x, model$fitted, lwd = 2)
abline(h = y0)
abline(v = x0)

[Plot: data points (grey), fitted polynomial curve, horizontal line at y0 and vertical line at the solved root x0]

To get an interval estimate, we can use sampling methods.

## polyfit: an ordinary polynomial regression model fitted by lm()
rootCI <- function (polyfit, y, nSamples = 1000, level = 0.05) {
  ## sample regression coefficients from their joint distribution
  pc <- MASS::mvrnorm(nSamples, coef(polyfit), vcov(polyfit))
  ## for each row (a sample), call solvePC()
  roots <- apply(pc, 1, solvePC, y)
  ## confidence interval
  quantile(roots, prob = c(0.5 * level, 1 - 0.5 * level))
}

## 95% confidence interval
rootCI(model, y = y0)
#     2.5%    97.5%
# 34.17981 34.38828

Running a linear model in R with spreadsheet data

You can't fit a linear regression model with a factor as your response variable, which is what you are attempting here (type is your response variable). Linear regression requires a numeric response; you should instead look at classification models.

As Roland points out, you may wish to start by recoding your "type" variable as a logical (binary) variable. Rather than a factor called "type" with two levels "a" and "b", you might create a new variable called "is.type.a", which would contain TRUE or FALSE.

You could then fit a logistic regression, based on a binomial distribution:

model <- glm(is.type.a ~ age + gender, data = data, family = "binomial")
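
For instance, with a hypothetical data frame standing in for the spreadsheet (the column names type, age and gender are taken from the question; the values are invented), the recoding step before the glm() call above would be:

data <- data.frame(type   = c("a", "b", "a", "b", "a", "b", "a", "b", "a", "b"),
                   age    = c(23, 45, 51, 30, 38, 60, 44, 28, 35, 52),
                   gender = c("m", "f", "f", "m", "m", "f", "f", "m", "f", "m"))

data$is.type.a <- data$type == "a"   # TRUE for type "a", FALSE otherwise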

Building linear regression model from map_*() in R

If you are just interested in the R-squared, I don't think you need a map function: you can group by Department and then extract the R-squared directly:

library(dplyr)

## attrition: the IBM HR attrition data (available e.g. via the rsample or modeldata packages)
attrition %>%
  group_by(Department) %>%
  mutate(r_squared = summary(lm(MonthlyIncome ~ Age))[['r.squared']])

If you insist on using a map function, you must make sure that you really supply a function:

attrition %>%
  group_by(Department) %>%
  mutate(lm_summary = list(summary(lm(MonthlyIncome ~ Age)))) %>%
  mutate(r_squared = purrr::map_dbl(lm_summary, function(x) x[["r.squared"]]))
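
If one R-squared per Department is enough (rather than the same value repeated on every row of the group), a summarise() variant of the same idea would be (same assumptions about the attrition data as above):

attrition %>%
  group_by(Department) %>%
  summarise(r_squared = summary(lm(MonthlyIncome ~ Age))[['r.squared']])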

Saving a list with different data types

You can use saveRDS() and readRDS() for lists or any other R objects.

# list
l <- list(iris,
          'string',
          lm(Sepal.Length ~ Sepal.Width, data = iris),
          TRUE)
# path
fl <- file.path(tempdir(), 'file.rds')
# save & read
saveRDS(l, fl)
readRDS(fl)
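
As a quick check that the objects come back intact (continuing the example above):

l2 <- readRDS(fl)
class(l2[[3]])                 # "lm": the fitted model keeps its class
predict(l2[[3]], head(iris))   # and can still be used for prediction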

Linear regression in R: invalid type (list) for variable?

You need to pass only one dependent variable to lm(). If you want a separate model for each column c, you could do:

xlm <- apply(X.labels, 2, function(xl) lm(xl ~ ., data = X.training))
xlm

To get:

> xlm
$c1

Call:
lm(formula = xl ~ ., data = X.training)

Coefficients:
(Intercept)          A1          A2          A3          A4          A5          A6
   0.050096    0.002525   -0.009387    0.003754   -0.009197   -0.001056    0.017881

$c2

Call:
lm(formula = xl ~ ., data = X.training)

Coefficients:
(Intercept)          A1          A2          A3          A4          A5          A6
  0.0266587   0.0066861  -0.0007149  -0.0183789   0.0140998   0.0160385  -0.0152220

$c3

Call:
lm(formula = xl ~ ., data = X.training)

Coefficients:
(Intercept)          A1          A2          A3          A4          A5          A6
  -0.077624    0.001679    0.007541    0.006682    0.002210   -0.005104   -0.002375
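
X.labels and X.training themselves are not shown in the question; purely for illustration, data of a matching shape (predictors A1-A6, responses c1-c3) could be built as below, though running the apply() call above on it would of course give different coefficients from those quoted:

## hypothetical stand-ins for the question's data
set.seed(1)
X.training <- as.data.frame(matrix(rnorm(100 * 6), ncol = 6,
                                   dimnames = list(NULL, paste0("A", 1:6))))
X.labels   <- as.data.frame(matrix(rnorm(100 * 3), ncol = 3,
                                   dimnames = list(NULL, paste0("c", 1:3))))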

R: repeat linear regression for all variables and save results in a new data frame

You can try the following code to get the desired output:

data <- structure(list(var1 = c(12L, 3L, 13L, 17L, 9L, 15L, 12L, 3L, 
13L), var2 = c(5L, 2L, 15L, 11L, 13L, 6L, 5L, 2L, 15L), var3 = c(18L,
10L, 14L, 16L, 8L, 20L, 18L, 10L, 14L), var4 = c(19L, 6L, 13L,
18L, 8L, 17L, 19L, 6L, 13L), var5 = c(12L, 13L, 1L, 10L, 7L,
3L, 12L, 13L, 1L), var6 = c(17L, 17L, 17L, 17L, 17L, 17L, 17L,
17L, 17L), var7 = c(11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L
), var8 = c(16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L), var9 = c(18L,
18L, 18L, 18L, 18L, 18L, 18L, 18L, 18L), var10 = c(10L, 10L,
10L, 10L, 10L, 10L, 10L, 10L, 10L)), class = "data.frame", row.names = c(NA,
-9L))

head(data, 2)
#>   var1 var2 var3 var4 var5 var6 var7 var8 var9 var10
#> 1   12    5   18   19   12   17   11   16   18    10
#> 2    3    2   10    6   13   17   11   16   18    10

x = names(data[,-1])
out <- unlist(lapply(1, function(n) combn(x, 1, FUN=function(row) paste0("var1 ~ ", paste0(row, collapse = "+")))))
out
#> [1] "var1 ~ var2" "var1 ~ var3" "var1 ~ var4" "var1 ~ var5"
#> [5] "var1 ~ var6" "var1 ~ var7" "var1 ~ var8" "var1 ~ var9"
#> [9] "var1 ~ var10"

library(broom)
library(dplyr)

# To get the regression coefficients
tmp1 <- bind_rows(lapply(out, function(frml) {
  a <- tidy(lm(frml, data = data))
  a$frml <- frml
  return(a)
}))
head(tmp1)
#> # A tibble: 6 x 6
#>   term        estimate std.error statistic p.value frml
#>   <chr>          <dbl>     <dbl>     <dbl>   <dbl> <chr>
#> 1 (Intercept)    6.46      2.78      2.33  0.0529  var1 ~ var2
#> 2 var2           0.525     0.288     1.82  0.111   var1 ~ var2
#> 3 (Intercept)   -1.50      4.47     -0.335 0.748   var1 ~ var3
#> 4 var3           0.863     0.303     2.85  0.0247  var1 ~ var3
#> 5 (Intercept)    0.649     2.60      0.250 0.810   var1 ~ var4
#> 6 var4           0.766     0.183     4.18  0.00413 var1 ~ var4

# To get the model-level results, i.e. R2, AIC, BIC
tmp2 <- bind_rows(lapply(out, function(frml) {
  a <- glance(lm(frml, data = data))
  a$frml <- frml
  return(a)
}))
head(tmp2)
#> # A tibble: 6 x 12
#>   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
#>       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <int>  <dbl> <dbl> <dbl>
#> 1     0.321         0.224  4.33      3.31 0.111       2  -24.8  55.7  56.3
#> 2     0.537         0.471  3.58      8.12 0.0247      2  -23.1  52.2  52.8
#> 3     0.714         0.673  2.81     17.5  0.00413     2  -20.9  47.9  48.5
#> 4     0.276         0.173  4.47      2.67 0.146       2  -25.1  56.2  56.8
#> 5     0             0      4.92     NA   NA           1  -26.6  57.2  57.6
#> 6     0             0      4.92     NA   NA           1  -26.6  57.2  57.6
#> # ... with 3 more variables: deviance <dbl>, df.residual <int>, frml <chr>

write.csv(tmp1, "Try_lm_coefficients.csv")
write.csv(tmp2, "Try_lm_results.csv")

Created on 2019-11-20 by the reprex package (v0.3.0)


