Efficiently computing a linear combination of data.table columns
This is almost 2x faster for me than your manual version:
Reduce("+", lapply(names(DT), function(x) DT[[x]] * cf[x]))
benchmark(manual = DT[, list(cf['A']*A+cf['B']*B+cf['C']*C+cf['D']*D)],
reduce = Reduce('+', lapply(names(DT), function(x) DT[[x]] * cf[x])))
# test replications elapsed relative user.self sys.self user.child sys.child
#1 manual 100 1.43 1.744 1.08 0.36 NA NA
#2 reduce 100 0.82 1.000 0.58 0.24 NA NA
And to iterate over just mycols
, replace names(DT)
with mycols
in lapply
.
How to compute the linear combination of different columns within R data.table
One option is
DT[ sum := Reduce(`+`, DT[, 1:4] * DT[, 5:8])]
Or using .SD
DT[, sum := Reduce(`+`, .SD[, 1:4] * .SD[, 5:8])]
Or we can do
nm1 <- names(DT)[1:4]
nm2 <- paste0("coef", nm1)
DT[, sum := Reduce(`+`, Map(`*`, mget(nm1), mget(nm2)))]
Multiplying column names/factors by each row
Something like this?
df$WtAvg = with(df, A + 2*B + 3*C)
df
ID A B C WtAvg
1 1 0.2 0.4 0.4 2.2
2 2 0.3 0.3 0.4 2.1
3 3 0.6 0.2 0.2 1.6
R data.table dot product with matching column names (for each group)
Join the data.tables:
dt[coefs, res := b * i.b + d * i.d, on = "a"]
# a b c d res
#1: x 0.09901786 -0.362080111 -0.5108862 -1.0217723
#2: x -0.16128422 0.169655945 0.3199648 0.6399295
#3: x -0.79648896 -0.502279345 1.3828633 2.7657266
#4: x -0.26121421 0.480548972 -1.1559392 -2.3118783
#5: x 0.54085591 -0.601323442 1.3833795 2.7667590
#6: y 0.83662761 0.607666970 0.6320762 2.7328562
#7: y -1.92510391 -0.050515610 -0.3176544 -2.8780671
#8: y 1.65639926 -0.167090105 0.6830158 3.7054466
#9: y 1.48772354 -0.349713539 -1.2736467 -2.3332166
#10: y 1.49065993 0.008198885 -0.1923361 0.9136516
Usually you would use the matrix product here, but that would mean that you had to coerce the respective subset to a matrix. That would result in a copy being made and since data.tables are mainly used for larger data, you want to avoid copies.
If you need dynamic column names, the most simple solution that comes to mind is actually an eval
/parse
construct:
cols = colnames(coefs)[-1]
expr <- parse(text = paste(paste(cols, paste0("i.", cols), sep = "*"), collapse = "+"))
#expression(b*i.b+d*i.d)
dt[coefs, res := eval(expr), on = "a"]
Maybe someone else can suggest a better solution.
Here is a solution using matrix multiplication:
dt[, res := as.matrix(.SD) %*% unlist(coefs[a == .BY, .SD, .SDcols = cols]),
by = "a", .SDcols = cols]
Of course this makes copies, which is potentially less efficient then the eval
solution.
Computing squared residual in regression row-wise in a data.table
Assuming y
is the first column of your data table dat
and the rest of the columns are predictors. This works for bonus 1.
mat = as.matrix(dat[, x1:x3, with = F])
pred = cbind(1, mat) %*% beta
dat[, rss := (pred - y)^2]
For bonus 2:
dat[, mean_by_grp := mean(rss), by = grp]
To avoid the matrix conversion, you could do this:
dat[, pred := beta[1] + beta[2] * x1 + beta[3] * x2 + beta[4] * x3]
writing out the inner product.
Complete reproducible example
set.seed(47)
dat = data.table(replicate(4, rnorm(5)))
setnames(dat, c("y", paste0("x", 1:3)))
dat[, grp := c("A", "A", "B", "B", "B")]
beta = 1:4
mat = as.matrix(dat[, x1:x3, with = F])
pred = cbind(1, mat) %*% beta
dat[, rss := (pred - y) ^ 2]
dat[, mean_by_grp := mean(rss), by = grp]
dat
# y x1 x2 x3 grp rss mean_by_grp
# 1: 1.9946963 -1.08573747 -0.92245624 0.67077922 A 10.565250 7.064164
# 2: 0.7111425 -0.98548216 0.03960243 -0.08107805 A 3.563078 7.064164
# 3: 0.1854053 0.01513086 0.49382018 1.26424109 B 54.512843 38.263204
# 4: -0.2817650 -0.25204590 -1.82822917 -0.70338819 B 56.558929 38.263204
# 5: 0.1087755 -1.46575030 0.09147291 -0.04057817 B 3.717840 38.263204
How to find linear regression between groups using data.table?
With the new data, we could split
the data by 'group' into a list
. Then, use combn
on the names
of the list
for pairwise combination, extract the list
elements (s1
, s2
), check if there are any
common 'time' (intersect
). Use a condition based on length
i.e. if there are common elements, then apply the lm
on the corresponding 'value' columns, create a data.table with summarised coef
along with the group names and rbind
the list
elements
library(data.table)
lst1 <- split(dt, dt$group)
rbindlist(combn(names(lst1), 2, FUN = function(x) {
s1 <- lst1[[x[1]]]
s2 <- lst1[[x[2]]]
i1 <- intersect(s1$time, s2$time)
if(length(i1) > 0) na.omit(s1[s2, on = .(time)][,
. (group1 = first(s1$group), group2 = first(s2$group),
regression = lm(i.value ~ value)$coef[2])])
else
data.table(group1 = first(s1$group), group2 = first(s2$group),
regression = NA_real_)}, simplify = FALSE))
-output
group1 group2 regression
1: a b 0.03033996
2: a c 0.06391242
3: a d -0.09138112
4: a e -0.27738183
5: b c 0.05663270
6: b d 0.05481604
7: b e 0.27789495
8: c d -0.13987978
9: c e 0.16388299
10: d e 0.12380720
If we want full combinations, use either expand.grid
or CJ
(from data.table
dt2 <- CJ(group1 = names(lst1), group2 = names(lst1))[group1 != group2]
dt2[, rbindlist(Map(function(x, y) {
s1 <- lst1[[x]]
s2 <- lst1[[y]]
i1 <- intersect(s1$time, s2$time)
if(length(i1) > 0) na.omit(s1[s2, on = .(time)][,
data.table(group1 = x, group2 = y,
regresion = lm(i.value ~ value)$coef[2])]) else
data.table(group1 = x, group2 = y, regression = NA_real_)
}, group1, group2))]
-output
group1 group2 regresion
1: a b 0.03033996
2: a c 0.06391242
3: a d -0.09138112
4: a e -0.27738183
5: b a 0.03247826
6: b c 0.05663270
7: b d 0.05481604
8: b e 0.27789495
9: c a 0.07488082
10: c b 0.06198333
11: c d -0.13987978
12: c e 0.16388299
13: d a -0.09295215
14: d b 0.05208743
15: d c -0.12144302
16: d e 0.12380720
17: e a -0.25136439
18: e b 0.34052322
19: e c 0.28677255
20: e d 0.21435666
data.table sumproduct style vector multiplication
Using matrix multiplication:
coeffs <- as.vector(c(2,3,4))
dt2 <- DT[,Calc := as.matrix(DT[,..vars])%*%coeffs]
Create new variable that is linear combination of many other variables
You can try
df1$newvar <- as.matrix(df1) %*% v1
Or
df1$newvar <- rowSums(sweep(df1, 2, v1, FUN='*'))
Or as suggested by @Frank based on the post
df1$newvar <- Reduce(`+`,lapply(seq_along(v1),function(i)df1[[i]]*v1[i]))
data
set.seed(24)
df1 <- as.data.frame(matrix( rnorm(100*50,mean=0,sd=1), 100, 50))
set.seed(48)
v1 <- runif(50)
R creating a comprehensive table of correlation between combinations of columns
Do you expect such kind of matrix?
df <- structure(list(INDEX = 1:6, TARGET_WINS = c(39L, 70L, 86L, 70L,
82L, 75L), TEAM_BATTING_H = c(1445L, 1339L, 1377L, 1387L, 1297L,
1279L), TEAM_BATTING_2B = c(194L, 219L, 232L, 209L, 186L, 200L
), TEAM_BATTING_3B = c(39L, 22L, 35L, 38L, 27L, 36L), TEAM_BATTING_HR = c(13L,
190L, 137L, 96L, 102L, 92L), TEAM_BATTING_BB = c(143L, 685L,
602L, 451L, 472L, 443L), TEAM_BATTING_SO = c(842L, 1075L, 917L,
922L, 920L, 973L), TEAM_BASERUN_SB = c(NA, 37L, 46L, 43L, 49L,
107L), TEAM_BASERUN_CS = c(NA, 28L, 27L, 30L, 39L, 59L), TEAM_BATTING_HBP = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_
), TEAM_PITCHING_H = c(9364L, 1347L, 1377L, 1396L, 1297L, 1279L
), TEAM_PITCHING_HR = c(84L, 191L, 137L, 97L, 102L, 92L), TEAM_PITCHING_BB = c(927L,
689L, 602L, 454L, 472L, 443L), TEAM_PITCHING_SO = c(5456L, 1082L,
917L, 928L, 920L, 973L), TEAM_FIELDING_E = c(1011L, 193L, 175L,
164L, 138L, 123L), TEAM_FIELDING_DP = c(NA, 155L, 153L, 156L,
168L, 149L)), row.names = c(NA, 6L), class = "data.frame")
# install.packages("corrr")
library(corrr)
df1 <- corrr::correlate(df, method = "pearson")
# 1. Output:
# A tibble: 17 x 18
term INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 INDEX NA 0.642 -0.820 -0.291 0.0236 0.0826 0.205
2 TARG~ 0.642 NA -0.685 0.367 -0.373 0.673 0.788
3 TEAM~ -0.820 -0.685 NA 0.192 0.496 -0.449 -0.502
4 TEAM~ -0.291 0.367 0.192 NA -0.0789 0.640 0.653
5 TEAM~ 0.0236 -0.373 0.496 -0.0789 NA -0.752 -0.676
6 TEAM~ 0.0826 0.673 -0.449 0.640 -0.752 NA 0.984
7 TEAM~ 0.205 0.788 -0.502 0.653 -0.676 0.984 NA
8 TEAM~ 0.134 0.401 -0.560 0.377 -0.754 0.864 0.799
9 TEAM~ 0.790 -0.00267 -0.690 -0.356 0.413 -0.528 -0.541
10 TEAM~ 0.874 -0.0332 -0.834 -0.598 0.261 -0.578 -0.623
11 TEAM~ NA NA NA NA NA NA NA
12 TEAM~ -0.662 -0.923 0.733 -0.358 0.448 -0.771 -0.852
13 TEAM~ -0.352 0.308 -0.127 0.661 -0.767 0.891 0.809
14 TEAM~ -0.914 -0.793 0.736 0.0225 0.0863 -0.341 -0.464
15 TEAM~ -0.667 -0.930 0.719 -0.360 0.424 -0.757 -0.842
16 TEAM~ -0.707 -0.925 0.757 -0.314 0.418 -0.733 -0.820
17 TEAM~ 0.0666 0.265 -0.144 -0.583 -0.447 -0.123 -0.150
Related Topics
How to Stack Error Bars in a Stacked Bar Plot Using Geom_Errorbar
How to Find Out Which Package Version Is Loaded in R
Accept Http Request in R Shiny Application
Dplyr Issues When Using Group_By(Multiple Variables)
Creating Regular 15-Minute Time-Series from Irregular Time-Series
How 'Poly()' Generates Orthogonal Polynomials? How to Understand the "Coefs" Returned
Reason Behind Speed of Fread in Data.Table Package in R
How to Plot Multiple Stacked Histograms Together in R
How to Assign a Value Using If-Else Conditions in R
Suggestions for Speeding Up Random Forests