Add New Columns to a Data.Table Containing Many Variables

Add new columns to a data.table containing many variables

I think with a small modification to your last code you can easily do both for as many variables you want

vars <- c("x2", "x3") # <- Choose the variable you want to operate on

d[, paste0(vars, "_", "scale") := lapply(.SD, function(x) scale(x)[, 1]), .SDcols = vars, by = Stock]
d[, paste0(vars, "_", "sum") := lapply(.SD, sum), .SDcols = vars, by = Stock]

##          Time Stock x1 x2 x3   x2_scale   x3_scale x2_sum x3_sum
## 1: 2014-08-22     A 13 14 32 -1.1338934  1.1323092     87     44
## 2: 2014-08-23     A 25 39  9  0.7559289 -0.3701780     87     44
## 3: 2014-08-24     A 18 34  3  0.3779645 -0.7621312     87     44
## 4: 2014-08-22     B 44  8  6 -0.4730162 -0.7258662     59     32
## 5: 2014-08-23     B 49  3 18 -0.6757374  1.1406469     59     32
## 6: 2014-08-24     B 15 48  8  1.1487535 -0.4147807     59     32

For simple functions (that don't need special treatment like scale) you could easily do something like

vars <- c("x2", "x3") # <- Define the variable you want to operate on
funs <- c("min", "max", "mean", "sum") # <- define your function
for(i in funs){
  d[, paste0(vars, "_", i) := lapply(.SD, eval(i)), .SDcols = vars, by = Stock] 
}

Dynamically create new columns in data.table

The last two statements return an error message:

dat[,`:=`(paste0(date_col, "_year", sep="") = substr(noquote(get(date_col))[!is.na(noquote(get(date_col)))],1,4))][]

Error: unexpected '=' in "dat[,`:=`(paste0(date_col, "_year", sep="") ="

dat[,`:=`(noquote(paste0(date_col, "_year", sep="")) = substr(noquote(get(date_col))[!is.na(noquote(get(date_col)))],1,4))][]

Error: unexpected '=' in "dat[,`:=`(noquote(paste0(date_col, "_year", sep="")) ="

The correct syntax for calling the :=() function is:

dat[, `:=`(paste0(date_col, "_year", sep = ""), 
           substr(noquote(get(date_col))[!is.na(noquote(get(date_col)))], 1, 4))][]
dat[, `:=`(noquote(paste0(date_col, "_year", sep = "")), 
           substr(noquote(get(date_col))[!is.na(noquote(get(date_col)))], 1, 4))][]

i.e., replace = by ,.

However, assignment syntax and right hand side are far too complicated.

The order_date column is already of class Date:

str(dat)

Classes ‘data.table’ and 'data.frame':    5 obs. of  3 variables:
 $ one       : int  1 2 3 4 5
 $ two       : int  1 2 3 4 5
 $ order_date: Date, format: "2015-01-01" "2015-02-01" ...
 - attr(*, ".internal.selfref")=<externalptr>

In order to extract the year, year() function can be used (either from the data.table package or the lubridate package whatever is loaded last), so no conversion back to character and extraction of the year string is required:

date_col = "order_date"
dat[, paste0(date_col, "_year") := lapply(.SD, year), .SDcols = date_col][]

   one two order_date order_date_year
1:   1   1 2015-01-01            2015
2:   2   2 2015-02-01            2015
3:   3   3 2015-03-01            2015
4:   4   4 2015-04-01            2015
5:   5   5 2015-05-01            2015

Alternatively,

dat[, paste0(date_col, "_year") := year(get(date_col))][]
dat[, `:=`(paste0(date_col, "_year"), year(get(date_col)))][]

work as well.

Assign multiple columns using := in data.table, by group

This now works in v1.8.3 on R-Forge. Thanks for highlighting it!

x <- data.table(a = 1:3, b = 1:6) 
f <- function(x) {list("hi", "hello")} 
x[ , c("col1", "col2") := f(), by = a][]
#    a b col1  col2
# 1: 1 1   hi hello
# 2: 2 2   hi hello
# 3: 3 3   hi hello
# 4: 1 4   hi hello
# 5: 2 5   hi hello
# 6: 3 6   hi hello

x[ , c("mean", "sum") := list(mean(b), sum(b)), by = a][]
#    a b col1  col2 mean sum
# 1: 1 1   hi hello  2.5   5
# 2: 2 2   hi hello  3.5   7
# 3: 3 3   hi hello  4.5   9
# 4: 1 4   hi hello  2.5   5
# 5: 2 5   hi hello  3.5   7
# 6: 3 6   hi hello  4.5   9 

mynames = c("Name1", "Longer%")
x[ , (mynames) := list(mean(b) * 4, sum(b) * 3), by = a]
#     a b col1  col2 mean sum Name1 Longer%
# 1: 1 1   hi hello  2.5   5    10      15
# 2: 2 2   hi hello  3.5   7    14      21
# 3: 3 3   hi hello  4.5   9    18      27
# 4: 1 4   hi hello  2.5   5    10      15
# 5: 2 5   hi hello  3.5   7    14      21
# 6: 3 6   hi hello  4.5   9    18      27

x[ , get("mynames") := list(mean(b) * 4, sum(b) * 3), by = a][]  # same
#    a b col1  col2 mean sum Name1 Longer%
# 1: 1 1   hi hello  2.5   5    10      15
# 2: 2 2   hi hello  3.5   7    14      21
# 3: 3 3   hi hello  4.5   9    18      27
# 4: 1 4   hi hello  2.5   5    10      15
# 5: 2 5   hi hello  3.5   7    14      21
# 6: 3 6   hi hello  4.5   9    18      27

x[ , eval(mynames) := list(mean(b) * 4, sum(b) * 3), by = a][]   # same
#    a b col1  col2 mean sum Name1 Longer%
# 1: 1 1   hi hello  2.5   5    10      15
# 2: 2 2   hi hello  3.5   7    14      21
# 3: 3 3   hi hello  4.5   9    18      27
# 4: 1 4   hi hello  2.5   5    10      15
# 5: 2 5   hi hello  3.5   7    14      21
# 6: 3 6   hi hello  4.5   9    18      27

Older version using the with argument (we discourage this argument when possible):

x[ , mynames := list(mean(b) * 4, sum(b) * 3), by = a, with = FALSE][] # same
#    a b col1  col2 mean sum Name1 Longer%
# 1: 1 1   hi hello  2.5   5    10      15
# 2: 2 2   hi hello  3.5   7    14      21
# 3: 3 3   hi hello  4.5   9    18      27
# 4: 1 4   hi hello  2.5   5    10      15
# 5: 2 5   hi hello  3.5   7    14      21
# 6: 3 6   hi hello  4.5   9    18      27

R data.table add new column with values from other columns by referencing

With fcase:

cols <- unique(dt$Label)
dt[,newCol:=eval(parse(text=paste('fcase(',paste0("Label=='",cols,"',Col_",cols,collapse=','),')')))][]

    Label Col_A Col_B Col_C newCol
   <char> <num> <num> <num>  <num>
1:      A     2     1     2      2
2:      B     3     4     0      4
3:      C     5     3     4      4
4:      A     0     5     1      0
5:      B     2     2     5      2
6:      C     7     0     6      6
7:      A     6     7     7      6
8:      B     8     5     3      5
9:      C     9     8     0      0

assign new column using a variable in data.table

Consolidating some of the comments into an answer here...

From ?set, you can find that the overhead of calling [.data.table repeatedly can add up. In those cases, you can try set instead.

Also, any set* functions should be followed by [] to print the output.

With that, here are the two alternatives:

copy1 <- copy2 <- copy3 <- input_allele[,c(1:6)]
new <- colnames(input_allele[,.SD,.SDcols=c(7:11)])

## Using `set` :

for (i in new) {
  set(copy1, j = i, value = 0)[]
}
head(copy1)
##     FID   IID PAT MAT SEX PHENOTYPE SNP1 SNP2 SNP3 SNP4 SNP5
## 1: gid1 IID11   1   0   1         1    0    0    0    0    0
## 2: gid2 IID12   2   0   1         1    0    0    0    0    0
## 3: gid3 IID13   3   0   1         1    0    0    0    0    0
## 4: gid4 IID14   4   0   1         1    0    0    0    0    0
## 5: gid5 IID15   5   0   1         1    0    0    0    0    0
## 6: gid6 IID16   6   0   1         1    0    0    0    0    0
   
## Using `:=` :

for (i in new) {
  copy2[, (i) := 0][]
}
head(copy2)
##     FID   IID PAT MAT SEX PHENOTYPE SNP1 SNP2 SNP3 SNP4 SNP5
## 1: gid1 IID11   1   0   1         1    0    0    0    0    0
## 2: gid2 IID12   2   0   1         1    0    0    0    0    0
## 3: gid3 IID13   3   0   1         1    0    0    0    0    0
## 4: gid4 IID14   4   0   1         1    0    0    0    0    0
## 5: gid5 IID15   5   0   1         1    0    0    0    0    0
## 6: gid6 IID16   6   0   1         1    0    0    0    0    0

You could also avoid the loop:

copy3[, (new) := as.list(rep(0, length(new)))][]
##       FID   IID PAT MAT SEX PHENOTYPE SNP1 SNP2 SNP3 SNP4 SNP5
##  1:  gid1 IID11   1   0   1         1    0    0    0    0    0
##  2:  gid2 IID12   2   0   1         1    0    0    0    0    0
##  3:  gid3 IID13   3   0   1         1    0    0    0    0    0
##  4:  gid4 IID14   4   0   1         1    0    0    0    0    0
##  5:  gid5 IID15   5   0   1         1    0    0    0    0    0
##  6:  gid6 IID16   6   0   1         1    0    0    0    0    0
##  7:  gid7 IID17   7   0   1         1    0    0    0    0    0
##  8:  gid8 IID18   8   0   1         1    0    0    0    0    0
##  9:  gid9 IID19   9   0   1         1    0    0    0    0    0
## 10: gid10 IID20  10   0   1         1    0    0    0    0    0

Note that quote and eval are not needed for these.

Even with this small dataset, the performance difference between set and using := in a loop is measurable:

fun1 <- function() { for (i in new) { set(copy1, j = i, value = 0)[] }; copy1 }
fun2 <- function() { for (i in new) { copy2[, (i) := 0][] } ; copy2 }
fun3 <- function() copy3[, (new) := as.list(rep(0, length(new)))][]

bench::mark(fun1(), fun2(), fun3())
## # A tibble: 3 x 13
##   expression     min   median `itr/sec` mem_alloc `gc/sec` n_itr  n_gc
##   <bch:expr> <bch:t> <bch:tm>     <dbl> <bch:byt>    <dbl> <int> <dbl>
## 1 fun1()      64.9µs  69.63µs    13932.        0B     4.17  6689     2
## 2 fun2()       993µs   1.07ms      910.   377.6KB     4.23   430     2
## 3 fun3()     241.9µs 255.12µs     3793.    16.4KB     4.30  1763     2
## # … with 5 more variables: total_time <bch:tm>, result <list>, memory <list>,
## #   time <list>, gc <list>

Create multiple variables in data.table based other variables names

This first part is mostly defensive, guarding against: a* variables without matching b* variables; vice versa; and different order of each:

anames <- grep("^a[0-9]+$", colnames(DT), value = TRUE)
bnames <- grep("^b[0-9]+$", colnames(DT), value = TRUE)
numnames <- gsub("^a", "", anames)
anames <- sort(anames[gsub("^a", "", anames) %in% numnames])
bnames <- sort(bnames[gsub("^b", "", bnames) %in% numnames])
cnames <- gsub("^b", "c", bnames)

If you know the number ranges a priori and want something less-dynamic but more straight-forward, then

anames <- paste0("a", 1:3)
bnames <- paste0("b", 1:3)
cnames <- paste0("c", 1:3)

Now the magic:

DT[, (cnames) := Map(`+`, mget(anames), mget(bnames)) ]
DT
#    a1 a2 a3 b1 b2 b3 c1 c2 c3
# 1:  1  2  3  5 20 25  6 22 28
# 2:  2  4  6  6 24 30  8 28 36
# 3:  3  6  9  7 28 35 10 34 44

data.table adding column to several data tables in loop

The best way is to store your objects in a list and then access them by indexing the list.

library(data.table)

my_dts <- list (
  centro = data.table(x=letters[1:5], y = 1:5),
  sur = data.table(x=letters[2:7], y = 3:8),
  norte = data.table(x=letters[2:10], y = 3:11)
)

st <- c("centro", "sur", "norte")
for (i in st) my_dts[[i]][, store := i]

Add New Columns to a Data.Table Containing Many Variables