How to Programmatically Create Binary Columns Based on a Categorical Variable in Data.Table

How to programmatically create binary columns based on a categorical variable in data.table?

data.table has its own dcast implementation using data.table's internals and should be fast. Give this a try:

dcast(dt, id ~ y, fun.aggregate = function(x) 1L, fill=0L)
#    id a b c d e
# 1:  1 0 1 1 1 1
# 2:  2 1 0 1 0 1
# 3:  3 1 0 1 1 1

Just thought of another way to handle this by preallocating and updating by reference (perhaps dcast's logic should be done like this to avoid intermediates).

ans = data.table(id = unique(dt$id))[, unique(dt$y) := 0L][]

All that's left is to fill existing combinations with 1L.

dt[, {set(ans, i=.GRP, j=unique(y), value=1L); NULL}, by=id]
ans
#    id b d c e a
# 1:  1 1 1 1 1 0
# 2:  2 0 0 1 1 1
# 3:  3 0 1 1 1 1

Okay, I've gone ahead and benchmarked on OP's data dimensions with ~10 million rows and 10 columns.

# library() stops with an error if data.table is missing; require() would only
# warn and return FALSE, deferring the failure to the first data.table call.
library(data.table)
set.seed(45L)
# Build 10 random 20-letter strings (one per matrix row) used as the categories.
y <- apply(matrix(sample(letters, 10L*20L, TRUE), ncol=20L), 1L, paste, collapse="")
# 1e7 rows: ids sampled from 1..1e5 and categories sampled from the 10 strings.
dt <- data.table(id=sample(1e5,1e7,TRUE), y=sample(y,1e7,TRUE))

system.time(ans1 <- AnsFunction())   # 2.3s
system.time(ans2 <- dcastFunction()) # 2.2s
system.time(ans3 <- TableFunction()) # 6.2s

setcolorder(ans1, names(ans2))
setcolorder(ans3, names(ans2))
setorder(ans1, id)
setkey(ans2, NULL)
setorder(ans3, id)

identical(ans1, ans2) # TRUE
identical(ans1, ans3) # TRUE

where,

AnsFunction <- function() {
    # Pre-allocate the result: one row per distinct id (in order of first
    # appearance in the global `dt`) and one integer column per distinct y
    # value, every cell initialised to 0L.
    ids <- unique(dt$id)
    levs <- unique(dt$y)
    ans <- data.table(id = ids)
    ans[, (levs) := 0L]
    # Visit dt one id-group at a time. .GRP is the group counter, which lines up
    # with the row of `ans` because both follow first-appearance order of id.
    # set() flips the cells for the y values seen in the group to 1L by
    # reference; NULL keeps the grouped call from building a result.
    dt[, {
        set(ans, i = .GRP, j = unique(y), value = 1L)
        NULL
    }, by = id]
    # reorder columns outside
    ans
}

dcastFunction <- function() {
    # Cast the global `dt` to wide form: one row per id, one column per y value.
    # Every (id, y) combination that occurs is aggregated to 1L; combinations
    # that never occur are filled with 0L.
    #
    # no need to load reshape2. data.table has its own dcast as well
    # no need for setDT
    #
    # The dcast() result is the function's value directly. Assigning it to an
    # unused local as the last expression would return it invisibly.
    dcast(dt, id ~ y, fun.aggregate = function(x) 1L, fill = 0L,
          value.var = "y")
}

TableFunction <- function() {
    # Base-R route: cross-tabulate id against y from the global `dt`, clamp the
    # counts to a 0/1 indicator, then convert to a data.table.
    counts <- table(dt$id, dt$y)
    wide <- as.data.frame.matrix(counts)
    # Integer literals are used so the result compares identical() to the
    # integer output of the other approaches.
    wide[wide > 1L] <- 1L
    # The ids are carried as row names by table(); restore them as an integer
    # column placed first.
    ids <- as.integer(row.names(wide))
    wide <- cbind(id = ids, wide)
    # setDT() converts by reference and its return value is the function's value.
    setDT(wide)
}

Create a binary variable based on the first appearance of another (date) variable

We can replace the ifelse and also the base R methods. Create 'first' as 0, then join with 'dt2' on the columns in the post and set 'first' to 1 for the matching rows. Finally, negate 'first' (!), convert the result to integer with unary + (or as.integer), and assign it to 'rev'.

dt[, first := 0][dt2, first := 1, on = .(id, year, analyst, fdate)]
dt[, rev := +(!first)][]
#     analyst id       year      fdate first rev
# 1:       1  1 2009-12-31 2009-07-31     1   0
# 2:       2  1 2009-12-31 2009-10-08     0   1
# 3:       1  1 2010-12-31 2010-05-03     1   0
# 4:       2  1 2010-12-31 2010-10-14     0   1
# 5:       1  2 2009-12-31 2009-02-17     1   0
# 6:       2  2 2009-12-31 2009-02-26     0   1
# 7:       1  2 2010-12-31 2010-07-31     0   1
# 8:       2  2 2010-12-31 2010-04-13     1   0
# 9:       1  3 2009-12-31 2009-10-30     0   1
#10:       2  3 2009-12-31 2009-09-14     1   0
#11:       1  3 2010-12-31 2010-01-31     1   0
#12:       2  3 2010-12-31 2010-11-30     0   1
#13:       1  4 2009-12-31 2009-01-31     1   0
#14:       2  4 2009-12-31 2009-11-02     0   1
#15:       1  4 2010-12-31 2010-08-02     0   1
#16:       2  4 2010-12-31 2010-05-15     1   0
#17:       1  5 2009-12-31 2009-06-30     0   1
#18:       2  5 2009-12-31 2009-06-26     1   0
#19:       1  5 2010-12-31 2010-03-28     1   0
#20:       2  5 2010-12-31 2010-10-03     0   1

How to use cast or another function to create a binary table in R

Original data:

x <- data.frame(id=c(1,1,2,3,3), region=factor(c(2,3,2,1,1)))

> x
  id region
1  1      2
2  1      3
3  2      2
4  3      1
5  3      1

Group up the data:

aggregate(model.matrix(~ region - 1, data=x), x["id"], max)

Result:

  id region1 region2 region3
1  1       0       1       1
2  2       0       1       0
3  3       1       0       0

Create a column of unique identifiers based on another column in data.table

This should be fast, and is at least a bit more straightforward:

myDT[, mrpID:=.GRP, by=Addy]
myDT
    Addy mrpID
1: 12hig     1
2: 12hig     1
3: 12hig     1
4: 198aM     2
5: 1AbHN     3

applying function on data.table having two columns as factors

Here you go:

dt = data.table(user.id = c(2,2,3,1,1,3,2,1), exec.no = c(1,2,2,2,1,2,2,1), job.no = c(1:8))

dt[, list(result = list(list(user.id = user.id,
                             exec.no = exec.no,
                             job.nos = job.no))),
     by = list(user.id, exec.no)][, result]

Fast data.table column split to multiple rows based on delimiter

We can use tstrsplit on the third column to split into multiple columns and assign (:=) the output to column names of interest

data[, paste0("V", 1:3) := tstrsplit(`Peptide IDs`, ";", type.convert = TRUE)]

If we need the 'long' format

library(splitstackshape)
cSplit(data, "Peptide IDs", ";", "long")

Split character column into several binary (0/1) columns

You can try cSplit_e from my "splitstackshape" package:

library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
#          a a_a a_b a_c a_d
# 1:   a,b,c   1   1   1   0
# 2:     a,b   1   1   0   0
# 3: a,b,c,d   1   1   1   1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
#    a_a a_b a_c a_d
# 1:   1   1   1   0
# 2:   1   1   0   0
# 3:   1   1   1   1

There's also mtabulate from "qdapTools":

library(qdapTools)
mtabulate(strsplit(a, ","))
#   a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1

A very direct base R approach is to use table along with stack and strsplit:

table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
#    values
# ind a b c d
#   1 1 1 1 0
#   2 1 1 0 0
#   3 1 1 1 1

How to cut a variable to 20 equal segments (for example) for several columns in a dataset in R

Use the g argument in cut2() to choose the number of quantile groups to cut the variable into.

# library() fails immediately if a package is missing; require() would only
# warn and return FALSE, so the error would surface later and less clearly.
library(data.table)
library(Hmisc)

set.seed(123)
# Five numeric columns of 10e5 (one million) normal draws each, all with
# standard deviation 50 and differing means.
DT <- data.table(x1 = rnorm(10e5, 50, 50),
                 x2 = rnorm(10e5, 30, 50),
                 x3 = rnorm(10e5, 20, 50),
                 x4 = rnorm(10e5, 10, 50),
                 x5 = rnorm(10e5, 10, 50)
)

cut_qt <- DT[,sapply(.SD, function(x) if(is.numeric(x)) cut2(x, g = 4)), ]

print(cut_qt)

head(cut_qt)
x1               x2               x3               x4               x5                
[1,] "[  16.3, 50.0)" "[-199.6, -3.8)" "[ -13.7, 20.0)" "[ -23.8, 10.0)" "[ -23.74,  9.97)"
[2,] "[  16.3, 50.0)" "[  63.6,257.4]" "[  20.0, 53.7)" "[-218.7,-23.8)" "[-222.34,-23.74)"
[3,] "[  83.7,292.5]" "[  -3.8, 29.9)" "[ -13.7, 20.0)" "[  43.7,247.6]" "[ -23.74,  9.97)"
[4,] "[  50.0, 83.7)" "[  63.6,257.4]" "[ -13.7, 20.0)" "[  10.0, 43.7)" "[-222.34,-23.74)"
[5,] "[  50.0, 83.7)" "[  29.9, 63.6)" "[-232.5,-13.7)" "[  10.0, 43.7)" "[-222.34,-23.74)"
[6,] "[  83.7,292.5]" "[  29.9, 63.6)" "[-232.5,-13.7)" "[  43.7,247.6]" "[ -23.74,  9.97)"

This is slow, which matters because the OP is dealing with a large dataset:

> system.time(DT[,lapply(.SD, function(x) if(is.numeric(x)) cut2(x, g = 4)), ])
   user  system elapsed 
  37.66    0.00   38.70

ALTERNATIVE METHOD USING set()

# 1) Calculate Quantiles
q <- DT[,sapply(.SD, function(x) if(is.numeric(x)) quantile(x)), ]
q
x1          x2         x3         x4          x5
0%   -189.95953 -199.574605 -232.54139 -218.74362 -222.343247
25%    16.28067   -3.797748  -13.72424  -23.76578  -23.736187
50%    49.98701   29.938932   20.01473   10.03740    9.967671
75%    83.66663   63.614604   53.74529   43.73047   43.676887
100%  292.53835  257.368361  280.64704  247.64500  277.418083

# 2) Modify the existing DT with the categorical variables using set

cols_to_fix <- names(DT)

# Replace each column of DT, by reference, with its cut2() factor using the
# precomputed quantiles in column j of q as break points.
# seq_along() is used instead of 1:length(), which would iterate over c(1, 0)
# if cols_to_fix were empty.
for (j in seq_along(cols_to_fix)) {
  column <- cols_to_fix[j]
  brk <- q[, j]
  val <- cut2(DT[[column]], cuts = brk)
  # set() overwrites column j in place, avoiding a copy of DT.
  set(DT, i = NULL, j = j, value = val)
}

# Time the set()-based replacement of every column with its cut2() factor.
# NOTE(review): if the untimed loop above has already been run, DT's columns
# are no longer numeric; presumably DT should be recreated before timing --
# confirm.
# seq_along() avoids the c(1, 0) iteration that 1:length() gives on empty input.
system.time(for (j in seq_along(cols_to_fix)) {
  column <- cols_to_fix[j]
  brk <- q[, j]
  val <- cut2(DT[[column]], cuts = brk)
  set(DT, i = NULL, j = j, value = val)
})
user  system elapsed 
4.71    0.00    4.83

How to Programmatically Create Binary Columns Based on a Categorical Variable in Data.Table