How to Programmatically Create Binary Columns Based on a Categorical Variable in Data.Table

How to programmatically create binary columns based on a categorical variable in data.table?

data.table has its own dcast implementation using data.table's internals and should be fast. Give this a try:

dcast(dt, id ~ y, fun.aggregate = function(x) 1L, fill=0L)
# id a b c d e
# 1: 1 0 1 1 1 1
# 2: 2 1 0 1 0 1
# 3: 3 1 0 1 1 1

Just thought of another way to handle this by preallocating and updating by reference (perhaps dcast's logic should be done like this to avoid intermediates).

ans = data.table(id = unique(dt$id))[, unique(dt$y) := 0L][]

All that's left is to fill existing combinations with 1L.

dt[, {set(ans, i=.GRP, j=unique(y), value=1L); NULL}, by=id]
ans
# id b d c e a
# 1: 1 1 1 1 1 0
# 2: 2 0 0 1 1 1
# 3: 3 0 1 1 1 1

Okay, I've gone ahead and benchmarked on the OP's data dimensions, with ~10 million rows and 10 columns.

# Load data.table. library() stops with an error if the package is missing,
# whereas require() only returns FALSE and lets the script fail further down.
library(data.table)
# Fixed seed so the benchmark data is reproducible.
set.seed(45L)
# Ten random 20-letter strings; these are the levels of the categorical variable.
y <- apply(matrix(sample(letters, 10L * 20L, TRUE), ncol = 20L), 1L, paste, collapse = "")
# 1e7 rows: ids sampled from 1..1e5, categories sampled from the ten strings above.
dt <- data.table(id = sample(1e5, 1e7, TRUE), y = sample(y, 1e7, TRUE))

system.time(ans1 <- AnsFunction()) # 2.3s
system.time(ans2 <- dcastFunction()) # 2.2s
system.time(ans3 <- TableFunction()) # 6.2s

setcolorder(ans1, names(ans2))
setcolorder(ans3, names(ans2))
setorder(ans1, id)
setkey(ans2, NULL)
setorder(ans3, id)

identical(ans1, ans2) # TRUE
identical(ans1, ans3) # TRUE

where,

AnsFunction <- function() {
  # Preallocate one row per distinct id and one all-zero integer column per
  # distinct category; reads the global `dt`.
  ids <- unique(dt$id)
  categories <- unique(dt$y)
  wide <- data.table(id = ids)[, (categories) := 0L][]

  # Visit each id group once and switch its observed categories to 1L by
  # reference. .GRP is the group counter, used here as the row index of `wide`.
  dt[, {
    set(wide, i = .GRP, j = unique(y), value = 1L)
    NULL
  }, by = id]

  # Columns come back in order of first appearance; the caller reorders them.
  wide
}

dcastFunction <- function() {
  # data.table ships its own dcast, so neither reshape2 nor setDT() is needed.
  # Any (id, y) combination that occurs aggregates to 1L; combinations that
  # never occur are filled with 0L. Reads the global `dt`.
  wide <- dcast(
    dt,
    id ~ y,
    value.var = "y",
    fun.aggregate = function(x) 1L,
    fill = 0L
  )
  # Returned invisibly, as a trailing assignment would be.
  invisible(wide)
}

TableFunction <- function() {
  # Cross-tabulate id against y (reads the global `dt`) and turn the
  # contingency table into a data.frame with one column per category.
  counts <- as.data.frame.matrix(table(dt$id, dt$y))

  # Collapse counts to presence/absence. Integer literals keep the columns
  # integer so the result can be identical() to the other implementations.
  counts[counts > 1L] <- 1L

  # The ids survive only as row names; restore them as an integer first column.
  ids <- as.integer(row.names(counts))
  wide <- cbind(id = ids, counts)

  # Convert to data.table by reference and return it.
  setDT(wide)
}

Create a binary variable based on the first appearance of another (date) variable

We can replace the ifelse and also the base R methods. Create the 'first' column as 0, then join with 'dt2' on the columns in the post and set 'first' to 1 for the matching rows. Finally, negate 'first' with (!), convert the result to integer with unary (+) or as.integer, and assign it to 'rev'.

# Start with first = 0 on every row, then set first = 1 on the rows of dt that
# match a row of dt2 on id, year, analyst and fdate (update-on-join, by reference).
dt[, first := 0][dt2, first := 1, on = .(id, year, analyst, fdate)]
# rev is the logical negation of first, turned back into an integer by unary +;
# the trailing [] makes the updated table print.
dt[, rev := +(!first)][]
# analyst id year fdate first rev
# 1: 1 1 2009-12-31 2009-07-31 1 0
# 2: 2 1 2009-12-31 2009-10-08 0 1
# 3: 1 1 2010-12-31 2010-05-03 1 0
# 4: 2 1 2010-12-31 2010-10-14 0 1
# 5: 1 2 2009-12-31 2009-02-17 1 0
# 6: 2 2 2009-12-31 2009-02-26 0 1
# 7: 1 2 2010-12-31 2010-07-31 0 1
# 8: 2 2 2010-12-31 2010-04-13 1 0
# 9: 1 3 2009-12-31 2009-10-30 0 1
#10: 2 3 2009-12-31 2009-09-14 1 0
#11: 1 3 2010-12-31 2010-01-31 1 0
#12: 2 3 2010-12-31 2010-11-30 0 1
#13: 1 4 2009-12-31 2009-01-31 1 0
#14: 2 4 2009-12-31 2009-11-02 0 1
#15: 1 4 2010-12-31 2010-08-02 0 1
#16: 2 4 2010-12-31 2010-05-15 1 0
#17: 1 5 2009-12-31 2009-06-30 0 1
#18: 2 5 2009-12-31 2009-06-26 1 0
#19: 1 5 2010-12-31 2010-03-28 1 0
#20: 2 5 2010-12-31 2010-10-03 0 1

How to use cast or another function to create a binary table in R

Original data:

x <- data.frame(id=c(1,1,2,3,3), region=factor(c(2,3,2,1,1)))

> x
id region
1 1 2
2 1 3
3 2 2
4 3 1
5 3 1

Group up the data:

aggregate(model.matrix(~ region - 1, data=x), x["id"], max)

Result:

  id region1 region2 region3
1 1 0 1 1
2 2 0 1 0
3 3 1 0 0

Create a column of unique identifiers based on another column in data.table

This should be fast, and is at least a bit more straightforward:

myDT[, mrpID:=.GRP, by=Addy]
myDT
Addy mrpID
1: 12hig 1
2: 12hig 1
3: 12hig 1
4: 198aM 2
5: 1AbHN 3

applying function on data.table having two columns as factors

Here you go:

# Example data: eight jobs, each tagged with a user id and an execution number.
dt <- data.table(
  user.id = c(2, 2, 3, 1, 1, 3, 2, 1),
  exec.no = c(1, 2, 2, 2, 1, 2, 2, 1),
  job.no = 1:8
)

# For every (user.id, exec.no) group build one nested list holding the group
# keys and the job numbers that belong to it, then extract that list column.
dt[
  ,
  .(result = list(list(user.id = user.id, exec.no = exec.no, job.nos = job.no))),
  by = .(user.id, exec.no)
][, result]

Fast data.table column split to multiple rows based on delimiter

We can use tstrsplit on the third column to split into multiple columns and assign (:=) the output to column names of interest

data[, paste0("V", 1:3) := tstrsplit(`Peptide IDs`, ";", type.convert = TRUE)] 

If we need the 'long' format

library(splitstackshape)
cSplit(data, "Peptide IDs", ";", "long")

Split character column into several binary (0/1) columns

You can try cSplit_e from my "splitstackshape" package:

library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1

There's also mtabulate from "qdapTools":

library(qdapTools)
mtabulate(strsplit(a, ","))
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1

A very direct base R approach is to use table along with stack and strsplit:

table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1

How to cut a variable to 20 equal segments (for example) for several columns in a dataset in R

Use the g argument in cut2() to choose the number of quantile groups the variable will be cut into.

# Load the packages used below. library() fails immediately if a package is
# missing, whereas require() only returns FALSE and lets later calls fail.
library(data.table)
library(Hmisc)

# Fixed seed so the simulated data is reproducible.
set.seed(123)
# One million rows and five normally distributed columns (sd 50, differing means).
DT <- data.table(
  x1 = rnorm(1e6, 50, 50),
  x2 = rnorm(1e6, 30, 50),
  x3 = rnorm(1e6, 20, 50),
  x4 = rnorm(1e6, 10, 50),
  x5 = rnorm(1e6, 10, 50)
)

# Bin every numeric column with Hmisc::cut2(); g = 4 requests four quantile groups.
# sapply() simplifies the per-column results into a single matrix, which is why
# the output below prints as a character matrix rather than a data.table.
cut_qt <- DT[,sapply(.SD, function(x) if(is.numeric(x)) cut2(x, g = 4)), ]

print(cut_qt)

head(cut_qt)
x1 x2 x3 x4 x5
[1,] "[ 16.3, 50.0)" "[-199.6, -3.8)" "[ -13.7, 20.0)" "[ -23.8, 10.0)" "[ -23.74, 9.97)"
[2,] "[ 16.3, 50.0)" "[ 63.6,257.4]" "[ 20.0, 53.7)" "[-218.7,-23.8)" "[-222.34,-23.74)"
[3,] "[ 83.7,292.5]" "[ -3.8, 29.9)" "[ -13.7, 20.0)" "[ 43.7,247.6]" "[ -23.74, 9.97)"
[4,] "[ 50.0, 83.7)" "[ 63.6,257.4]" "[ -13.7, 20.0)" "[ 10.0, 43.7)" "[-222.34,-23.74)"
[5,] "[ 50.0, 83.7)" "[ 29.9, 63.6)" "[-232.5,-13.7)" "[ 10.0, 43.7)" "[-222.34,-23.74)"
[6,] "[ 83.7,292.5]" "[ 29.9, 63.6)" "[-232.5,-13.7)" "[ 43.7,247.6]" "[ -23.74, 9.97)"

However, this is slow, which matters because the OP is dealing with a large dataset:

> system.time(DT[,lapply(.SD, function(x) if(is.numeric(x)) cut2(x, g = 4)), ])
user system elapsed
37.66 0.00 38.70

ALTERNATIVE METHOD USING set()

# 1) Calculate Quantiles
q <- DT[,sapply(.SD, function(x) if(is.numeric(x)) quantile(x)), ]
q
x1 x2 x3 x4 x5
0% -189.95953 -199.574605 -232.54139 -218.74362 -222.343247
25% 16.28067 -3.797748 -13.72424 -23.76578 -23.736187
50% 49.98701 29.938932 20.01473 10.03740 9.967671
75% 83.66663 63.614604 53.74529 43.73047 43.676887
100% 292.53835 257.368361 280.64704 247.64500 277.418083

# 2) Modify the existing DT with the categorical variables using set

cols_to_fix <- names(DT)

# Overwrite each column of DT, in place, with its binned version.
# seq_along() replaces 1:length(): with an empty cols_to_fix the loop now runs
# zero times instead of iterating over c(1, 0).
for (j in seq_along(cols_to_fix)) {
  column <- cols_to_fix[j]
  # Column j of the quantile matrix q supplies the cut points for this column.
  brk <- q[, j]
  val <- cut2(DT[[column]], cuts = brk)
  # set() assigns by reference; i = NULL targets every row of column j.
  set(DT, i = NULL, j = j, value = val)
}

# Time the set()-based binning loop.
# NOTE(review): if the untimed loop above has already been run, DT's columns
# have been overwritten with cut2() output; rebuild DT first so that the timing
# measures the same work on the original numeric columns.
# seq_along() replaces 1:length() so an empty cols_to_fix gives zero iterations.
system.time(for (j in seq_along(cols_to_fix)) {
  column <- cols_to_fix[j]
  # Column j of the quantile matrix q supplies the cut points for this column.
  brk <- q[, j]
  val <- cut2(DT[[column]], cuts = brk)
  # set() assigns by reference; i = NULL targets every row of column j.
  set(DT, i = NULL, j = j, value = val)
})
user system elapsed
4.71 0.00 4.83


Related Topics



Leave a reply



Submit