Split a String Column into Several Dummy Variables

Split a string column into several dummy variables

UPDATE : VERSION 3

Found even faster way. This function is also highly memory efficient.
Primary reason previous function was slow because of copy/assignments happening inside lapply loop as well as rbinding of the result.

In following version, we preallocate matrix with appropriate size, and then change values at appropriate coordinates, which makes it very fast compared to other looping versions.

funcGT3 <- function() {
#Get list of column names in result
resCol <- unique(dt[, unlist(strsplit(messy_string, split="\\$"))])

#Get dimension of result
nresCol <- length(resCol)
nresRow <- nrow(dt)

#Create empty matrix with dimensions same as desired result
mat <- matrix(rep(0, nresRow * nresCol), nrow = nresRow, dimnames = list(as.character(1:nresRow), resCol))

#split each messy_string by $
ll <- strsplit(dt[,messy_string], split="\\$")

#Get coordinates of mat which we need to set to 1
coords <- do.call(rbind, lapply(1:length(ll), function(i) cbind(rep(i, length(ll[[i]])), ll[[i]] )))

#Set mat to 1 at appropriate coordinates
mat[coords] <- 1

#Bind the mat to original data.table
return(cbind(dt, mat))

}

result <- funcGT3() #result for 1000 rows in dt
result
ID messy_string zn tc sv db yx st ze qs wq oe cv ut is kh kk im le qg rq po wd kc un ft ye if zl zt wy et rg iu
1: 1 zn$tc$sv$db$yx 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
2: 2 st$ze$qs$wq 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3: 3 oe$cv$ut$is 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4: 4 kh$kk$im$le$qg 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5: 5 rq$po$wd$kc 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 0 0 0 0
---
996: 996 rp$cr$tb$sa 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
997: 997 cz$wy$rj$he 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
998: 998 cl$rr$bm 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
999: 999 sx$hq$zy$zd 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1000: 1000 bw$cw$pw$rq 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0

Benchmark againt version 2 suggested by Ricardo (this is for 250K rows in data) :

Unit: seconds
expr min lq median uq max neval
GT2 104.68672 104.68672 104.68672 104.68672 104.68672 1
GT3 15.15321 15.15321 15.15321 15.15321 15.15321 1

VERSION 1
Following is version 1 of suggested answer

set.seed(10)  
elements_list <- c(outer(letters, letters, FUN = paste, sep = ""))
random_string <- function(min_length, max_length, separator) {
selection <- paste(sample(elements_list, ceiling(runif(1, min_length, max_length))), collapse = separator)
return(selection)
}
dt <- data.table(ID = c(1:1000), messy_string = "")
dt[ , messy_string := random_string(2, 5, "$"), by = ID]

myFunc <- function() {
ll <- strsplit(dt[,messy_string], split="\\$")

COLS <- do.call(rbind,
lapply(1:length(ll),
function(i) {
data.frame(
ID= rep(i, length(ll[[i]])),
COL = ll[[i]],
VAL= rep(1, length(ll[[i]]))
)
}
)
)

res <- as.data.table(tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length ))
dt <- cbind(dt, res)
for (j in names(dt))
set(dt,which(is.na(dt[[j]])),j,0)
return(dt)
}

create_indicators <- function(search_list, searched_string) {
y <- rep(0, length(search_list))
for(j in 1:length(search_list)) {
x <- regexpr(search_list[j], searched_string)
x <- x[1]
y[j] <- ifelse(x > 0, 1, 0)
}
return(y)
}
OPFunc <- function() {
indicators <- matrix(0, nrow = nrow(dt), ncol = length(elements_list))
for(n in 1:nrow(dt)) {
indicators[n, ] <- dt[n, create_indicators(elements_list, messy_string)]
}
indicators <- data.table(indicators)
setnames(indicators, elements_list)
dt <- cbind(dt, indicators)
return(dt)
}

library(plyr)
plyrFunc <- function() {
indicators = do.call(rbind.fill, sapply(1:dim(dt)[1], function(i)
dt[i,
data.frame(t(as.matrix(table(strsplit(messy_string,
split = "\\$")))))
]))
dt = cbind(dt, indicators)
#dt[is.na(dt)] = 0 #THIS DOESN'T WORK. USING FOLLOWING INSTEAD

for (j in names(dt))
set(dt,which(is.na(dt[[j]])),j,0)

return(dt)
}

BENCHMARK

system.time(res <- myFunc())
## user system elapsed
## 1.01 0.00 1.01

system.time(res2 <- OPFunc())
## user system elapsed
## 21.58 0.00 21.61

system.time(res3 <- plyrFunc())
## user system elapsed
## 1.81 0.00 1.81

VERSION 2 : Suggested by Ricardo

I'm posting this here instead of in my answer as the framework is really @GeekTrader's -Rick_

  myFunc.modified <- function() {
ll <- strsplit(dt[,messy_string], split="\\$")

## MODIFICATIONS:
# using `rbindlist` instead of `do.call(rbind.. )`
COLS <- rbindlist( lapply(1:length(ll),
function(i) {
data.frame(
ID= rep(i, length(ll[[i]])),
COL = ll[[i]],
VAL= rep(1, length(ll[[i]])),
# MODICIATION: Not coercing to factors
stringsAsFactors = FALSE
)
}
)
)

# MODIFICATION: Preserve as matrix, the output of tapply
res2 <- tapply(COLS$VAL, list(COLS$ID, COLS$COL), FUN = length )

# FLATTEN into a data.table
resdt <- data.table(r=c(res2))

# FIND & REPLACE NA's of single column
resdt[is.na(r), r:=0L]

# cbind with dt, a matrix, with the same attributes as `res2`
cbind(dt,
matrix(resdt[[1]], ncol=ncol(res2), byrow=FALSE, dimnames=dimnames(res2)))
}

### Benchmarks:

orig = quote({dt <- copy(masterDT); myFunc()})
modified = quote({dt <- copy(masterDT); myFunc.modified()})
microbenchmark(Modified = eval(modified), Orig = eval(orig), times=20L)

# Unit: milliseconds
# expr min lq median uq max
# 1 Modified 895.025 971.0117 1011.216 1189.599 2476.972
# 2 Orig 1953.638 2009.1838 2106.412 2230.326 2356.802

Encode string column as several dummy columns

An option would be to split the string column by / and use mtabulate

library(qdapTools)
cbind(mtabulate(strsplit(df1$names, "/")), df1['label'])
# A B C D label
#1 1 1 0 0 V
#2 1 0 0 0 W
#3 1 0 1 1 X
#4 0 1 1 0 Y
#5 0 1 0 1 Z

Or in base R

table(stack(setNames(strsplit(df1$names, "/"), df1$label))[2:1])

NO packages used

data

df1 <- structure(list(names = c("A/B", "A", "A/C/D", "B/C", "B/D"), 
label = c("V", "W", "X", "Y", "Z")), class = "data.frame",
row.names = c("1", "2", "3", "4", "5"))

Split a column into multiple binary dummy columns

We can use mtabulate from qdapTools after splitting (strsplit(..) the 'features' column.

library(qdapTools)
cbind(sampledf[1],mtabulate(strsplit(as.character(sampledf$features), ':')))
# vin f1 f2 f3 f4 f5
#1 v1 1 1 1 0 0
#2 v2 0 1 0 1 1
#3 v3 1 0 0 1 1

Or we can use cSplit_e from library(splitstackshape)

library(splitstackshape)
df1 <- cSplit_e(sampledf, 'features', ':', type= 'character', fill=0, drop=TRUE)
names(df1) <- sub('.*_', '', names(df1))

Or using base R methods, we split as before, set the names of the list elements from the strsplit with 'vin' column, convert to a key/value columns 'data.frame' using stack, get the table, transpose and cbind with the first column of 'sampledf'.

cbind(sampledf[1],  
t(table(stack(setNames(strsplit(as.character(sampledf$features), ':'),
sampledf$vin)))))

Splitting a String with Delimiters into Dummy Variables

The trick is to use tidyr::separate_rows() to move your data to a longer format.

Once all your answers are extracted, it is easy to pivot it back to a wide format with tidyr::pivot_wider()

library(tidyverse)
d <- tibble::tribble(
~Person, ~Answer,
"Matt", "A;B;C;",
"Sandy", "B;D;",
"Charles", "A;C;D;"
)

d |>
tidyr::separate_rows(Answer, sep = ";") |>
filter(Answer != "") |>
mutate(value = 1) |>
pivot_wider(
names_from = Answer,
values_from = value,
values_fill = 0
)
#> # A tibble: 3 x 5
#> Person A B C D
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Matt 1 1 1 0
#> 2 Sandy 0 1 0 1
#> 3 Charles 1 0 1 1

Created on 2022-06-15 by the reprex package (v2.0.1)

separate and create dummy variable columns

You can first separate_rows and then pivot_wider:

library(dplyr)
library(tidyr)
df %>%
# create row ID:
mutate(row = row_number()) %>%
# separate rows on " /":
separate_rows(colors, sep = ' /') %>%
# pivot dataframe wider:
pivot_wider(names_from = colors, values_from = colors,
values_fn = function(x) 1, values_fill = 0) %>%
# deselect obsolete column:
select(-row)
# A tibble: 3 x 6
item blue ` pink` ` red` pink ` white`
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 0 0
2 2 1 1 1 0 0
3 3 0 0 0 1 1

Split character column into several binary (0/1) columns

You can try cSplit_e from my "splitstackshape" package:

library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1

There's also mtabulate from "qdapTools":

library(qdapTools)
mtabulate(strsplit(a, ","))
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1

A very direct base R approach is to use table along with stack and strsplit:

table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1

Create several dummy variables from one string variable

Since your concatenated data are concatenated character strings (not concatenated numerical values) you'll need to add type = "character" to get the function to work as you expect it.

The function's default setting is for numeric values, hence the error about NaN and so on.

The naming has been made more consistent with the short forms of the other functions in the same family. Thus, it is now cSplit_e (though the old function name would still work).

library(splitstackshape)
cSplit_e(profs, "teaches", ",", type = "character", fill = 0)
# teaches teaches_1st teaches_2nd teaches_3rd
# 1 1st 1 0 0
# 2 1st, 2nd 1 1 0
# 3 2nd, 3rd 0 1 1
# 4 1st, 2nd, 3rd 1 1 1

The help page for ?concat.split.expanded is the same as that of cSplit_e. If you have any tips on making it clearer to understand, please raise an issue at the package's GitHub page.

Converting pandas column of comma-separated strings into dummy variables

Use str.get_dummies

df['col'].str.get_dummies(sep=',')

a b c d
0 1 0 0 0
1 1 1 1 0
2 1 1 0 1
3 0 0 0 1
4 0 0 1 1

Edit: Updating the answer to address some questions.

Qn 1: Why is it that the series method get_dummies does not accept the argument prefix=... while pandas.get_dummies() does accept it

Series.str.get_dummies is a series level method (as the name suggests!). We are one hot encoding values in one Series (or a DataFrame column) and hence there is no need to use prefix. Pandas.get_dummies on the other hand can one hot encode multiple columns. In which case, the prefix parameter works as an identifier of the original column.

If you want to apply prefix to str.get_dummies, you can always use DataFrame.add_prefix

df['col'].str.get_dummies(sep=',').add_prefix('col_')

Qn 2: If you have more than one column to begin with, how do you merge the dummies back into the original frame?
You can use DataFrame.concat to merge one hot encoded columns with the rest of the columns in dataframe.

df = pd.DataFrame({'other':['x','y','x','x','q'],'col':['a','a,b,c','a,b,d','d','c,d']})
df = pd.concat([df, df['col'].str.get_dummies(sep=',')], axis = 1).drop('col', 1)

other a b c d
0 x 1 0 0 0
1 y 1 1 1 0
2 x 1 1 0 1
3 x 0 0 0 1
4 q 0 0 1 1


Related Topics



Leave a reply



Submit