Split String Column to Create New Binary Columns

Split string column to create new binary columns

Using mtabulate from the qdapTools package that I maintain:

library(qdapTools)
# Split the first column of `dat` on "/" and tabulate the pieces of each row
# into one column per distinct piece.
dat[[1]] |>
  as.character() |>
  strsplit(split = "/") |>
  mtabulate()

## V1 ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 sent_1fe.gr
## 1 1 1 0 0 1 1 1 0 0 1 1 0
## 2 1 1 0 0 1 1 1 1 1 1 1 0
## 3 1 1 0 0 1 1 0 1 1 1 0 1
## 4 1 1 0 1 1 1 1 0 0 1 1 0
## 5 1 1 1 0 1 1 1 0 0 1 1 0

Transform the cell values of a column into new binary / dummy variables

Using data.table:

library(data.table)

# Example input in long form: one row per (ID, word) observation.
ID <- c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L)
word <- c(
  "blue", "red", "green", "yellow",
  "blue", "purple", "orange", "green"
)

DT <- data.table(ID, word)

DT

# Cast to wide form with one 0/1 indicator column per distinct word.
# - value.var is spelled out so dcast() does not have to guess it.
# - fun.aggregate (full argument name, no partial matching) collapses every
#   ID/word cell to a single 1L, so a repeated (ID, word) pair still yields
#   one value instead of a length > 1 result.
# - fill = 0L marks the ID/word combinations that never occur.
dcast(
  DT, ID ~ word,
  value.var = "word",
  fun.aggregate = \(x) as.integer(length(x) > 0L),
  fill = 0L
)

Output

   ID blue green orange purple red yellow
1: 1 1 1 0 0 1 1
2: 2 1 1 1 1 0 0

Split dataframe array column into multiple binary columns [R]

I think the question is rather how to convert a list of vectors into a binary matrix/data.frame.

Here is a solution

library(plyr)

# Example input: `array_column` holds "-"-separated items per row.
testdf <- data.frame(
  a = c(1, 2, 3, 4, 5),
  array_column = c("a-b-c", "b-a", "c-d", "d-e-e", "e-a"),
  stringsAsFactors = FALSE
)
# Replace each delimited string by the character vector of its items.
testdf$array_column <- strsplit(testdf$array_column, "-")

# Creates a list of one-row data.frames with a 1 for each value observed.
# Duplicated items within a row (e.g. "d-e-e") are counted once.
binary <- lapply(testdf$array_column, function(x) {
  vals <- unique(x)
  flags <- setNames(rep(1, length(vals)), vals)
  # check.names = FALSE keeps the item labels verbatim as column names;
  # otherwise labels such as "good food" or "1" would be rewritten.
  do.call(data.frame, c(as.list(flags), list(check.names = FALSE)))
})

# Joins into single data.frame; rbind.fill() pads columns a row lacks with NA,
# which are then turned into 0.
result <- do.call(rbind.fill, binary)
result[is.na(result)] <- 0

result
# a b c d e
# 1 1 1 1 0 0
# 2 1 1 0 0 0
# 3 0 0 1 1 0
# 4 0 0 0 1 1
# 5 1 0 0 0 1

separate and create dummy variable columns

You can first separate_rows and then pivot_wider:

library(dplyr)
library(tidyr)
df %>%
  # create row ID:
  mutate(row = row_number()) %>%
  # separate rows on "/" together with any surrounding whitespace, so that
  # "pink" and " pink" do not end up as two different colors:
  separate_rows(colors, sep = "\\s*/\\s*") %>%
  # pivot dataframe wider (1 where the color occurs, 0 elsewhere):
  pivot_wider(
    names_from = colors, values_from = colors,
    values_fn = function(x) 1, values_fill = 0
  ) %>%
  # deselect obsolete column:
  select(-row)
# A tibble: 3 x 5
#    item  blue  pink   red white
#   <int> <dbl> <dbl> <dbl> <dbl>
# 1     1     1     0     0     0
# 2     2     1     1     1     0
# 3     3     0     1     0     1

Split a pandas DataFrame column into OneHot/Binary columns

Here's a complete solution, parsing of the messy output and all:

from ast import literal_eval
from io import StringIO

import numpy as np
import pandas as pd

# the raw data: pipe-delimited text whose "activities" field is a list literal
d = '''datetime | mood | activities | notes

8/27/2017 | "good" | ["friends", "party", "gaming"] | NaN

8/28/2017 | "meh" | ["work", "friends", "good food"] | "Stuff stuff"

8/29/2017 | "bad" | ["work", "travel"] | "Fell off my bike"'''

# parse the raw data
# io.StringIO replaces pd.compat.StringIO, which newer pandas no longer has.
# The separator is a raw string so the regex escapes reach the parser as-is;
# a regex separator requires the python engine.
df = pd.read_csv(StringIO(d), sep=r'\s*\|\s*', engine='python')

# parse the lists of activities (which are still strings)
acts = df['activities'].apply(literal_eval)

# get the unique activities (np.unique also sorts them)
actcols = np.unique([a for al in acts for a in al])

# assemble the desired one hot array from the activities:
# one boolean row per record, one column per unique activity
# (np.isin is the maintained replacement for the deprecated np.in1d)
actarr = np.array([np.isin(actcols, al) for al in acts])
actdf = pd.DataFrame(actarr, columns=actcols)

# stick the dataframe with the one hot array onto the main dataframe
df = pd.concat([df.drop(columns='activities'), actdf], axis=1)

# fancy print: widen the display limits only for this one print call
with pd.option_context("display.max_columns", 20, 'display.width', 9999):
    print(df)

Output:

    datetime    mood               notes  friends  gaming  good food  party  travel   work
0 8/27/2017 "good" NaN True True False True False False
1 8/28/2017 "meh" "Stuff stuff" True False True False False True
2 8/29/2017 "bad" "Fell off my bike" False False False False True True

Split character column into several binary (0/1) columns

You can try cSplit_e from my "splitstackshape" package:

library(splitstackshape)
# Example input: each element is a comma-separated list of items.
a <- c("a,b,c", "a,b", "a,b,c,d")
# Expand column "a" on "," into one indicator column per item. As the printed
# result shows, an item that is present gets 1 and an absent one gets the
# `fill` value (0); the new columns are named <column>_<item>.
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
# Same call with drop = TRUE: the original column "a" is left out of the
# result, as the output below shows.
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1

There's also mtabulate from "qdapTools":

library(qdapTools)
# Split each element of `a` on "," and tabulate the pieces per element.
a |>
  strsplit(split = ",") |>
  mtabulate()
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1

A very direct base R approach is to use table along with stack and strsplit:

# Split on a literal ",", name each piece vector by its position in `a`,
# stack into a two-column (values, ind) data.frame, swap the column order and
# cross-tabulate position against value.
a |>
  strsplit(split = ",", fixed = TRUE) |>
  setNames(seq_along(a)) |>
  stack() |>
  rev() |>
  table()
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1

Encode string column as several dummy columns

An option would be to split the string column by / and use mtabulate

library(qdapTools)
# Tabulate the "/"-separated pieces of `names`, then append the `label` column.
df1$names |>
  strsplit(split = "/") |>
  mtabulate() |>
  cbind(df1["label"])
# A B C D label
#1 1 1 0 0 V
#2 1 0 0 0 W
#3 1 0 1 1 X
#4 0 1 1 0 Y
#5 0 1 0 1 Z

Or in base R

# Name each vector of "/"-separated pieces by its label, stack into a
# (values, ind) data.frame, put `ind` first and cross-tabulate label by piece.
df1$names |>
  strsplit(split = "/") |>
  setNames(df1$label) |>
  stack() |>
  rev() |>
  table()

NO packages used

data

# Sample input for the snippets above: `names` holds "/"-separated codes and
# `label` holds one label per row.
df1 <- structure(list(names = c("A/B", "A", "A/C/D", "B/C", "B/D"), 
label = c("V", "W", "X", "Y", "Z")), class = "data.frame",
row.names = c("1", "2", "3", "4", "5"))

Split a column into multiple binary dummy columns

We can use mtabulate from qdapTools after splitting the 'features' column with strsplit().

library(qdapTools)
# Split 'features' on ":" (as character, in case the column is a factor),
# tabulate the pieces per row and bind them next to the first column.
feature_pieces <- strsplit(as.character(sampledf$features), ":")
cbind(sampledf[1], mtabulate(feature_pieces))
# vin f1 f2 f3 f4 f5
#1 v1 1 1 1 0 0
#2 v2 0 1 0 1 1
#3 v3 1 0 0 1 1

Or we can use cSplit_e from library(splitstackshape)

library(splitstackshape)
# Expand 'features' on ":" into indicator columns (0 where a feature is
# absent) and drop the original column.
df1 <- cSplit_e(
  sampledf, "features", ":",
  type = "character", fill = 0, drop = TRUE
)
# Strip only the "features_" prefix that cSplit_e() puts on the new columns.
# A greedy '.*_' would also cut into any other column name, or any feature
# value, that contains an underscore.
names(df1) <- sub("^features_", "", names(df1))

Or, using base R methods: split as before, name the list elements returned by strsplit with the 'vin' column, convert the list to a two-column (key/value) data.frame using stack, get the table, transpose it, and cbind it with the first column of 'sampledf'.

# Name each vector of ":"-separated features by its 'vin', stack to long form,
# cross-tabulate (feature x vin) and transpose so that rows follow 'vin'.
feature_tab <- t(table(stack(setNames(
  strsplit(as.character(sampledf$features), ":"),
  sampledf$vin
))))
# t() keeps the "table" class, and binding a table to a data.frame expands it
# to long (one row per cell) form. Converting it as a matrix keeps one 0/1
# column per feature.
cbind(sampledf[1], as.data.frame.matrix(feature_tab))

Split columns at delimiter, fill new cols with values from split

We can do an strsplit on the 'functionality' column by space and then with mtabulate get the frequency

library(qdapTools)
# Split 'functionality' on runs of spaces and tabulate the words per row.
df1$functionality |>
  strsplit(split = " +") |>
  mtabulate()

If the column functionality is factor class (based on wrapping of as.data.frame at the end - default gives stringsAsFactors = TRUE), then change it to character class before splitting

# Same as above, but coerce to character first so a factor column also works.
df1$functionality |>
  as.character() |>
  strsplit(split = " +") |>
  mtabulate()
# audits investigations monitoring patterns reviews
#1 0 1 1 1 0
#2 1 0 0 1 0
#3 1 0 1 1 1
#4 1 1 1 1 1

Or as the OP was using tidyverse, we can get the expected output with separate_rows/spread

library(tidyverse)
df1 %>%
  # Integer row id: character ids from rownames_to_column() sort as
  # "1", "10", "2", ... and would scramble the row order from 10 rows up.
  mutate(rn = row_number()) %>%
  # one row per word
  separate_rows(functionality) %>%
  # frequency of each word within each original row
  count(rn, functionality) %>%
  # pivot_wider() is the current replacement for the superseded spread();
  # names_sort = TRUE keeps spread()'s alphabetical column order.
  pivot_wider(
    names_from = functionality, values_from = n,
    values_fill = 0, names_sort = TRUE
  ) %>%
  select(-rn)
# A tibble: 4 x 5
# audits investigations monitoring patterns reviews
# <int> <int> <int> <int> <int>
#1 0 1 1 1 0
#2 1 0 0 1 0
#3 1 0 1 1 1
#4 1 1 1 1 1

Or a base R option would be to convert the list of vectors into a two column data.frame with stack and get the table

# Name each vector of words by its row name, stack into a (values, ind)
# data.frame, put `ind` first and cross-tabulate row by word.
df1$functionality |>
  as.character() |>
  strsplit(split = " +") |>
  setNames(row.names(df1)) |>
  stack() |>
  rev() |>
  table()
# values
#ind audits investigations monitoring patterns reviews
# 1 0 1 1 1 0
# 2 1 0 0 1 0
# 3 1 0 1 1 1
# 4 1 1 1 1 1

data

# Sample input for the snippets above: a single character column
# `functionality` holding space-separated words, four rows.
df1 <- structure(list(functionality = c("monitoring investigations patterns", 
"audits patterns", "reviews audits monitoring patterns",
"reviews audits monitoring investigations patterns"
)), class = "data.frame", row.names = c(NA, -4L))


Related Topics



Leave a reply



Submit