Split string column to create new binary columns
Using mtabuate
from the qdapTools package that I maintain:
library(qdapTools)
mtabulate(strsplit(as.character(dat[[1]]), "/"))
## V1 ca cbr_LBL cni_at.p3x.4 eq2_off eq2_on fe.gr hi.on hi.ov put sent_1 sent_1fe.gr
## 1 1 1 0 0 1 1 1 0 0 1 1 0
## 2 1 1 0 0 1 1 1 1 1 1 1 0
## 3 1 1 0 0 1 1 0 1 1 1 0 1
## 4 1 1 0 1 1 1 1 0 0 1 1 0
## 5 1 1 1 0 1 1 1 0 0 1 1 0
Transform the cell values of a column into new binary / dummy variables
Using data.table
:
ID = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L)
word = c('blue', 'red', 'green', 'yellow', 'blue', 'purple', 'orange', 'green')
library(data.table)
DT = data.table(ID, word)
DT
dcast(DT, ID ~ word, fill = 0L, fun.agg = \(x) ifelse(x == 0, 0, 1))
Output
ID blue green orange purple red yellow
1: 1 1 1 0 0 1 1
2: 2 1 1 1 1 0 0
Split dataframe array column into multiple binary columns [R]
I think the question is rather how convert a list of vectors into a binary matrix/data.frame
Here is a solution
testdf = data.frame('a'=c(1,2,3,4,5),'array_column'=c('a-b-c','b-a','c-d','d-e-e','e-a'),stringsAsFactors = F)
testdf$array_column = strsplit(testdf$array_column,'-')
library('plyr')
# Creates a list of data.frames with 1s for each value observed
binary <- lapply(testdf$array_column, function(x) {
vals <- unique(x)
x <- setNames(rep(1,length(vals)), vals);
do.call(data.frame, as.list(x))
})
# Joins into single data.frame
result <- do.call(rbind.fill, binary)
result[is.na(result)] <- 0
result
# a b c d e
# 1 1 1 1 0 0
# 2 1 1 0 0 0
# 3 0 0 1 1 0
# 4 0 0 0 1 1
# 5 1 0 0 0 1
separate and create dummy variable columns
You can first separate_rows
and then pivot_wider
:
library(dplyr)
library(tidyr)
df %>%
# create row ID:
mutate(row = row_number()) %>%
# separate rows on " /":
separate_rows(colors, sep = ' /') %>%
# pivot dataframe wider:
pivot_wider(names_from = colors, values_from = colors,
values_fn = function(x) 1, values_fill = 0) %>%
# deselect obsolete column:
select(-row)
# A tibble: 3 x 6
item blue ` pink` ` red` pink ` white`
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 1 0 0 0 0
2 2 1 1 1 0 0
3 3 0 0 0 1 1
Split a pandas DataFrame column into OneHot/Binary columns
Here's a complete solution, parsing of the messy output and all:
from ast import literal_eval
import numpy as np
import pandas as pd
# the raw data
d = '''datetime | mood | activities | notes
8/27/2017 | "good" | ["friends", "party", "gaming"] | NaN
8/28/2017 | "meh" | ["work", "friends", "good food"] | "Stuff stuff"
8/29/2017 | "bad" | ["work", "travel"] | "Fell off my bike"'''
# parse the raw data
df = pd.read_csv(pd.compat.StringIO(d), sep='\s*\|\s*', engine='python')
# parse the lists of activities (which are still strings)
acts = df['activities'].apply(literal_eval)
# get the unique activities
actcols = np.unique([a for al in acts for a in al])
# assemble the desired one hot array from the activities
actarr = np.array([np.in1d(actcols, al) for al in acts])
actdf = pd.DataFrame(actarr, columns=actcols)
# stick the dataframe with the one hot array onto the main dataframe
df = pd.concat([df.drop(columns='activities'), actdf], axis=1)
# fancy print
with pd.option_context("display.max_columns", 20, 'display.width', 9999):
print(df)
Output:
datetime mood notes friends gaming good food party travel work
0 8/27/2017 "good" NaN True True False True False False
1 8/28/2017 "meh" "Stuff stuff" True False True False False True
2 8/29/2017 "bad" "Fell off my bike" False False False False True True
Split character column into several binary (0/1) columns
You can try cSplit_e
from my "splitstackshape" package:
library(splitstackshape)
a <- c("a,b,c", "a,b", "a,b,c,d")
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0)
# a a_a a_b a_c a_d
# 1: a,b,c 1 1 1 0
# 2: a,b 1 1 0 0
# 3: a,b,c,d 1 1 1 1
cSplit_e(as.data.table(a), "a", ",", type = "character", fill = 0, drop = TRUE)
# a_a a_b a_c a_d
# 1: 1 1 1 0
# 2: 1 1 0 0
# 3: 1 1 1 1
There's also mtabulate
from "qdapTools":
library(qdapTools)
mtabulate(strsplit(a, ","))
# a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
A very direct base R approach is to use table
along with stack
and strsplit
:
table(rev(stack(setNames(strsplit(a, ",", TRUE), seq_along(a)))))
# values
# ind a b c d
# 1 1 1 1 0
# 2 1 1 0 0
# 3 1 1 1 1
Encode string column as several dummy columns
An option would be to split the string column by /
and use mtabulate
library(qdapTools)
cbind(mtabulate(strsplit(df1$names, "/")), df1['label'])
# A B C D label
#1 1 1 0 0 V
#2 1 0 0 0 W
#3 1 0 1 1 X
#4 0 1 1 0 Y
#5 0 1 0 1 Z
Or in base R
table(stack(setNames(strsplit(df1$names, "/"), df1$label))[2:1])
NO packages used
data
df1 <- structure(list(names = c("A/B", "A", "A/C/D", "B/C", "B/D"),
label = c("V", "W", "X", "Y", "Z")), class = "data.frame",
row.names = c("1", "2", "3", "4", "5"))
Split a column into multiple binary dummy columns
We can use mtabulate
from qdapTools
after splitting (strsplit(..
) the 'features' column.
library(qdapTools)
cbind(sampledf[1],mtabulate(strsplit(as.character(sampledf$features), ':')))
# vin f1 f2 f3 f4 f5
#1 v1 1 1 1 0 0
#2 v2 0 1 0 1 1
#3 v3 1 0 0 1 1
Or we can use cSplit_e
from library(splitstackshape)
library(splitstackshape)
df1 <- cSplit_e(sampledf, 'features', ':', type= 'character', fill=0, drop=TRUE)
names(df1) <- sub('.*_', '', names(df1))
Or using base R
methods, we split
as before, set the names of the list
elements from the strsplit
with 'vin' column, convert to a key/value columns 'data.frame' using stack
, get the table
, transpose and cbind
with the first column of 'sampledf'.
cbind(sampledf[1],
t(table(stack(setNames(strsplit(as.character(sampledf$features), ':'),
sampledf$vin)))))
Split columns at delimiter, fill new cols with values from split
We can do an strsplit
on the 'functionality' column by space and then with mtabulate
get the frequency
library(qdapTools)
mtabulate(strsplit(df1$functionality, " +"))
If the column functionality
is factor
class (based on wrapping of as.data.frame
at the end - default gives stringsAsFactors = TRUE
), then change it to character
class before splitting
mtabulate(strsplit(as.character(df1$functionality), " +"))
# audits investigations monitoring patterns reviews
#1 0 1 1 1 0
#2 1 0 0 1 0
#3 1 0 1 1 1
#4 1 1 1 1 1
Or as the OP was using tidyverse
, we can get the expected output with separate_rows/spread
library(tidyverse)
df1 %>%
rownames_to_column('rn') %>%
separate_rows(functionality) %>%
count(rn, functionality) %>%
spread(functionality, n, fill = 0) %>%
select(-rn)
# A tibble: 4 x 5
# audits investigations monitoring patterns reviews
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 1 1 1 0
#2 1 0 0 1 0
#3 1 0 1 1 1
#4 1 1 1 1 1
Or a base R
option would be to convert the list
of vector
s into a two column data.frame with stack
and get the table
table(stack(setNames(strsplit(as.character(df1$functionality), " +"),
row.names(df1)))[2:1])
# values
#ind audits investigations monitoring patterns reviews
# 1 0 1 1 1 0
# 2 1 0 0 1 0
# 3 1 0 1 1 1
# 4 1 1 1 1 1
data
df1 <- structure(list(functionality = c("monitoring investigations patterns",
"audits patterns", "reviews audits monitoring patterns",
"reviews audits monitoring investigations patterns"
)), class = "data.frame", row.names = c(NA, -4L))
Related Topics
Setting Individual Axis Limits With Facet_Wrap and Scales = "Free" in Ggplot2
Join 3 Columns of Different Lengths in R
Find Complement of a Data Frame (Anti - Join)
How to Combine Multiple Variable Data to a Single Variable Data
R Markdown - Changing Font Size and Font Type in HTML Output
Removing Space Between Numeric Values in R
Choose the Top Five Values from Each Group in R
Counting Unique Values Across Variables (Columns) in R
How to Generate a Histogram for Each Column of My Table
Coerce Multiple Columns to Factors At Once
Change Rows into Columns in R With Values Yes/No (1/0)
Quickly Reading Very Large Tables as Dataframes
How to Find the Statistical Mode
Counting the Number of Elements With the Values of X in a Vector
Convert Categorical Variables to Numeric in R
How to Specifically Order Ggplot2 X Axis Instead of Alphabetical Order