Reshape Multiple Categorical Variables to Binary Response Variables

Reshape multiple categorical variables to binary response variables

How much spice is too much? Here is a solution via tidyr:

library(dplyr)
library(tidyr)

# Turn the actor* columns into one 0/1 column per actor name.
mydata %>%
  # Stack every actor* column into (actor, name) pairs.
  gather(key = "actor", value = "name", starts_with("actor")) %>%
  # Which slot (actor1, actor2, ...) a name came from is irrelevant here.
  select(-actor) %>%
  # Mark each remaining movie/name pair as present ...
  mutate(present = 1) %>%
  # ... and spread the names into columns, filling absent pairs with 0.
  spread(key = name, value = present, fill = 0)

movie Jack Kate Leo
1 Departed 1 0 1
2 Titanic 0 1 1

Reshape dataframe from categorical variables to only binary variables

What you're trying to create are called dummy variables, and in R those are created using model.matrix(). Your specific application is a little special, however, so some extra fiddling is required.

dtf <- data.frame(
  id = 20:24,
  f = c("a", "b", "c", "a", "b"),
  g = c("A", "C", NA, "B", "A"),
  h = c("P", "R", "Q", NA, "Q")
)

# (the first column is not a categorical variable, hence not included)
dtf2 <- dtf[-1]

# Build the 0/1 dummy (indicator) matrix for a single categorical column.
#
# x:    vector of categories (character or factor); may contain NA.
# name: column name, used as the prefix of the dummy column names.
#
# Returns a numeric matrix with one row per element of x and one column per
# non-missing category. Rows where x is missing are NA in every column.
dummy_columns <- function(x, name) {
  # Make sure to include NA as a level, otherwise model.matrix() would drop
  # the rows with missing values instead of letting us mark them.
  fac <- factor(x, exclude = NULL)
  data <- setNames(data.frame(fac), name)

  # Formula syntax is the same as in e.g. lm(), except the response
  # variable (term to the left of ~) is not included.
  # '-1' means no intercept, '.' means all variables. Without an intercept
  # the single factor is expanded into one indicator column per level, so no
  # custom contrasts are needed.
  modmat <- model.matrix(~ . - 1, data = data)

  # Locate the NA level from the factor itself rather than by matching
  # column names, so that ordinary levels such as "DNA" are not mistaken
  # for missing values.
  na_cols <- which(is.na(levels(fac)))

  # Only do the operations if an NA level was found
  if (length(na_cols) > 0) {
    na_rows <- as.integer(fac) %in% na_cols
    modmat[na_rows, ] <- NA
    # drop = FALSE keeps the matrix shape (and the column name) even when
    # only a single dummy column is left.
    modmat <- modmat[, -na_cols, drop = FALSE]
  }

  modmat
}

# One dummy matrix per categorical column; seq_along() also copes with
# a data frame that has no categorical columns at all.
l <- lapply(seq_along(dtf2), function(j) {
  dummy_columns(dtf2[[j]], names(dtf2)[j])
})

data.frame(dtf[1], do.call(cbind, l))
# id fa fb fc gA gB gC hP hQ hR
# 1 20 1 0 0 1 0 0 1 0 0
# 2 21 0 1 0 0 0 1 0 0 1
# 3 22 0 0 1 NA NA NA 0 1 0
# 4 23 1 0 0 0 1 0 NA NA NA
# 5 24 0 1 0 1 0 0 0 1 0

R: Convert binary categorical variables to long data format

We can do this with the tidyverse:

library(tidyverse)
# Collapse the 0/1 columns cafe..classroom into one `response` column.
mydata %>%
  # Stack the indicator columns: `response` holds the former column name,
  # `value` holds its 0/1 entry.
  gather(key = "response", value = "value", cafe:classroom) %>%
  # Keep only the responses that were actually selected.
  filter(value == 1) %>%
  # Drop the now-constant `value` column and fix the column order.
  select(id, response, gender, job)

Gathering multiple dummy variables as one categorical variable in R

If there is always a 1 and it is not repeated in a single row, then use max.col to return the index of the max value in the row and with that index, subset the names of the dataset

# Every column except the first is treated as a 0/1 dummy column.
dummy_names <- names(df)[-1]
# For each row, the position of the largest entry among the dummy columns.
hit_index <- max.col(df[-1])
# Translate that position into the name of the matching dummy column.
df$Category <- dummy_names[hit_index]
df$Category
#[1] "Groceries" "Utilities" "Consumables" "Transportation" "Entertainment" "Misc"

Reshape from long to wide and create columns with binary value

Using reshape2 we can dcast from long to wide.

As you only want a binary outcome, we can de-duplicate the data with unique() first

library(reshape2)

# Remove duplicated rows so that a repeated StudentID/Subject pair is only
# counted once below (assumes studentInfo holds just these two columns --
# TODO confirm against the question's data).
si <- unique(studentInfo)
# Long to wide: one row per StudentID, one column per Subject. Each cell is
# the number of rows for that pair, so after de-duplication it reads as a
# 0/1 flag.
dcast(si, formula = StudentID ~ Subject, fun.aggregate = length)

# StudentID English History Maths Science
#1 1 1 0 1 1
#2 2 0 0 1 0
#3 3 0 1 0 0

Another approach using tidyr and dplyr is

library(tidyr)
library(dplyr)

# Same reshaping with dplyr/tidyr: flag, de-duplicate, then go wide.
studentInfo %>%
  # Constant marker that becomes the cell value after spreading.
  mutate(yesno = 1) %>%
  # Identical rows would otherwise produce duplicate keys in spread().
  distinct() %>%
  # One column per Subject; pairs that never occur are filled with 0.
  spread(key = Subject, value = yesno, fill = 0)

# StudentID English History Maths Science
#1 1 1 0 1 1
#2 2 0 0 1 0
#3 3 0 1 0 0

Although I'm not a fan (yet) of tidyr syntax...

Reshaping long to wide R with categorical variables

Get the data in long format first :

library(tidyr)

# Stack the Var* columns: the former column name goes to `name` and its
# category to `value` (these are the defaults, spelled out for clarity).
df %>%
  pivot_longer(
    cols = starts_with("Var"),
    names_to = "name",
    values_to = "value"
  ) %>%
  # One output column per variable/year/category combination. Each cell holds
  # the number of matching rows; combinations that never occur become 0.
  pivot_wider(
    names_from = c(name, Year, value),
    values_from = name,
    values_fn = length,
    values_fill = 0
  )

# ID Var1_1996_A Var2_1996_A Var3_1996_B Var3_1996_C Var1_1996_B Var3_1996_D
# <int> <int> <int> <int> <int> <int> <int>
#1 1 2 2 1 1 0 0
#2 2 0 1 0 0 1 1
#3 3 0 0 0 0 0 0
# … with 6 more variables: Var1_1998_C <int>, Var2_1998_C <int>,
# Var3_1998_A <int>, Var1_2000_D <int>, Var2_2000_D <int>, Var3_2000_D <int>

data

# Example input: five observations of three categorical variables,
# identified by Year and ID.
df <- data.frame(
  Year = c(1996L, 1996L, 1996L, 1998L, 2000L),
  ID = c(1L, 1L, 2L, 2L, 3L),
  Var1 = c("A", "A", "B", "C", "D"),
  Var2 = c("A", "A", "A", "C", "D"),
  Var3 = c("B", "C", "D", "A", "D"),
  # Keep the Var* columns as plain character vectors on every R version.
  stringsAsFactors = FALSE
)

Columns to dummies in R, example provided

Try with reshape2. Like here:

library("reshape2")

# Add a constant marker column `ile`, then cast to wide format: one row per
# uid and one column per language_name, with 0 for pairs that do not occur
# in z.
t <- dcast(
  cbind(z, ile = 1),
  uid ~ language_name,
  value.var = "ile",
  fill = 0
)

# Prefix every language column (everything but the leading uid column).
colnames(t)[-1] <- paste0("Language_", colnames(t)[-1])


Related Topics



Leave a reply



Submit