Reshape Multiple Categorical Variables to Binary Response Variables

Reshape multiple categorical variables to binary response variables

How much spice is too much? Here is a solution via tidyr:

library(dplyr)
library(tidyr)

# Turn the actor* columns into one 0/1 column per distinct name.
mydata %>%
  # stack every column whose name starts with "actor" into key/value pairs
  gather(key = "actor", value = "name", starts_with("actor")) %>%
  # constant marker that becomes the cell value after widening
  mutate(present = 1) %>%
  # the source column label is not wanted in the wide result
  select(-actor) %>%
  # one column per name; combinations that never occur are filled with 0
  spread(key = name, value = present, fill = 0)

       movie Jack Kate Leo
 1 Departed    1    0   1
 2  Titanic    0    1   1

Reshape dataframe from categorical variables to only binary variables

What you're trying to create are called dummy variables, and in R those are created using model.matrix(). Your specific application is a little special, however, so some extra fiddling is required.

# Example data: `id` identifies the row; f, g and h are categorical, and
# g and h each contain a missing value.
dtf <- data.frame(id = 20:24,
                  f = c("a", "b", "c", "a", "b"),
                  g = c("A", "C", NA, "B", "A"),
                  h = c("P", "R", "Q", NA, "Q"))

# (the first column is not a categorical variable, hence not included)
dtf2 <- dtf[-1]

# Build the dummy (indicator) matrix for a single categorical column.
#   col: a one-column data frame; its column name prefixes the output names
# Returns a numeric matrix with one 0/1 column per non-missing level. Rows in
# which the input value is missing are NA in every column.
dummy_columns <- function(col) {
    # Make sure to include NA as a level, so that missing values first show
    # up as an indicator column of their own
    col[[1]] <- factor(col[[1]], exclude = NULL)

    # Create dummy variables, excluding the intercept.
    # Formula syntax is the same as in e.g. lm(), except the response
    # variable (term to the left of ~) is not included.
    # '-1' means no intercept, '.' means all variables. Without an intercept
    # the factor is coded with one indicator column per level, so no explicit
    # contrasts have to be supplied.
    modmat <- model.matrix(~ -1 + ., data = col)

    # Locate the NA level by position instead of matching column names, so
    # that a real level whose name merely ends in "NA" is not mistaken for it
    na_col <- which(is.na(levels(col[[1]])))

    # Only do the operations if an NA level was found
    if (length(na_col) > 0) {
        na_rows <- rowSums(modmat[, na_col, drop = FALSE]) > 0
        modmat[na_rows, ] <- NA
        # drop = FALSE keeps the matrix shape (and the column name) when a
        # single indicator column remains
        modmat <- modmat[, -na_col, drop = FALSE]
    }

    modmat
}

# One indicator matrix per categorical column
l <- lapply(seq_along(dtf2), function(j) dummy_columns(dtf2[j]))

data.frame(dtf[1], do.call(cbind, l))
#   id fa fb fc gA gB gC hP hQ hR
# 1 20  1  0  0  1  0  0  1  0  0
# 2 21  0  1  0  0  0  1  0  0  1
# 3 22  0  0  1 NA NA NA  0  1  0
# 4 23  1  0  0  0  1  0 NA NA NA
# 5 24  0  1  0  1  0  0  0  1  0

R: Convert binary categorical variables to long data format

We can do this with the tidyverse:

library(tidyverse)
# Collapse the indicator columns into a single `response` column.
mydata %>%
    # stack the columns from cafe through classroom into (response, value) pairs
    gather(key = "response", value = "value", cafe:classroom) %>%
    # keep only the pairs whose indicator is 1
    filter(value == 1) %>%
    # return the identifying columns together with the response label
    select(id, response, gender, job)

Gathering multiple dummy variables as one categorical variable in R

If every row contains exactly one 1, use max.col to get the column index of each row's maximum, then use that index to subset the column names of the dataset:

# Label each row with the name of the column (first column excluded) that
# holds the row maximum. ties.method = "first" keeps the result deterministic:
# by default max.col() breaks ties at random, e.g. for a row that is all zeros.
df$Category <- names(df)[-1][max.col(df[-1], ties.method = "first")]
df$Category
#[1] "Groceries"      "Utilities"      "Consumables"    "Transportation" "Entertainment"  "Misc"

Reshape from long to wide and create columns with binary value

Using reshape2 we can dcast from long to wide.

As you only want a binary outcome, we can de-duplicate the data with unique() first:

library(reshape2)

# Drop repeated rows so identical records are only counted once
si <- unique(studentInfo)

# One row per StudentID and one column per Subject; each cell is the number
# of matching rows in `si`
dcast(data = si, formula = StudentID ~ Subject, fun.aggregate = length)

#  StudentID English History Maths Science
#1         1       1       0     1       1
#2         2       0       0     1       0
#3         3       0       1     0       0

Another approach using tidyr and dplyr is

library(tidyr)
library(dplyr)

# Same wide 0/1 table, built with dplyr/tidyr.
studentInfo %>%
  # constant marker that becomes the cell value after widening
  mutate(yesno = 1) %>%
  # remove repeated rows before spreading
  distinct() %>%
  # one column per Subject; missing combinations are filled with 0
  spread(key = Subject, value = yesno, fill = 0)

#  StudentID English History Maths Science
#1         1       1       0     1       1
#2         2       0       0     1       0
#3         3       0       1     0       0

Although I'm not a fan (yet) of tidyr syntax...

Reshaping long to wide R with categorical variables

Get the data in long format first, then pivot it to wide:

library(tidyr)

# Count, per ID, how often each variable/year/value combination occurs.
df %>%
  # stack the Var* columns into the (name, value) pairs used below
  pivot_longer(cols = starts_with("Var"), names_to = "name",
               values_to = "value") %>%
  # one column per name_Year_value combination; cells hold the number of
  # matching rows, and combinations that never occur are filled with 0
  pivot_wider(names_from = c(name, Year, value), values_from = name,
              values_fn = length, values_fill = 0)

#     ID Var1_1996_A Var2_1996_A Var3_1996_B Var3_1996_C Var1_1996_B Var3_1996_D
#  <int>       <int>       <int>       <int>       <int>       <int>       <int>
#1     1           2           2           1           1           0           0
#2     2           0           1           0           0           1           1
#3     3           0           0           0           0           0           0
# … with 6 more variables: Var1_1998_C <int>, Var2_1998_C <int>,
#   Var3_1998_A <int>, Var1_2000_D <int>, Var2_2000_D <int>, Var3_2000_D <int>

data

# Example input: five rows with a year, an ID and three character variables
df <- data.frame(
  Year = c(1996L, 1996L, 1996L, 1998L, 2000L),
  ID = c(1L, 1L, 2L, 2L, 3L),
  Var1 = c("A", "A", "B", "C", "D"),
  Var2 = c("A", "A", "A", "C", "D"),
  Var3 = c("B", "C", "D", "A", "D"),
  stringsAsFactors = FALSE
)

Columns to dummies in R, example provided

Try with reshape2. Like here:

library("reshape2")

# Add a constant marker column `ile`, then cast to wide form: one row per
# uid, one column per language_name, with 0 where a pair does not occur.
t <- dcast(cbind(z, ile = 1),
           uid ~ language_name, value.var = "ile", fill = 0)

# Prefix every column except the first with "Language_"
colnames(t)[-1] <- paste0("Language_", colnames(t)[-1])

Reshape Multiple Categorical Variables to Binary Response Variables