How to One-Hot-Encode Factor Variables with Data.Table

How to one-hot-encode factor variables with data.table?

Here you go:

# Melt to long form (one row per ID/variable/value), then cast wide with one
# column per observed variable_value pair; length() counts the matching rows.
dcast(melt(dt, id.vars = "ID"), ID ~ variable + value, fun.aggregate = length)
#   ID Color_blue Color_green Color_red Shape_cirlce Shape_square Shape_triangle
#1:  1          0           1         0            0            1              0
#2:  2          0           0         1            0            0              1
#3:  3          0           0         1            0            1              0
#4:  4          1           0         0            0            0              1
#5:  5          0           1         0            1            0              0

To also get columns for factor levels that are declared but never occur in the data (e.g. Color_purple below), you can do the following:

# Melt with value.factor = TRUE so `value` is a factor, and cast with
# drop = FALSE so levels that never occur still get an (all-zero) column.
res <- dcast(
  melt(dt, id.vars = "ID", value.factor = TRUE),
  ID ~ value,
  drop = FALSE,
  fun.aggregate = length
)
# Rebuild "<column>_<level>" names from the declared levels of every non-ID
# column, taken in the order the columns appear in dt.
# NOTE(review): assumes the cast columns come out in this same order and that
# no level name is shared between two factor columns -- confirm on your data.
setnames(res, c("ID", unlist(lapply(
  setdiff(names(dt), "ID"),
  function(col) paste(col, levels(dt[[col]]), sep = "_")
))))
res
#   ID Color_blue Color_green Color_red Color_purple Shape_cirlce Shape_square Shape_triangle
#1:  1          0           1         0            0            0            1              0
#2:  2          0           0         1            0            0            0              1
#3:  3          0           0         1            0            0            1              0
#4:  4          1           0         0            0            0            0              1
#5:  5          0           1         0            0            1            0              0

How to turn one-hot encoded variables to a single factor in R

Here's a solution ...

First, one-hot encode carb:

# Recode carb as a factor so that model.matrix() treats it as categorical.
mtcars[["carb"]] <- factor(mtcars[["carb"]])
# Zero-intercept formula: every carb level gets its own 0/1 indicator column.
df <- as.data.frame(model.matrix(~ 0 + carb, data = mtcars))
head(df)

#>                   carb1 carb2 carb3 carb4 carb6 carb8
#> Mazda RX4             0     0     0     1     0     0
#> Mazda RX4 Wag         0     0     0     1     0     0
#> Datsun 710            1     0     0     0     0     0
#> Hornet 4 Drive        1     0     0     0     0     0
#> Hornet Sportabout     0     1     0     0     0     0
#> Valiant               1     0     0     0     0     0

Then rebuild the factor from the dummy columns (we could of course also drop the one-hot encoded variables afterwards):

library(dplyr)

# Level labels are the dummy column names with the "carb" prefix stripped,
# kept in column order so they line up with the positions which.max() returns.
carb_levels <- sub("^carb", "", grep("^carb", names(df), value = TRUE))

df %>%
  rowwise() %>%
  # which.max() yields the *position* of the hot column; look the label up so
  # that e.g. carb6 is restored as "6" rather than as position 5.
  mutate(remade = carb_levels[which.max(c_across(starts_with("carb")))]) %>%
  ungroup() %>%
  mutate(remade = factor(remade, levels = carb_levels))

#> # A tibble: 32 x 7
#>    carb1 carb2 carb3 carb4 carb6 carb8 remade
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct> 
#>  1     0     0     0     1     0     0 4     
#>  2     0     0     0     1     0     0 4     
#>  3     1     0     0     0     0     0 1     
#>  4     1     0     0     0     0     0 1     
#>  5     0     1     0     0     0     0 2     
#>  6     1     0     0     0     0     0 1     
#>  7     0     0     0     1     0     0 4     
#>  8     0     1     0     0     0     0 2     
#>  9     0     1     0     0     0     0 2     
#> 10     0     0     0     1     0     0 4     
#> # … with 22 more rows

Here it is as a function with the option to keep or delete the one hot encoded columns a la @KM_83

#' Collapse one-hot ("dummy") columns back into a single factor column.
#'
#' Dummy columns are those whose name starts with `encoded_prefix` (matched
#' case-insensitively, as `starts_with()` does); the rest of each name is used
#' as the level label. For every row the first column holding the row maximum
#' is taken as the "hot" one.
#'
#' @param df A data frame containing the dummy columns.
#' @param encoded_prefix String. Common prefix of the dummy columns; also the
#'   name of the factor column that is created (or overwritten).
#' @param keep_dummies If `FALSE` (the default) the dummy columns are removed
#'   from the result.
#' @return A tibble with the rebuilt factor column, whose levels are the
#'   suffixes of the dummy columns in column order.
cold_encode <- function(df, encoded_prefix, keep_dummies = FALSE) {
  df <- dplyr::as_tibble(df)
  col_names <- names(df)

  # A column named exactly like the prefix is the output column, not a dummy.
  is_dummy <- startsWith(tolower(col_names), tolower(encoded_prefix)) &
    col_names != encoded_prefix
  dummy_cols <- col_names[is_dummy]
  if (length(dummy_cols) == 0L) {
    stop("no dummy columns start with \"", encoded_prefix, "\"", call. = FALSE)
  }

  # Map the position of the hot column to its level label, so a column such
  # as carb6 is restored as "6" and not as its position among the dummies.
  level_names <- substring(dummy_cols, nchar(encoded_prefix) + 1L)
  hot <- max.col(as.matrix(df[dummy_cols]), ties.method = "first")
  df[[encoded_prefix]] <- factor(level_names[hot], levels = level_names)

  if (!keep_dummies) {
    # Drop exactly the columns that were decoded, whatever their suffix.
    df <- df[setdiff(names(df), dummy_cols)]
  }
  df
}

# Rebuild a single `carb` factor from the carb* dummy columns; the dummies are
# dropped from the result because keep_dummies defaults to FALSE.
cold_encode(df, "carb")
#> # A tibble: 32 x 11
#>      mpg   cyl  disp    hp  drat    wt  qsec    vs    am  gear carb 
#>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <fct>
#>  1  21       6  160    110  3.9   2.62  16.5     0     1     4 4    
#>  2  21       6  160    110  3.9   2.88  17.0     0     1     4 4    
#>  3  22.8     4  108     93  3.85  2.32  18.6     1     1     4 1    
#>  4  21.4     6  258    110  3.08  3.22  19.4     1     0     3 1    
#>  5  18.7     8  360    175  3.15  3.44  17.0     0     0     3 2    
#>  6  18.1     6  225    105  2.76  3.46  20.2     1     0     3 1    
#>  7  14.3     8  360    245  3.21  3.57  15.8     0     0     3 4    
#>  8  24.4     4  147.    62  3.69  3.19  20       1     0     4 2    
#>  9  22.8     4  141.    95  3.92  3.15  22.9     1     0     4 2    
#> 10  19.2     6  168.   123  3.92  3.44  18.3     1     0     4 4    
#> # … with 22 more rows

One-hot encode each column in an integer matrix in R

There are probably more concise ways to do this, but this should work (and is at least easy to read and understand).

Suggested solution using base R and double loop:

set.seed(4)
# NOTE: `t` masks base::t() for the rest of the session; the name is kept
# because it comes from the question.
t <- matrix(floor(runif(10, 1, 9)), 5, 5)

# Sorted distinct values of every column; each value becomes one output
# column, so the total column count is known before the loops start.
col_values <- lapply(seq_len(ncol(t)), function(j) sort(unique(t[, j])))

# Preallocate the result instead of growing it with cbind() on every pass.
t_hot <- matrix(0, nrow = nrow(t), ncol = sum(lengths(col_values)))
colnames(t_hot) <- character(ncol(t_hot))

# Index of the next output column to fill.
k <- 0L

# For each column in the original matrix ...
for (col in seq_len(ncol(t))) {
  # ... and each distinct value in that column (in ascending order)
  for (val in col_values[[col]]) {
    k <- k + 1L
    # 1 where the cell equals this value, 0 elsewhere
    t_hot[, k] <- as.numeric(t[, col] == val)
    # name the indicator "<column index>_<value>"
    colnames(t_hot)[k] <- paste0(col, "_", val)
  }
}

This returns:

     1_1 1_3 1_5 1_7 2_1 2_3 2_6 2_8 3_1 3_3 3_5 3_7 4_1 4_3 4_6 4_8 5_1 5_3 5_5 5_7
[1,]   0   0   1   0   0   1   0   0   0   0   1   0   0   1   0   0   0   0   1   0
[2,]   1   0   0   0   0   0   1   0   1   0   0   0   0   0   1   0   1   0   0   0
[3,]   0   1   0   0   0   0   0   1   0   1   0   0   0   0   0   1   0   1   0   0
[4,]   0   1   0   0   0   0   0   1   0   1   0   0   0   0   0   1   0   1   0   0
[5,]   0   0   0   1   1   0   0   0   0   0   0   1   1   0   0   0   0   0   0   1

Efficient way to do one-hot encoding in R on large data

Try using mltools::one_hot

# library() fails loudly when a package is missing; require() only returns FALSE.
library(mltools)
library(data.table)

n <- 1e6

# An ID column plus 99 columns of random 0/1 integers.
# NOTE(review): one_hot() is documented as expanding factor columns; these
# columns are plain integers, so confirm this example really exercises the
# encoding before relying on its timing.
df1 <- data.table(ID = seq_len(n), replicate(99, sample(0:1, n, replace = TRUE)))

one_hot(df1)

No memory issues for me and it runs almost instantly

One hot encoding creating n-1 dummy variables

Here is a solution that performs full-rank dummification (i.e. it creates n-1 columns per factor to avoid collinearity):

library(caret)
# fullRank = TRUE requests the full-rank parameterisation (one dummy column
# fewer than the number of levels of each factor); ID is put back in front.
data.table(ID = DT$ID, predict(dummyVars(ID ~ ., data = DT, fullRank = TRUE), newdata = DT))

This does exactly the job:

   ID colorgreen colorred sizemedium sizesmall
1:  1          0        0          0         0
2:  2          1        0          1         0
3:  3          0        1          0         1

See this for a friendly walkthrough of this function, and ?dummyVars for all the available options.

Also: in a comment, the OP mentioned that this operation would need to be done for millions of rows and thousands of columns, thus justifying the need for data.table. If this simple pre-processing step is too much for the "computing muscle", then I am afraid that the modeling step (aka the real deal) is doomed to fail.

R - How to one-hot encode a single column while keeping the other columns?

# library() fails loudly when a package is missing; require() only returns FALSE.
library(tidyr)
library(dplyr)

# Tag every row with value = 1, then spread `subject` into one column per
# distinct subject; combinations that do not occur are filled with 0.
# (spread() is superseded by pivot_wider() in current tidyr; it is kept here
# so the row and column order of the result shown below is unchanged.)
df %>%
  mutate(value = 1) %>%
  spread(subject, value, fill = 0)

group student exam_pass Japanese Math Science
1     A      01         N        0    0       1
2     A      01         Y        1    1       0
3     A      02         N        0    1       0
4     A      02         Y        0    0       1
5     B      01         Y        1    0       0
6     C      02         N        0    1       0

How to One-Hot-Encode Factor Variables with Data.Table