Creating Dummy Variables in R Data.Table

Creating compound/interacted dummy variables in data.table in R

Is this what you're looking for?

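The question's demo table isn't reproduced here; a minimal stand-in reconstructed from the printed output further down (assuming id, sex and agef are its only columns, with sex stored as character) would be:

library(data.table)
demo <- data.table(id = 1:6,
                   sex = c('1', '2', '1', '2', '2', '2'),
                   agef = c(43, 53, 63, 73, 83, 103))
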
age.brackets <- c(0, seq(35, 55, by = 10), seq(60, 95, by = 5), Inf) # age range breakpoints
ranges <- cut(demo$agef, age.brackets)
spread <- table(demo$agef, ranges) # identify persons in each range; relies on agef being unique and demo sorted by agef
female.spread <- (demo$sex == '1') * as.matrix(spread) # per the printed output below, sex '1' fills the F columns
male.spread   <- (demo$sex == '2') * as.matrix(spread)

newdt <- data.table(
  cbind(
    demo,
    matrix(as.vector(female.spread), ncol = ncol(female.spread)),
    matrix(as.vector(male.spread), ncol = ncol(male.spread))
  )
)

# column names: turn bracket levels like "(35,45]" into "F35_45" / "M35_45"
female.names <- gsub('.(\\d*),(\\d*|Inf).', 'F\\1_\\2', levels(ranges))
male.names <- gsub('.(\\d*),(\\d*|Inf).', 'M\\1_\\2', levels(ranges))
names(newdt) <- c(names(demo), female.names, male.names)


newdt

# id sex agef F0_35 F35_45 F45_55 F55_60 F60_65 F65_70 F70_75 F75_80 F80_85 F85_90
# 1: 1 1 43 0 1 0 0 0 0 0 0 0 0
# 2: 2 2 53 0 0 0 0 0 0 0 0 0 0
# 3: 3 1 63 0 0 0 0 1 0 0 0 0 0
# 4: 4 2 73 0 0 0 0 0 0 0 0 0 0
# 5: 5 2 83 0 0 0 0 0 0 0 0 0 0
# 6: 6 2 103 0 0 0 0 0 0 0 0 0 0
# F90_95 F95_Inf M0_35 M35_45 M45_55 M55_60 M60_65 M65_70 M70_75 M75_80 M80_85 M85_90
# 1: 0 0 0 0 0 0 0 0 0 0 0 0
# 2: 0 0 0 0 1 0 0 0 0 0 0 0
# 3: 0 0 0 0 0 0 0 0 0 0 0 0
# 4: 0 0 0 0 0 0 0 0 1 0 0 0
# 5: 0 0 0 0 0 0 0 0 0 0 1 0
# 6: 0 0 0 0 0 0 0 0 0 0 0 0
# M90_95 M95_Inf
# 1: 0 0
# 2: 0 0
# 3: 0 0
# 4: 0 0
# 5: 0 0
# 6: 0 1
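
As a more compact sketch along the same lines (my addition, not part of the original answer): build the combined sex/bracket label in a helper column, here called grp, and let dcast spread it. This assumes the same sex coding, and it only creates columns for combinations that actually occur in the data, unlike the full grid above.

demo[, grp := paste0(ifelse(sex == '1', 'F', 'M'),
                     gsub('.(\\d*),(\\d*|Inf).', '\\1_\\2', cut(agef, age.brackets)))]
dcast(demo, id + sex + agef ~ grp, value.var = "grp", fun.aggregate = length)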

Table for dummies in R

I am afraid I can only do it with loops. There might well be a more elegant way to do this in R but at least it works.

I don't know what you want to put in the cells when using v1-v3 as rows and w1-w3 as columns. You might count how often the values are the same, or how often both are 1, or use any other formula. In the example below I implemented two of these (one is commented out).

dta <- data.frame(v1 = c(1,0,0,1,0,0),
                  v2 = c(0,1,0,0,1,0),
                  v3 = c(0,0,1,0,0,1),
                  w1 = c(0,0,1,0,0,1),
                  w2 = c(1,0,0,1,0,0),
                  w3 = c(0,1,0,0,1,0))

# pre-allocate the 3x3 result (note: this t masks base::t inside this script)
t <- matrix(NA, nrow = 3, ncol = 3)
colnames(t) <- names(dta[4:6])
rownames(t) <- names(dta[1:3])
for (r in rownames(t)) {
  for (c in colnames(t)) {
    t[r, c] <- sum(dta[[r]] == dta[[c]])            ## Agreement
    # t[r, c] <- sum(dta[[r]] == 1 & dta[[c]] == 1) ## Both are 1
  }
}

The script first creates the table and then loops through all cells to fill them in with the correct value, taking information from the original dataset.

For agreement, the resulting matrix is:

> print(t)
w1 w2 w3
v1 2 6 2
v2 2 2 6
v3 6 2 2

This means that, for example, v1 and w2 agree in 6 cases, while the agreement of v1 with w1 is only 2.
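
A vectorized alternative (my addition, not from the original answer): for 0/1 columns, crossprod counts the cells where both vectors are 1, so agreement is the count of matching 1s plus the count of matching 0s. A sketch:

V <- as.matrix(dta[1:3])
W <- as.matrix(dta[4:6])
crossprod(V, W)                           # cells where both are 1
crossprod(V, W) + crossprod(1 - V, 1 - W) # agreement; same matrix as t above
#    w1 w2 w3
# v1  2  6  2
# v2  2  2  6
# v3  6  2  2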

How to add dummy variables in R for a large data set

We can just use table:

as.data.frame.matrix(table(df1))
# A B C D
#1 1 1 0 0
#3 0 0 1 0
#4 0 0 0 1
#5 0 0 0 2

Or, for a more efficient approach, use dcast from data.table:

library(data.table)
dcast(setDT(df1), a~b, value.var = "a", length)
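
This should produce the same counts, with a kept as a regular column rather than as row names:

#    a A B C D
# 1: 1 1 1 0 0
# 2: 3 0 0 1 0
# 3: 4 0 0 0 1
# 4: 5 0 0 0 2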

data

df1 <- structure(list(a = c(1L, 1L, 3L, 4L, 5L, 5L), b = c("A", "B", 
"C", "D", "D", "D")), .Names = c("a", "b"), row.names = c("1",
"2", "3", "4", "5", "6"), class = "data.frame")

Loop over data.frame columns to generate dummy variable in R

dt[, 69:135] == 1 returns a logical matrix that is TRUE wherever a value in columns 69:135 equals 1, and FALSE otherwise.

dt[, 178:244] == 2 likewise returns TRUE wherever a value in columns 178:244 equals 2.

You can combine the two with an elementwise AND (&), which pairs the matrices column by column: dt[, 69] & dt[, 178], dt[, 70] & dt[, 179], and so on. Then take the row-wise sum and mark a row 'yes' if it contains at least one TRUE, as in the line below.

dt$left_region <- ifelse(rowSums(dt[, 69:135] == 1 & dt[, 178:244] == 2) > 0, 'yes', 'no')
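
A toy run of the same pattern, with made-up column positions standing in for the real ranges:

dt <- data.frame(a1 = c(1, 0, 1), a2 = c(0, 0, 2),
                 b1 = c(2, 2, 1), b2 = c(0, 1, 2))
# columns 1:2 play the role of 69:135, columns 3:4 the role of 178:244
dt$left_region <- ifelse(rowSums(dt[, 1:2] == 1 & dt[, 3:4] == 2) > 0, 'yes', 'no')
dt$left_region
# [1] "yes" "no"  "no"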

How to summarize data by-group, by creating dummy variables as the collapsing method

I took a stab at it myself, using functions from the collapse package.

library(magrittr)
library(collapse)
library(data.table)

my_flights_raw %>%
  collapse::funique() %>%
  collapse::fgroup_by(carrier, month) %>%
  collapse::fsummarise(nested_dest = list(dest)) %>%
  collapse::ftransform(new_col = lapply(nested_dest, \(x) my_flights_top_dest_across_months %in% x)) %>%
  collapse::fcompute(., data.table::transpose(new_col), keep = 1:2) %>%
  setNames(c("carrier", "month", my_flights_top_dest_across_months)) %>%
  collapse::qTBL()

Unsurprisingly, collapse gives the fastest execution time. But I was surprised to see that @ThomasIsCoding's solution based on data.table was slower than my original tidyverse mix-and-match solution.

I also factored in the single data.table dependency in Thomas's answer, compared to the variety of dependencies in my original method.

library(nycflights13)
library(dplyr, warn.conflicts = FALSE)

# OP original
my_flights_raw <-
  flights %>%
  select(carrier, month, dest)

my_flights_agg <-
  my_flights_raw %>%
  count(month, dest, name = "n_obs") %>%
  arrange(month, desc(n_obs))

my_flights_top_dest_across_months <-
  my_flights_agg %>%
  group_by(month) %>%
  slice_max(order_by = n_obs, n = 5) %>%
  pull(dest) %>%
  unique()

my_flights_top_5_by_month <-
  my_flights_agg %>%
  group_by(month) %>%
  slice_max(order_by = n_obs, n = 5)

# singular-named near-duplicate of my_flights_top_dest_across_months; op_slow() below uses this one
my_flights_top_dest_across_month <- unique(my_flights_top_5_by_month$dest)


op_slow <- function() {
  library(tidyr)
  library(tibble)
  library(purrr)

  my_flights_raw %>%
    group_by(carrier, month) %>%
    summarise(destinations_vec = list(unique(dest))) %>%
    add_column(top_dest = list(my_flights_top_dest_across_month)) %>%
    mutate(are_top_dest_included = purrr::map2(.x = destinations_vec, .y = top_dest, .f = ~ .y %in% .x), .keep = "unused") %>%
    mutate(across(are_top_dest_included, ~ purrr::map(.x = ., .f = ~ setNames(object = .x, nm = my_flights_top_dest_across_month)))) %>%
    tidyr::unnest_wider(are_top_dest_included)
}


# OP collapse
op_collapse <- function() {
  library(magrittr)
  library(collapse)
  library(data.table)

  my_flights_raw %>%
    collapse::funique() %>%
    collapse::fgroup_by(carrier, month) %>%
    collapse::fsummarise(nested_dest = list(dest)) %>%
    collapse::ftransform(new_col = lapply(nested_dest, \(x) my_flights_top_dest_across_months %in% x)) %>%
    collapse::fcompute(., data.table::transpose(new_col), keep = 1:2) %>%
    setNames(c("carrier", "month", my_flights_top_dest_across_months)) %>%
    collapse::qTBL()
}


# Thomas data.table
thomas_data.table <- function() {
  library(data.table)

  my_flights_top_dest_across_months <-
    data.table(
      dest = unique(my_flights_top_5_by_month$dest),
      fd = 1
    )

  dcast(my_flights_top_dest_across_months[
    setDT(my_flights_raw),
    on = .(dest)
  ],
  carrier + month ~ dest,
  fun.aggregate = function(x) sum(x) > 0
  )[, c(
    "carrier", "month",
    my_flights_top_dest_across_months$dest
  ), with = FALSE]
}

output_op_slow <- op_slow()
output_op_collapse <- op_collapse()
output_thomas <- thomas_data.table()
#> Using 'month' as value column. Use 'value.var' to override

waldo::compare(output_op_slow, output_op_collapse, ignore_attr = TRUE)
#> v No differences
waldo::compare(output_op_slow, as_tibble(output_thomas), ignore_attr = TRUE)
#> v No differences

bm <- bench::mark(op_slow = op_slow(),
                  op_collapse = op_collapse(),
                  thomas_dt = thomas_data.table(),
                  check = FALSE,
                  iterations = 100)

ggplot2::autoplot(bm)

[benchmark plot: ggplot2::autoplot(bm) output omitted]

Speed up this loop to create dummy columns with data.table and set in R

Here's a different approach that performs better (on my machine) than the original approach in the question:

1) Get unique days except Sunday

Day <- setdiff(dt$Week_Day, "Sunday")

2) Initialize new columns with 0:

dt[, (Day) := 0L]

3) Update with 1s by reference in a loop:

for(x in Day) {
set(dt, i = which(dt[["Week_Day"]] == x), j = x, value = 1L)
}
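
To make the three steps concrete, a toy run on a made-up three-row table:

library(data.table)
dt <- data.table(Week_Day = c("Monday", "Sunday", "Tuesday"))
Day <- setdiff(dt$Week_Day, "Sunday")
dt[, (Day) := 0L]
for (x in Day) set(dt, i = which(dt[["Week_Day"]] == x), j = x, value = 1L)
dt
#    Week_Day Monday Tuesday
# 1:   Monday      1       0
# 2:   Sunday      0       0
# 3:  Tuesday      0       1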

Simple performance comparison:

dt1 <- data.table(Week_Day = sample(c("Monday", "Tuesday", "Wednesday",
                                      "Thursday", "Friday", "Saturday", "Sunday"), 3e5, TRUE))

dt2 <- copy(dt1)


system.time({
  Day <- setdiff(unique(dt1$Week_Day), "Sunday")
  dt1[, (Day) := 0L]
  for (x in Day) {
    set(dt1, i = which(dt1[["Week_Day"]] == x), j = x, value = 1L)
  }
})
#   user  system elapsed
#  0.029   0.003   0.032

system.time({
  Day <- unique(dt2$Week_Day)
  for (i in 1:length(Day)) {
    if (Day[i] != "Sunday") {
      dt2[, Day[i] := ifelse(Week_Day == Day[i], 1L, 0L)]
    }
  }
})

#   user  system elapsed
#  0.138   0.070   0.210


all.equal(dt1, dt2)
#[1] TRUE

