Creating compound/interacted dummy variables in data.table in R
Is this what you're looking for:
# Cut points for the age brackets: (0,35], (35,45], (45,55], (55,60], ..., (95,Inf]
age.brackets <- c(0, seq(35, 55, by = 10), seq(60, 95, by = 5), Inf) # age ranges
ranges <- cut(demo$agef, age.brackets)  # which bracket each person falls into
# NOTE(review): table() orders its rows by the sorted unique agef values, so
# `spread` lines up with the rows of `demo` only while ages are unique and
# already sorted ascending -- true for this example data; verify for real input.
spread <- table(demo$agef, ranges) # identify persons in each range
# Zero out the indicator rows belonging to the other sex ('1' = male, '2' = female).
male.spread <- (demo$sex == '1') * as.matrix(spread)
female.spread <- (demo$sex == '2') * as.matrix(spread)
newdt <- data.table(
  cbind(
    demo,
    matrix(as.vector(male.spread), ncol = ncol(male.spread)),
    matrix(as.vector(female.spread), ncol = ncol(female.spread))
  )
)
# Column names: turn a level like "(35,45]" into "M35_45" / "F35_45".
bracket.levels <- levels(ranges)
male.names <- gsub('.(\\d*),(\\d*|Inf).', 'M\\1_\\2', bracket.levels)
female.names <- gsub('.(\\d*),(\\d*|Inf).', 'F\\1_\\2', bracket.levels)
# FIX: male.spread columns were cbind-ed first, so the male names must come
# first too -- the original listed female.names first, mislabelling every
# dummy column. (Also dropped the original's unused split(demo, demo$sex)
# call and the redundant intermediate names(newdt) assignment.)
names(newdt) <- c(names(demo), male.names, female.names)
newdt
# id sex agef M0_35 M35_45 M45_55 M55_60 M60_65 M65_70 M70_75 M75_80 M80_85 M85_90
# 1: 1 1 43 0 1 0 0 0 0 0 0 0 0
# 2: 2 2 53 0 0 0 0 0 0 0 0 0 0
# 3: 3 1 63 0 0 0 0 1 0 0 0 0 0
# 4: 4 2 73 0 0 0 0 0 0 0 0 0 0
# 5: 5 2 83 0 0 0 0 0 0 0 0 0 0
# 6: 6 2 103 0 0 0 0 0 0 0 0 0 0
# M90_95 M95_Inf F0_35 F35_45 F45_55 F55_60 F60_65 F65_70 F70_75 F75_80 F80_85 F85_90
# 1: 0 0 0 0 0 0 0 0 0 0 0 0
# 2: 0 0 0 0 1 0 0 0 0 0 0 0
# 3: 0 0 0 0 0 0 0 0 0 0 0 0
# 4: 0 0 0 0 0 0 0 0 1 0 0 0
# 5: 0 0 0 0 0 0 0 0 0 0 1 0
# 6: 0 0 0 0 0 0 0 0 0 0 0 0
# F90_95 F95_Inf
# 1: 0 0
# 2: 0 0
# 3: 0 0
# 4: 0 0
# 5: 0 0
# 6: 0 1
Table for dummies in R
I am afraid I can only do it with loops. There might well be a more elegant way to do this in R but at least it works.
I don't know what you want to put in the cells when using v1-3 as rows and w1-3 as columns. You might count the number of times the values were the same, or the number of times both values were 1 — or use any other formula. In the example below, I tried two different ways (one is commented out).
# Toy data: v1-v3 and w1-w3 are parallel binary indicator columns.
dta <- data.frame(
  v1 = c(1, 0, 0, 1, 0, 0),
  v2 = c(0, 1, 0, 0, 1, 0),
  v3 = c(0, 0, 1, 0, 0, 1),
  w1 = c(0, 0, 1, 0, 0, 1),
  w2 = c(1, 0, 0, 1, 0, 0),
  w3 = c(0, 1, 0, 0, 1, 0)
)
# Cross-table skeleton: v-columns as rows, w-columns as columns.
t <- matrix(NA, nrow = 3, ncol = 3,
            dimnames = list(names(dta)[1:3], names(dta)[4:6]))
# Fill each cell with the number of rows on which the two columns agree.
for (r in rownames(t)) {
  for (c in colnames(t)) {
    t[r, c] <- sum(dta[[r]] == dta[[c]]) ## Agreement
    # t[r, c] <- sum(dta[[r]] == 1 & dta[[c]] == 1) ## Both are 1
  }
}
The script first creates the table and then loops through all cells to fill them in with the correct value, taking information from the original dataset.
For agreement, this matrix would result:
> print(t)
w1 w2 w3
v1 2 6 2
v2 2 2 6
v3 6 2 2
This means that, for example, v1 and w2 agree in 6 cases, while the agreement of v1 with w1 is only 2.
How to add dummy variables in R for a large data set
We can just use table
# table() on a two-column data frame cross-tabulates its columns (counts per
# (a, b) pair); as.data.frame.matrix() keeps that wide matrix layout instead
# of melting it to long form as plain as.data.frame() would.
as.data.frame.matrix(table(df1))
# A B C D
#1 1 1 0 0
#3 0 0 1 0
#4 0 0 0 1
#5 0 0 0 2
Or an efficient approach would be dcast
from data.table
library(data.table)
# setDT() converts df1 to a data.table *by reference* (df1 itself is mutated);
# dcast() then spreads the 'b' values into columns, with length as the
# aggregate so each cell counts the matching rows.
dcast(setDT(df1), a~b, value.var = "a", length)
data
# Example data: six observations with an integer id `a` and a letter group `b`
# (b stays character; row names are the characters "1".."6", as in the original).
df1 <- data.frame(
  a = c(1L, 1L, 3L, 4L, 5L, 5L),
  b = c("A", "B", "C", "D", "D", "D"),
  row.names = as.character(1:6)
)
Loop over data.frame columns to generate dummy variable in R
dt[, 69:135] == 1
will return TRUE
if the value in column 69:135 is 1 and FALSE
otherwise.
dt[, 178:244] == 2
will return TRUE
if the value in column 178:244 is 2 and FALSE
otherwise.
You can perform an AND (&
) operation between them to compare them elementwise meaning dt[, 69] & dt[, 178]
, dt[, 70] & dt[, 179]
and so on. Take the rowwise sum of them and mark the row as 'yes'
even if a single TRUE
is found in that row.
dt$left_region <- ifelse(rowSums(dt[, 69:135] == 1 & dt[, 178:244] == 2) > 0, 'yes', 'no')
How to summarize data by-group, by creating dummy variables as the collapsing method
I took a stub myself, using functions from the collapse
package.
library(magrittr)
library(collapse)
library(data.table)
# For each (carrier, month), produce one logical column per top destination
# indicating whether that destination appears in the group.
my_flights_raw %>%
# drop exact duplicate rows first
collapse::funique() %>%
collapse::fgroup_by(carrier, month) %>%
# collect each group's destinations into a single list element
collapse::fsummarise(nested_dest = list(dest)) %>%
# per group: logical vector marking which top destinations are present
collapse::ftransform(new_col = lapply(nested_dest, \(x) my_flights_top_dest_across_months %in% x)) %>%
# transpose the list-of-vectors into one column per destination; keep cols 1:2
collapse::fcompute(., data.table::transpose(new_col), keep = 1:2) %>%
setNames(c("carrier", "month", my_flights_top_dest_across_months)) %>%
# return the result as a tibble
collapse::qTBL()
Unsurprisingly, collapse
gives the fastest execution time. But I was surprised to see that @ThomasIsCoding's solution based on data.table
was slower than my original tidyverse
mix-and-match solution.
I also factored in the single data.table
dependency in Thomas's answer, compared to the variety of dependencies in my original method.
library(nycflights13)
library(dplyr, warn.conflicts = FALSE)
# OP original
# Raw data: one row per flight, keeping only carrier, month and destination.
my_flights_raw <-
flights %>%
select(carrier, month, dest)
# Flight counts per (month, dest), sorted by month then descending count.
my_flights_agg <-
my_flights_raw %>%
count(month, dest, name = "n_obs") %>%
arrange(month, desc(n_obs))
# Union of each month's 5 busiest destinations (ties included by slice_max).
my_flights_top_dest_across_months <-
my_flights_agg %>%
group_by(month) %>%
slice_max(order_by = n_obs, n = 5) %>%
pull(dest) %>%
unique()
# The per-month top-5 table, kept for Thomas's data.table solution below.
my_flights_top_5_by_month <-
my_flights_agg %>%
group_by(month) %>%
slice_max(order_by = n_obs, n = 5)
# NOTE(review): this duplicates my_flights_top_dest_across_months (note the
# missing trailing "s") -- both hold the same destination vector.
my_flights_top_dest_across_month <- unique(my_flights_top_5_by_month$dest)
# OP's original tidyverse implementation: for each (carrier, month) group,
# one logical column per top destination saying whether it was served.
# Reads my_flights_raw and my_flights_top_dest_across_month from the
# enclosing environment; returns a tibble.
op_slow <- function() {
library(tidyr)
library(tibble)
library(purrr)
my_flights_raw %>%
group_by(carrier, month) %>%
# list-column holding each group's distinct destinations
summarise(destinations_vec = list(unique(dest))) %>%
# attach the same top-destination vector to every row
add_column(top_dest = list(my_flights_top_dest_across_month)) %>%
# per row: logical vector of which top destinations are present
mutate(are_top_dest_included = purrr::map2(.x = destinations_vec, .y = top_dest, .f = ~ .y %in% .x ), .keep = "unused") %>%
# name the logical elements so unnest_wider can label the new columns
mutate(across(are_top_dest_included, ~purrr::map(.x = ., .f = ~setNames(object = .x, nm = my_flights_top_dest_across_month)) )) %>%
tidyr::unnest_wider(are_top_dest_included)
}
# OP collapse
# Same contract as op_slow(), implemented with the collapse package's fast
# grouped verbs. Reads my_flights_raw and my_flights_top_dest_across_months
# from the enclosing environment; returns a tibble.
op_collapse <- function() {
library(magrittr)
library(collapse)
library(data.table)
my_flights_raw %>%
# drop exact duplicate rows first
collapse::funique() %>%
collapse::fgroup_by(carrier, month) %>%
# collect each group's destinations into a single list element
collapse::fsummarise(nested_dest = list(dest)) %>%
# per group: logical vector marking which top destinations are present
collapse::ftransform(new_col = lapply(nested_dest, \(x) my_flights_top_dest_across_months %in% x)) %>%
# transpose the list-of-vectors into one column per destination; keep cols 1:2
collapse::fcompute(., data.table::transpose(new_col), keep = 1:2) %>%
setNames(c("carrier", "month", my_flights_top_dest_across_months)) %>%
collapse::qTBL()
}
# Thomas data.table
# @ThomasIsCoding's data.table implementation: join a flag table of top
# destinations onto the raw flights, then dcast to logical dummy columns.
# Reads my_flights_top_5_by_month and my_flights_raw from the enclosing
# environment; note setDT() converts my_flights_raw by reference.
thomas_data.table <- function() {
library(data.table)
# lookup table: each top destination paired with flag fd = 1
my_flights_top_dest_across_months <-
data.table(
dest = unique(my_flights_top_5_by_month$dest),
fd = 1
)
# right join on dest (non-top destinations get fd = NA), then spread dest
# into columns; sum(x) > 0 is TRUE when the group served that destination
dcast(my_flights_top_dest_across_months[
setDT(my_flights_raw),
on = .(dest)
],
carrier + month ~ dest,
fun.aggregate = function(x) sum(x) > 0
)[, c(
"carrier", "month",
my_flights_top_dest_across_months$dest
), with = FALSE]
}
# Run all three implementations once and verify they produce the same result
# (attributes ignored), then benchmark them.
output_op_slow <- op_slow()
output_op_collapse <- op_collapse()
output_thomas <- thomas_data.table()
#> Using 'month' as value column. Use 'value.var' to override
waldo::compare(output_op_slow, output_op_collapse, ignore_attr = TRUE)
#> v No differences
waldo::compare(output_op_slow, as_tibble(output_thomas), ignore_attr = TRUE)
#> v No differences
# check = FALSE because the outputs differ in class/attributes, not content.
bm <- bench::mark(op_slow = op_slow(),
op_collapse = op_collapse(),
thomas_dt = thomas_data.table(),
check = FALSE,
iterations = 100)
ggplot2::autoplot(bm)
Speed up this loop to create dummy columns with data.table and set in R
Here's a different approach that performs better — on my machine — than the original approach in the question:
1) Get unique days except Sunday
Day <- setdiff(dt$Week_Day, "Sunday")
2) Initialize new columns with 0:
dt[, (Day) := 0L]
3) Update with 1s by reference in a loop:
# For each day, flip the matching rows of its column to 1L in place; set()
# avoids the [.data.table overhead inside the loop.
for(x in Day) {
set(dt, i = which(dt[["Week_Day"]] == x), j = x, value = 1L)
}
Simple performance comparison:
# Benchmark fixture: 300k random weekday names; dt2 is an independent deep
# copy so both approaches can be timed on identical data.
dt1 <- data.table(
  Week_Day = sample(
    c("Monday", "Tuesday", "Wednesday",
      "Thursday", "Friday", "Saturday", "Sunday"),
    size = 3e5, replace = TRUE
  )
)
dt2 <- copy(dt1)
# Timed run of the set()-based approach, operating on dt1.
system.time({
# FIX: the original read dt$Week_Day here, but `dt` is not defined in this
# self-contained benchmark -- dt1 is the table being filled.
Day <- setdiff(unique(dt1$Week_Day), "Sunday")
dt1[, (Day) := 0L]
for(x in Day) {
set(dt1, i = which(dt1[["Week_Day"]] == x), j = x, value = 1L)
}
})
# User System verstrichen
# 0.029 0.003 0.032
# Timed run of the original ifelse()-based approach, operating on dt2.
system.time({
# FIX: was dt$Week_Day, but `dt` is undefined in this benchmark -- use dt2.
Day <- unique(dt2$Week_Day)
# seq_along() instead of 1:length(Day): safe if Day were ever empty.
for (i in seq_along(Day)) {
if (Day[i] != "Sunday") {
dt2[, Day[i] := ifelse(Week_Day == Day[i], 1L, 0L)]
}
}
})
# User System verstrichen
# 0.138 0.070 0.210
# Both approaches must produce identical tables.
all.equal(dt1, dt2)
#[1] TRUE
Related Topics
How to Generate a Matrix of Combinations
Different Size Facets Proportional of X Axis on Ggplot 2 R
How to Get Name of Variable in R (Substitute)
Moving Average of Previous Three Values in R
R - Use Rbind on Multiple Variables with Similar Names
Create a Data Frame of Unequal Lengths
How to Use the Strsplit Function with a Period
Select Values from Different Columns Based on a Variable Containing Column Names
Problems When Trying to Load a Package in R Due to Rjava
Add Empty Columns to a Dataframe with Specified Names from a Vector
Change Colors in Ggpairs Now That Params Is Deprecated
Non-Standard Evaluation (Nse) in Dplyr's Filter_ & Pulling Data from MySQL
Plotting Multiple Time-Series in Ggplot