Empty Factors in "By" Data.Table

Empty factors in by data.table

library(data.table)
# Reproducible example data: v2 is a factor that declares five levels (a-e)
# but is only ever sampled from the first three, so "d" and "e" are empty.
set.seed(42)
dtr <- data.table(
  v1 = sample(1:15),
  v2 = factor(sample(letters[1:3], 15, replace = TRUE), levels = letters[1:5]),
  v3 = sample(c("yes", "no"), 15, replace = TRUE)
)

# Aggregate per (v2, v3) pair; grouping only yields the combinations that
# actually occur in the data. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
res <- dtr[, list(freq = .N, mm = sum(v1, na.rm = TRUE)), by = list(v2, v3)]

You can use CJ (a cross join). Doing this after aggregation avoids setting the key for the big table and should be faster.

# Key the small aggregated table. setkeyv() is the variant that takes column
# names as a character vector; setkey() expects unquoted column names.
setkeyv(res, c("v2", "v3"))
# Join against every (factor level, v3 value) combination; combinations that
# are missing from `res` are returned as rows with NA in freq and mm.
res[CJ(levels(dtr$v2), unique(dtr$v3))]

# v2 v3 freq mm
# 1: a no 1 9
# 2: a yes 2 11
# 3: b no 2 11
# 4: b yes 3 23
# 5: c no 4 40
# 6: c yes 3 26
# 7: d no NA NA
# 8: d yes NA NA
# 9: e no NA NA
# 10: e yes NA NA

Use a factor column in by and do not drop empty factors

If you are willing to enumerate the factor levels in i (rather than setting by="group"), this will give you the hoped-for result.

# Key on the factor column so that a character vector in `i` is joined to it.
setkeyv(x, "group")
# Look up every declared level and count the matching rows once per looked-up
# level (by = .EACHI), so levels with no rows are still reported.
x[levels(group), .N, by = .EACHI]
# group N
# 1: a 2
# 2: b 1
# 3: c 0

aggregate with empty factor but keep row

# Create the example dataset: `val` cycles through 1, 2, 3 four times and
# `factor` bins those values into [1,2], (2,3], (3,4] labelled A, B, C, so
# level C never occurs. The cut() argument is `ordered_result` and R's logical
# constant is `TRUE`; the previous `ordered_results=True` was swallowed by
# `...` and silently ignored.
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(
    rep(1:3, 4),
    breaks = c(1, 2, 3, 4),
    include.lowest = TRUE,
    ordered_result = TRUE,
    labels = LETTERS[1:3]
  )
)

library(dplyr)

# Start from one row per declared level of the factor ...
data.frame(factor = levels(df2$factor)) %>%
  # ... attach the per-level sum of `val` for the levels that have rows ...
  left_join(
    summarise(group_by(df2, factor), x = sum(val)),
    by = "factor"
  ) %>%
  # ... and give levels without any rows a sum of 0 instead of NA
  mutate(x = coalesce(x, 0L))

# factor x
# 1 A 12
# 2 B 12
# 3 C 0

Or without any package

# Left-join the per-level sums onto the full list of levels so that levels
# without any rows are kept (all.x = TRUE), then replace their NA sum with 0.
dd <- merge(
  data.frame(Group.1 = levels(df2$factor)),
  aggregate(df2$val, list(df2$factor), sum),
  all.x = TRUE
)
dd$x <- ifelse(is.na(dd$x), 0, dd$x)
dd

# Group.1 x
# 1 A 12
# 2 B 12
# 3 C 0

Or using data.table package to check if it's faster

library(data.table)

# Assuming you start with a data frame: `val` cycles through 1, 2, 3 four
# times and `factor` bins those values into [1,2], (2,3], (3,4] labelled
# A, B, C, so level C never occurs. The cut() argument is `ordered_result`
# and R's logical constant is `TRUE`; the previous `ordered_results=True`
# was swallowed by `...` and silently ignored.
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(
    rep(1:3, 4),
    breaks = c(1, 2, 3, 4),
    include.lowest = TRUE,
    ordered_result = TRUE,
    labels = LETTERS[1:3]
  )
)

# Lookup table with one row per level of "factor", keyed on that column
dt_levels <- data.table(factor = levels(df2$factor), key = "factor")

# Turn df2 into a keyed data.table (by reference) and total `val` for each
# level that actually has rows
dt_sum <- setDT(df2, key = "factor")[, .(Sum = sum(val)), by = "factor"]

# Join the sums onto the full list of levels; levels without rows get NA ...
dt_result <- dt_sum[dt_levels]
# ... which is then replaced by 0 in place
dt_result[, Sum := ifelse(is.na(Sum), 0, Sum)]

# The trailing [] makes the table print after the := update
dt_result[]

# factor Sum
# 1: A 12
# 2: B 12
# 3: C 0

Complete with all combinations after counting on data.table

Here is one possible way to solve your problem. Note that the argument with=FALSE in the data.table context lets you select columns using the standard data.frame rules. In the example below, I assumed that the columns used to compute all combinations are passed to myfun as a character vector.
Keep in mind that no column in your dataset should be named gcases. .EACHI in by lets you perform an operation for each row in i.

# Count the rows of `d` for every combination of the grouping columns `g`,
# including combinations that never occur (those get N = 0).
#
# d: a data.frame or data.table. It is passed through setDT(), which converts
#    and keys it by reference.
# g: character vector naming the grouping columns.
#
# Returns a data.table keyed by `g` with one row per combination and a count
# column N.
myfun <- function(d, g) {
  # Candidate values per column: all levels for factors, observed values
  # for every other type.
  level_or_unique <- function(x) if (is.factor(x)) levels(x) else unique(x)
  gcases <- lapply(setDT(d, key = g)[, g, with = FALSE], level_or_unique)

  # Build the cross join up front and pass it as a single symbol: a lone
  # variable name in `i` is looked up in the calling scope rather than among
  # the columns of `d`, so no column name in `d` can shadow it.
  all_combos <- do.call(CJ, gcases)

  # Count matches once per combination.
  d[all_combos, .N, keyby = .EACHI]
}

Ordering factors in data table using DT package

You could use a hidden column with the numeric value of the factor and sort the factors according to that hidden column:

library('DT')
# Percentage bands as a factor whose level order is the desired sort order
value <- factor(
  c(7.5, 12.5, 7.5, 17.5),
  levels = c(7.5, 12.5, 17.5),
  labels = c("5-10%", "10-15%", "15-20%")
)

# `levels` carries the factor's integer codes and serves as the sort key
example <- data.frame(
  name = c("A", "B", "C", "D"),
  value = value,
  levels = as.numeric(value)
)

# Column 1 (`value`) is ordered by the data in column 2 (`levels`), and
# column 2 itself is hidden from the rendered table.
datatable(
  example,
  options = list(
    columnDefs = list(
      list(targets = 1, orderData = 2),
      list(targets = 2, visible = FALSE)
    )
  ),
  rownames = FALSE
)


Related Topics



Leave a reply



Submit