R - Count All Combinations

R - count all combinations

We can use either data.table or dplyr; both are very efficient. With data.table, we convert the 'data.frame' to a 'data.table' (setDT(dt)), group by all the columns of 'dt' (names(dt)), and take the number of rows in each group (.N) as the 'Count'.

library(data.table)
# Coerce dt to a data.table, then count the rows (.N) in each unique
# combination of all of its columns.
setDT(dt)[, .(Count = .N), by = names(dt)]

Or we can use a similar methodology using dplyr.

library(dplyr)
# Group by every column, selecting the columns by name with all_of(). This
# replaces the deprecated group_by_(.dots = ...); because the names are no
# longer parsed as expressions, no make.names() clean-up of dt is needed.
dt %>%
  group_by(across(all_of(names(dt)))) %>%
  # "drop_last" is summarise()'s default regrouping, stated explicitly so the
  # result is grouped exactly as before and no regrouping message is printed.
  summarise(count = n(), .groups = "drop_last")

Benchmarks

In case somebody wants to look at some metrics (and also to back up my earlier claim about efficiency):

# Reproducible benchmark data: one million rows by six columns of random
# 0/1 integers.
set.seed(24)
df1 <- as.data.frame(
  matrix(sample(c(0L, 1L), size = 6e6, replace = TRUE), ncol = 6)
)

# data.table approach: copy df1 into a data.table and count the rows (.N)
# in each unique combination of all columns.
akrunDT <- function() {
  dt1 <- as.data.table(df1)
  dt1[, .(Count = .N), by = names(df1)]
}

# dplyr approach: group df1 by every column and count the rows per group.
# Columns are selected by name with across(all_of(...)), which replaces the
# deprecated group_by_(.dots = ...).
akrunDplyr <- function() {
  df1 %>%
    group_by(across(all_of(names(df1)))) %>%
    # "drop_last" is summarise()'s default regrouping, stated explicitly so
    # the result is grouped as before and no message is printed.
    summarise(count = n(), .groups = "drop_last")
}

# Base R approach: aggregate a dummy column `n` (one value per row) by every
# column of df1, using length() so that `n` becomes the per-combination count.
cathG <- function() {
  # seq_len() is the safe form of 1:nrow(df1): it yields an empty sequence,
  # not c(1, 0), when df1 has no rows.
  aggregate(cbind(n = seq_len(nrow(df1))) ~ ., df1, length)
}

# Base R approach: paste each row of df1 into one space-separated key, then
# tabulate the keys and return the table as a data frame (comb, Freq).
docendoD <- function() {
  row_keys <- do.call(paste, df1)
  as.data.frame(table(comb = row_keys))
}

# Base R approach: build a comma-separated key for each row of df1 with
# apply(), then tabulate the keys.
deena <- function() {
  table(
    apply(df1, MARGIN = 1, FUN = function(row) paste(row, collapse = ","))
  )
}

Here are the microbenchmark results

library(microbenchmark)
# Run each approach 20 times and report timings relative to the fastest.
microbenchmark(
  akrunDT(),
  akrunDplyr(),
  cathG(),
  docendoD(),
  deena(),
  unit = "relative",
  times = 20L
)
# Unit: relative
# expr min lq mean median uq max neval cld
# akrunDT() 1.000000 1.000000 1.000000 1.00000 1.000000 1.0000000 20 a
# akrunDplyr() 1.512354 1.523357 1.307724 1.45907 1.365928 0.7539773 20 a
# cathG() 43.893946 43.592062 37.008677 42.10787 38.556726 17.9834245 20 c
# docendoD() 18.778534 19.843255 16.560827 18.85707 17.296812 8.2688541 20 b
# deena() 90.391417 89.449547 74.607662 85.16295 77.316143 34.6962954 20 d

count number of combinations by group

Create a "combination" column in summarise; we can then count the values in this column.

An easy way to make identical sets of categories match is to sort them first, so that every id lists its categories in the same order.

library(dplyr)

# Sort so every id lists its categories in the same order, collapse each
# id's categories into a single "-"-separated key, then tally the keys.
dd %>%
  arrange(id, cat) %>%
  group_by(id) %>%
  summarize(
    combination = paste0(cat, collapse = "-"),
    .groups = "drop"
  ) %>%
  count(combination)

# A tibble: 3 x 2
# combination n
# <chr> <int>
# 1 c-d-f 1
# 2 c-f 2
# 3 d-f 2

How to find all combinations in column and count occurrences in data

If I have understood you correctly, you need to group_by PersonID, paste together all the unique Animals in each group, and count the number of occurrences of that combination. The count can be obtained by taking the number of rows in the group (n()) and dividing it by the number of distinct values (n_distinct).

library(dplyr)

# Per PersonID: concatenate the distinct animals into one key, and divide the
# group's row count by its number of distinct animals.
df %>%
  group_by(PersonID) %>%
  summarise(
    AnimalComb = paste0(unique(Animal), collapse = ""),
    CountbyID = n() / n_distinct(Animal)
  )

# PersonID AnimalComb CountbyID
# <int> <chr> <dbl>
#1 1 DogBird 1
#2 2 SnakeSpider 1
#3 3 Cat 1
#4 4 CatDog 1

count unique combinations of variable values in an R dataframe column

An option with tidyverse: group by 'id', paste the 'status' values together, and count the resulting strings

library(dplyr)
library(stringr)
# Concatenate each id's status values into one string, then tally how many
# ids share each string.
df %>%
  group_by(id) %>%
  summarize(status = str_c(status, collapse = "")) %>%
  count(status)
# A tibble: 4 x 2
# status n
# <chr> <int>
#1 abc 2
#2 b 1
#3 bc 2
#4 bcd 2

How do you count all possible answer combinations

We may use combn

# For every pair of columns of DF, sum the element-wise `&` of the two
# columns, and label each result "<first>_<second>".
v1 <- setNames(
  combn(DF, 2, FUN = function(cols) sum(Reduce(`&`, cols))),
  combn(names(DF), 2, FUN = paste, collapse = "_")
)

-output

> v1
Var1_Var2 Var1_Var3 Var1_Var4 Var1_Var5 Var1_Var6 Var1_Var7 Var1_Var8 Var2_Var3 Var2_Var4 Var2_Var5 Var2_Var6 Var2_Var7 Var2_Var8 Var3_Var4 Var3_Var5
3 3 3 2 3 3 1 3 3 2 3 3 1 3 2
Var3_Var6 Var3_Var7 Var3_Var8 Var4_Var5 Var4_Var6 Var4_Var7 Var4_Var8 Var5_Var6 Var5_Var7 Var5_Var8 Var6_Var7 Var6_Var8 Var7_Var8
3 4 1 2 3 3 1 2 2 1 3 1 1

If we need combinations of 2 to 5 columns, use lapply

# Repeat the calculation for combinations of 2, 3, 4 and 5 columns; lst1
# holds one named vector per combination size.
lst1 <- lapply(2:5, function(m) {
  counts <- combn(DF, m, FUN = function(cols) sum(Reduce(`&`, cols)))
  setNames(counts, combn(names(DF), m, FUN = paste, collapse = "_"))
})

Complete with all combinations after counting on data.table

Here is one possible way to solve your problem. Note that the argument with=FALSE in the data.table context allows you to select columns using the standard data.frame rules. In the example below, I assume that the columns used to compute all combinations are passed to myfun as a character vector.
Keep in mind that no column in your dataset should be named gcases. Using .EACHI in by performs the operation once for each row of i.

# Count the rows of `d` for every combination of values in the columns `g`,
# including combinations with no matching rows in `d`.
#
# d: a data.frame or data.table.
# g: character vector naming the columns to combine.
#
# NOTE(review): setDT() converts and keys `d` by reference, so the caller's
# object may be affected as well; confirm before relying on `d` afterwards.
myfun <- function(d, g) {
  # Candidate values per column: all levels for factors, otherwise the
  # unique values that occur.
  candidate_values <- function(x) {
    if (is.factor(x)) levels(x) else unique(x)
  }

  setDT(d, key = g)
  gcases <- lapply(d[, g, with = FALSE], candidate_values)

  # Cross-join the candidate values and count the rows of d matching each
  # combination (one result row per row of the cross join).
  d[do.call(CJ, gcases), .N, keyby = .EACHI]
}

R List combinations of items with count of these

I use a ; separator because it seems nicer, but here is a dplyr version:

library(dplyr)
# Per Account, join the sorted products into one ";"-separated key, then
# tally how many accounts share each key.
df %>%
  group_by(Account) %>%
  summarize(combo = paste(sort(Product), collapse = ";"), .groups = "drop") %>%
  # Name the tally column "Count" (count() would otherwise call it "n") so
  # the result matches the output shown below.
  count(combo, name = "Count")
# # A tibble: 3 × 2
# combo Count
# <chr> <int>
# 1 a;b 1
# 2 a;b;c 1
# 3 a;c 2

Using this data:

# Example data: one row per (Account, Product) pair.
# header = TRUE is spelled out because `T` is an ordinary variable that can
# be reassigned.
df <- read.table(text = ' Account  Product 
1 a
1 b
1 c
2 a
2 c
3 a
3 c
4 a
4 b', header = TRUE)

Count unique combinations in and summarize other columns in new one

We could return the values as a list column

library(data.table)
# Per unique (a, b, c) combination: the row count, plus the group's `d`
# values collected into a single list-column cell.
dt[, list(N = .N, new_col = list(d)), by = list(a, b, c)]
# a b c N new_col
# <char> <char> <char> <int> <list>
# 1: 1a 1b 1c 2 n1,n2
# 2: 2a 2b 2c 4 n1,n2,n3,n4


Related Topics



Leave a reply



Submit