Table of Interactions - Case With Pets and Houses

Table of Interactions - Case with pets and houses

Use table and crossprod:

out <- crossprod(table(houses, animals))
out[lower.tri(out, diag=TRUE)] <- NA
out
# animals
# animals cat dog rat snake spider
# cat NA 1 2 1 1
# dog NA NA 0 0 0
# rat NA NA NA 1 1
# snake NA NA NA NA 1
# spider NA NA NA NA NA

Since the output is a matrix you can suppress the printing of the NA values directly in print:

print(out,na.print="")
# animals
# animals cat dog rat snake spider
# cat 1 2 1 1
# dog 0 0 0
# rat 1 1
# snake 1
# spider

Table of interactions - case with pets and houses 2

Make all values > 0 from table equal to "1" before using crossprod:

(table(houses, animals) > 0) *1
# animals
# houses cat dog rat spider
# 1 1 1 0 0
# 2 1 0 0 0
# 3 0 1 0 0
# 4 1 0 1 1
# 5 1 0 1 0
# 6 1 0 0 0

out <- crossprod((table(houses, animals) > 0) *1)
out[lower.tri(out, diag=TRUE)] <- NA
as.table(out)
# animals
# animals cat dog rat spider
# cat 1 2 1
# dog 0 0
# rat 1
# spider

To get to the desired output, since we know that the first column and the last row will be empty, and since you already figured out on your own that as.table would take care of not printing the NA values, continuing from above, you can do:

out <- as.table(out[-nrow(out), -1])
out
# animals
# animals dog rat spider
# cat 1 2 1
# dog 0 0
# rat 1

Create a table with all possible interactions (2-way and 3-way)

This should get you both Frequency and Revenue - I'm assuming you want to combine each customer's order into a combination:

require(data.table); setDT(DFI)

DFI[order(Product)
][, .(Combination= paste(Product, collapse=", "), Revenue = sum(Revenue)) , by=.(Customer)
][, .(.N, Revenue= sum(Revenue)), by=.(Combination)]

Combination N Revenue
1: Rice, Sweet Potato, Walnut 1 16
2: Rice, Walnut 1 5
3: Rice 2 5
4: Rice, Sweet Potato 1 5
5: Sweet Potato, Walnut 2 17
6: Sweet Potato 1 4
7: Walnut 1 7

You might find it helpful to look at each chained statement one at a time to see what's happening at each step. The only specific thing I'll mention is that we start with DFI[order(Product)] to make sure that our generated combinations are consistent, so we don't end up with "Rice, Potato" and "Potato, Rice"

How can I count the number of times one item has been grouped together with another in R?

We could use base R methods, use table to get the frequency, do a crossprod, set the diagonal and lower triangle elements to NA and remove the NA rows after converting to data.frame

m1 <- crossprod(table(df1))
m1[lower.tri(m1, diag = TRUE)] <- NA
subset(as.data.frame.table(m1), !is.na(Freq))
# Country Country.1 Freq
#4 DE FI 1
#7 DE SE 2
#8 FI SE 2

data

df1 <- structure(list(Group = c("Group1", "Group1", "Group2", "Group2", 
"Group2", "Group3", "Group3"), Country = c("SE", "DE", "SE",
"DE", "FI", "SE", "FI")), .Names = c("Group", "Country"),
class = "data.frame", row.names = c(NA, -7L))

Get all combinations of shared elements in a list

We can use outer

outer(names(terms), names(terms), FUN = function(x,y) 
lengths(Map(intersect, terms[x], terms[y])))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2

Or more compactly

outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
# Item1 Item2 Item3 Item4
#Item1 4 1 1 0
#Item2 1 4 1 0
#Item3 1 1 4 0
#Item4 0 0 0 2

We could also implement this in Rcpp. Below is the test1.cpp file

#include <Rcpp.h>
#include <math.h>

using namespace Rcpp;
//[[Rcpp::export]]

List foo(List xs) {
List x(xs);
List x1 = Rcpp::clone(xs);
List y1 = Rcpp::clone(xs);
int n = x1.size();



NumericVector res;


for( int i=0; i<n; i++){
for(int j=0; j<n; j++){
CharacterVector xd = x1[i];
CharacterVector yd = y1[j];

res.push_back(intersect(xd, yd).length());
}
}
return wrap(res) ;

We call it in R using

library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2

Benchmarks

In addition to the functions above, we included another version with a RcppEigen implementation that was posted here

n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))

library(Matrix)
library(inline)
library(Rcpp)

alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <- function() {Matrix::crossprod(xtabs( ~ values + ind,
stack(terms1), sparse = TRUE)) }

akrun1 <- function(){outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
funCPr(tbl, tbl)[[1]]}

db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
sapply(terms1, function(a)
sum(unlist(terms1[i]) %in% unlist(a)))))}
lmo <- function() { setNames(data.frame(t(combn(names(terms1), 2)),
combn(seq_along(terms1), 2,
function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
c("col1", "col2", "counts"))}

and the benchmark output for n at 100 are

library(microbenchmark)
microbenchmark(alexis1(), alexis2(), akrun1(), akrun2(),akrun3(), db(), lmo(),
unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.035975 1.032101 1.031239 1.010472 1.044217 1.129092 10 a
# alexis2() 3.896928 3.656585 3.461980 3.386301 3.335469 3.288161 10 a
# akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661 10 d
# akrun2() 84.239272 79.073087 88.594414 75.719853 78.277769 129.731990 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 86.921164 82.201117 80.358097 75.113471 73.311414 105.761977 10 b
# lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212 10 c

With a slightly higher n at 200

n <- 200
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))

microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.117234 1.164198 1.181280 1.166070 1.230077 1.229899 10 a
# alexis2() 3.428904 3.425942 3.337112 3.379675 3.280729 3.164852 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635 10 c

With n set at 9000

n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(), akrun3(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329 10 b
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927 10 c
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a

Checking the output

res1 <- alexis1()
res2 <- akrun3()
res3 <- alexis2()
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
all.equal(res1, as.matrix(res3), check.attributes = FALSE)
#[1] TRUE

Based on the comments from @alexis_laz included 3 more functions to replace the table/stack part to compare the efficiency for n at 9000

alexis3 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
crossprod(tab, tab)
}

alexis4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)

tab = sparseMatrix(x = 1L, i = match(unlt, u),
j = rep(seq_along(terms1), lengths(terms1)), dimnames = list(u, names(terms1)))

Matrix::crossprod(tab, tab, sparse = TRUE)
}

akrun4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
funCPr(tab, tab)[[1]]
}

and the benchmarks are

microbenchmark(alexis1(),alexis2(), alexis3(), alexis4(),
akrun3(), akrun4(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.1888254 2.2897883 2.204237 2.169618 2.162955 2.122552 10 b
# alexis2() 3.7651292 3.9178071 3.672550 3.616577 3.587886 3.426039 10 c
# alexis3() 2.1776887 2.2410663 2.197293 2.137106 2.192834 2.241645 10 b
# alexis4() 4.1640895 4.3431379 4.262192 4.187449 4.388335 4.172607 10 d
# akrun3() 1.0000000 1.0000000 1.000000 1.000000 1.000000 1.000000 10 a
# akrun4() 0.9364288 0.9692772 1.043292 1.063931 1.090301 1.171245 10 a

Get number of same individuals for different groups

Here's my version:

# size-1 IDs can't contribute; skip
DT[ , if (.N > 1)
# simplify = FALSE returns a list;
# transpose turns the 3-length list of 2-length vectors
# into a length-2 list of 3-length vectors (efficiently)
transpose(combn(Group, 2L, simplify = FALSE)), by = ID
][ , .(Sum = .N), keyby = .(Group.1 = V1, Group.2 = V2)]

With output:

#    Group.1 Group.2 Sum
# 1: A B 2
# 2: A C 3
# 3: B C 3

R: find most frequent combinations within same id

For this one, I reached for data.table, but you could use tidyr just as easily.

library(data.table)
set.seed(213) # set seed
d <- data.table(ID = rep(1:3, each = 3), drug = paste0("drug", sample(1:5, 9, rep = T)))

get_combs <- function(x, n = 2){
uniq_x <- sort(unique(x))
if(length(uniq_x) < n){
return(NULL)
} else {
return(as.data.frame(t(combn(uniq_x, n)), stringsAsFactors = FALSE))
}

}

combi <- d[, get_combs(drug), by = ID][order(V1, V2),]
combi[ , .N, by = .(V1, V2)]

V1 V2 N
1: drug1 drug2 2
2: drug1 drug4 2
3: drug2 drug4 2
4: drug3 drug5 1

Finding Number of Groups that Contain Specific Pairs in Data Frame

We merge the dataset with itself by 'name', sort the 'value' columns by 'row', convert the dataset to data.table, remove the rows with the same 'value' elements, grouped by the 'value' columns, get the nrow (.N) and divide by 2.

d1 <- merge(df, df, by.x='name', by.y='name')
d1[-1] <- t(apply(d1[-1], 1, sort))
library(data.table)
setDT(d1)[value.x!=value.y][,.N/2 ,.(value.x, value.y)]
# value.x value.y V1
#1: e f 2
#2: e g 1
#3: f g 1

Or using a similar method as in @jeremycg's post

 setDT(df)[df, on='name', allow.cartesian=TRUE
][as.character(value)< as.character(i.value), .N, .(value, i.value)]


Related Topics



Leave a reply



Submit