Table of Interactions - Case With Pets and Houses

Table of Interactions - Case with pets and houses

Use table and crossprod:

out <- crossprod(table(houses, animals))
out[lower.tri(out, diag=TRUE)] <- NA
out
#         animals
# animals  cat dog rat snake spider
#   cat     NA   1   2     1      1
#   dog     NA  NA   0     0      0
#   rat     NA  NA  NA     1      1
#   snake   NA  NA  NA    NA      1
#   spider  NA  NA  NA    NA     NA

Since the output is a matrix you can suppress the printing of the NA values directly in print:

print(out,na.print="")
#         animals
# animals  cat dog rat snake spider
#   cat          1   2     1      1
#   dog              0     0      0
#   rat                    1      1
#   snake                         1
#   spider

Table of interactions - case with pets and houses 2

Make all values > 0 from table equal to "1" before using crossprod:

(table(houses, animals) > 0) *1
#       animals
# houses cat dog rat spider
#      1   1   1   0      0
#      2   1   0   0      0
#      3   0   1   0      0
#      4   1   0   1      1
#      5   1   0   1      0
#      6   1   0   0      0

out <- crossprod((table(houses, animals) > 0) *1)
out[lower.tri(out, diag=TRUE)] <- NA
as.table(out)
#         animals
# animals  cat dog rat spider
#   cat          1   2      1
#   dog              0      0
#   rat                     1
#   spider

To get to the desired output, since we know that the first column and the last row will be empty, and since you already figured out on your own that as.table would take care of not printing the NA values, continuing from above, you can do:

out <- as.table(out[-nrow(out), -1])
out
#        animals
# animals dog rat spider
#     cat   1   2      1
#     dog       0      0
#     rat              1

Create a table with all possible interactions (2-way and 3-way)

This should get you both Frequency and Revenue - I'm assuming you want to combine each customer's order into a combination:

require(data.table); setDT(DFI)

DFI[order(Product)
  ][, .(Combination= paste(Product, collapse=", "), Revenue = sum(Revenue)) , by=.(Customer)
  ][, .(.N, Revenue= sum(Revenue)), by=.(Combination)]

                  Combination N Revenue
1: Rice, Sweet Potato, Walnut 1      16
2:               Rice, Walnut 1       5
3:                       Rice 2       5
4:         Rice, Sweet Potato 1       5
5:       Sweet Potato, Walnut 2      17
6:               Sweet Potato 1       4
7:                     Walnut 1       7

You might find it helpful to look at each chained statement one at a time to see what's happening at each step. The only specific thing I'll mention is that we start with DFI[order(Product)] to make sure that our generated combinations are consistent, so we don't end up with "Rice, Potato" and "Potato, Rice"

How can I count the number of times one item has been grouped together with another in R?

We could use base R methods, use table to get the frequency, do a crossprod, set the diagonal and lower triangle elements to NA and remove the NA rows after converting to data.frame

m1 <- crossprod(table(df1))
m1[lower.tri(m1, diag = TRUE)] <- NA
subset(as.data.frame.table(m1), !is.na(Freq))
#    Country Country.1 Freq
#4      DE        FI    1
#7      DE        SE    2
#8      FI        SE    2

data

df1 <- structure(list(Group = c("Group1", "Group1", "Group2", "Group2", 
"Group2", "Group3", "Group3"), Country = c("SE", "DE", "SE", 
"DE", "FI", "SE", "FI")), .Names = c("Group", "Country"),
 class = "data.frame", row.names = c(NA, -7L))

Get all combinations of shared elements in a list

We can use outer

outer(names(terms), names(terms), FUN = function(x,y) 
              lengths(Map(intersect, terms[x], terms[y])))
#     [,1] [,2] [,3] [,4]
#[1,]    4    1    1    0
#[2,]    1    4    1    0
#[3,]    1    1    4    0
#[4,]    0    0    0    2

Or more compactly

outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
#      Item1 Item2 Item3 Item4
#Item1     4     1     1     0
#Item2     1     4     1     0
#Item3     1     1     4     0
#Item4     0     0     0     2

We could also implement this in Rcpp. Below is the test1.cpp file

#include <Rcpp.h>
#include <math.h>

using namespace Rcpp;
//[[Rcpp::export]]

List foo(List xs) {
    List x(xs);
    List x1 = Rcpp::clone(xs);
    List y1 = Rcpp::clone(xs);
    int n = x1.size();



    NumericVector res;


    for( int i=0; i<n; i++){
        for(int j=0; j<n; j++){
         CharacterVector xd = x1[i];
         CharacterVector yd = y1[j];

        res.push_back(intersect(xd, yd).length());
        }
    }
    return wrap(res) ;

We call it in R using

library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
#     [,1] [,2] [,3] [,4]
#[1,]    4    1    1    0
#[2,]    1    4    1    0
#[3,]    1    1    4    0
#[4,]    0    0    0    2

Benchmarks

In addition to the functions above, we included another version with a RcppEigen implementation that was posted here

n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10), 
         replace = TRUE)), paste0("Item", seq_len(n)))

library(Matrix)
library(inline)
library(Rcpp)

alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <-  function() {Matrix::crossprod(xtabs( ~ values + ind, 
            stack(terms1), sparse = TRUE)) }

akrun1 <- function(){outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
                      funCPr(tbl, tbl)[[1]]}

db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
    sapply(terms1, function(a)
        sum(unlist(terms1[i]) %in% unlist(a)))))} 
lmo <- function() { setNames(data.frame(t(combn(names(terms1), 2)),
                      combn(seq_along(terms1), 2,
                            function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
         c("col1", "col2", "counts"))}

and the benchmark output for n at 100 are

library(microbenchmark)
microbenchmark(alexis1(), alexis2(),   akrun1(), akrun2(),akrun3(), db(), lmo(),
           unit = "relative", times = 10L)
#Unit: relative
#      expr        min         lq       mean     median         uq        max neval  cld
# alexis1()   1.035975   1.032101   1.031239   1.010472   1.044217   1.129092    10 a   
# alexis2()   3.896928   3.656585   3.461980   3.386301   3.335469   3.288161    10 a   
#  akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661    10    d
#  akrun2()  84.239272  79.073087  88.594414  75.719853  78.277769 129.731990    10  b  
#  akrun3()   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000    10 a   
#      db()  86.921164  82.201117  80.358097  75.113471  73.311414 105.761977    10  b  
#     lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212    10   c

With a slightly higher n at 200

n <- 200
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
      replace = TRUE)), paste0("Item", seq_len(n)))

microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L)
#Unit: relative
#      expr        min         lq       mean     median         uq        max neval cld
# alexis1()   1.117234   1.164198   1.181280   1.166070   1.230077   1.229899    10 a  
# alexis2()   3.428904   3.425942   3.337112   3.379675   3.280729   3.164852    10  b 
#  akrun3()   1.000000   1.000000   1.000000   1.000000   1.000000   1.000000    10 a  
#      db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635    10   c

With n set at 9000

n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10), 
                replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(),  akrun3(), unit = "relative", times = 10L)
#Unit: relative
#     expr      min       lq     mean   median       uq      max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329    10  b 
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927    10   c
#  akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000    10 a

Checking the output

res1 <- alexis1()
res2 <- akrun3()
res3 <- alexis2()
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
all.equal(res1, as.matrix(res3), check.attributes = FALSE)
#[1] TRUE

Based on the comments from @alexis_laz included 3 more functions to replace the table/stack part to compare the efficiency for n at 9000

alexis3 <- function() {
    unlt = unlist(terms1, use.names = FALSE)
    u = unique(unlt)
    tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
    tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
    crossprod(tab, tab)
    }

alexis4 <- function() {
        unlt = unlist(terms1, use.names = FALSE)
        u = unique(unlt)

       tab = sparseMatrix(x = 1L, i = match(unlt, u),
           j = rep(seq_along(terms1), lengths(terms1)), dimnames = list(u, names(terms1)))

       Matrix::crossprod(tab, tab, sparse = TRUE)
       }

akrun4 <- function() {
        unlt = unlist(terms1, use.names = FALSE)
        u = unique(unlt)
        tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
        tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
        funCPr(tab, tab)[[1]]
      }

and the benchmarks are

microbenchmark(alexis1(),alexis2(), alexis3(), alexis4(),
         akrun3(), akrun4(),  unit = "relative", times = 10L)
#Unit: relative
#      expr       min        lq     mean   median       uq      max neval  cld
# alexis1() 2.1888254 2.2897883 2.204237 2.169618 2.162955 2.122552    10  b  
# alexis2() 3.7651292 3.9178071 3.672550 3.616577 3.587886 3.426039    10   c 
# alexis3() 2.1776887 2.2410663 2.197293 2.137106 2.192834 2.241645    10  b  
# alexis4() 4.1640895 4.3431379 4.262192 4.187449 4.388335 4.172607    10    d
#  akrun3() 1.0000000 1.0000000 1.000000 1.000000 1.000000 1.000000    10 a   
#  akrun4() 0.9364288 0.9692772 1.043292 1.063931 1.090301 1.171245    10 a

Get number of same individuals for different groups

Here's my version:

# size-1 IDs can't contribute; skip
DT[ , if (.N > 1) 
  # simplify = FALSE returns a list;
  #   transpose turns the 3-length list of 2-length vectors
  #   into a length-2 list of 3-length vectors (efficiently)
  transpose(combn(Group, 2L, simplify = FALSE)), by = ID
  ][ , .(Sum = .N), keyby = .(Group.1 = V1, Group.2 = V2)]

With output:

#    Group.1 Group.2 Sum
# 1:       A       B   2
# 2:       A       C   3
# 3:       B       C   3

R: find most frequent combinations within same id

For this one, I reached for data.table, but you could use tidyr just as easily.

library(data.table)
set.seed(213) # set seed
d <- data.table(ID = rep(1:3, each = 3), drug = paste0("drug", sample(1:5, 9, rep = T))) 

get_combs <- function(x, n = 2){
  uniq_x <- sort(unique(x))
  if(length(uniq_x) < n){
    return(NULL)
  } else {
    return(as.data.frame(t(combn(uniq_x, n)), stringsAsFactors = FALSE))
  }

}

combi <- d[, get_combs(drug), by = ID][order(V1, V2),]
combi[ , .N, by = .(V1, V2)]

      V1    V2 N
1: drug1 drug2 2
2: drug1 drug4 2
3: drug2 drug4 2
4: drug3 drug5 1

Finding Number of Groups that Contain Specific Pairs in Data Frame

We merge the dataset with itself by 'name', sort the 'value' columns by 'row', convert the dataset to data.table, remove the rows with the same 'value' elements, grouped by the 'value' columns, get the nrow (.N) and divide by 2.

d1 <- merge(df, df, by.x='name', by.y='name')
d1[-1] <- t(apply(d1[-1], 1, sort))
library(data.table)
setDT(d1)[value.x!=value.y][,.N/2 ,.(value.x, value.y)]
#   value.x value.y V1
#1:       e       f  2
#2:       e       g  1
#3:       f       g  1

Or using a similar method as in @jeremycg's post

 setDT(df)[df, on='name', allow.cartesian=TRUE
     ][as.character(value)< as.character(i.value), .N, .(value, i.value)]