Table of Interactions - Case with pets and houses
Use table
and crossprod
:
out <- crossprod(table(houses, animals))
out[lower.tri(out, diag=TRUE)] <- NA
out
# animals
# animals cat dog rat snake spider
# cat NA 1 2 1 1
# dog NA NA 0 0 0
# rat NA NA NA 1 1
# snake NA NA NA NA 1
# spider NA NA NA NA NA
Since the output is a matrix
you can suppress the printing of the NA
values directly in print
:
print(out,na.print="")
# animals
# animals cat dog rat snake spider
# cat 1 2 1 1
# dog 0 0 0
# rat 1 1
# snake 1
# spider
Table of interactions - case with pets and houses 2
Make all values > 0
from table
equal to "1" before using crossprod
:
(table(houses, animals) > 0) *1
# animals
# houses cat dog rat spider
# 1 1 1 0 0
# 2 1 0 0 0
# 3 0 1 0 0
# 4 1 0 1 1
# 5 1 0 1 0
# 6 1 0 0 0
out <- crossprod((table(houses, animals) > 0) *1)
out[lower.tri(out, diag=TRUE)] <- NA
as.table(out)
# animals
# animals cat dog rat spider
# cat 1 2 1
# dog 0 0
# rat 1
# spider
To get to the desired output, since we know that the first column and the last row will be empty, and since you already figured out on your own that as.table
would take care of not printing the NA
values, continuing from above, you can do:
out <- as.table(out[-nrow(out), -1])
out
# animals
# animals dog rat spider
# cat 1 2 1
# dog 0 0
# rat 1
Create a table with all possible interactions (2-way and 3-way)
This should get you both Frequency and Revenue - I'm assuming you want to combine each customer's order into a combination:
require(data.table); setDT(DFI)
DFI[order(Product)
][, .(Combination= paste(Product, collapse=", "), Revenue = sum(Revenue)) , by=.(Customer)
][, .(.N, Revenue= sum(Revenue)), by=.(Combination)]
Combination N Revenue
1: Rice, Sweet Potato, Walnut 1 16
2: Rice, Walnut 1 5
3: Rice 2 5
4: Rice, Sweet Potato 1 5
5: Sweet Potato, Walnut 2 17
6: Sweet Potato 1 4
7: Walnut 1 7
You might find it helpful to look at each chained statement one at a time to see what's happening at each step. The only specific thing I'll mention is that we start with DFI[order(Product)]
to make sure that our generated combinations are consistent, so we don't end up with "Rice, Potato" and "Potato, Rice"
How can I count the number of times one item has been grouped together with another in R?
We could use base R
methods, use table
to get the frequency, do a crossprod
, set the diagonal and lower triangle elements to NA
and remove the NA
rows after converting to data.frame
m1 <- crossprod(table(df1))
m1[lower.tri(m1, diag = TRUE)] <- NA
subset(as.data.frame.table(m1), !is.na(Freq))
# Country Country.1 Freq
#4 DE FI 1
#7 DE SE 2
#8 FI SE 2
data
df1 <- structure(list(Group = c("Group1", "Group1", "Group2", "Group2",
"Group2", "Group3", "Group3"), Country = c("SE", "DE", "SE",
"DE", "FI", "SE", "FI")), .Names = c("Group", "Country"),
class = "data.frame", row.names = c(NA, -7L))
Get all combinations of shared elements in a list
We can use outer
outer(names(terms), names(terms), FUN = function(x,y)
lengths(Map(intersect, terms[x], terms[y])))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
Or more compactly
outer(terms, terms, FUN = function(...) lengths(Map(intersect, ...)))
# Item1 Item2 Item3 Item4
#Item1 4 1 1 0
#Item2 1 4 1 0
#Item3 1 1 4 0
#Item4 0 0 0 2
We could also implement this in Rcpp
. Below is the test1.cpp
file
#include <Rcpp.h>
#include <math.h>
using namespace Rcpp;
//[[Rcpp::export]]
List foo(List xs) {
List x(xs);
List x1 = Rcpp::clone(xs);
List y1 = Rcpp::clone(xs);
int n = x1.size();
NumericVector res;
for( int i=0; i<n; i++){
for(int j=0; j<n; j++){
CharacterVector xd = x1[i];
CharacterVector yd = y1[j];
res.push_back(intersect(xd, yd).length());
}
}
return wrap(res) ;
We call it in R
using
library(Rcpp)
sourceCpp("test1.cpp")
`dim<-`(unlist(foo(terms)), c(4, 4))
# [,1] [,2] [,3] [,4]
#[1,] 4 1 1 0
#[2,] 1 4 1 0
#[3,] 1 1 4 0
#[4,] 0 0 0 2
Benchmarks
In addition to the functions above, we included another version with a RcppEigen
implementation that was posted here
n <- 100
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
library(Matrix)
library(inline)
library(Rcpp)
alexis1 <- function() {crossprod(table(stack(terms1)))}
alexis2 <- function() {Matrix::crossprod(xtabs( ~ values + ind,
stack(terms1), sparse = TRUE)) }
akrun1 <- function(){outer(terms1, terms1, FUN = function(...) lengths(Map(intersect, ...)))}
akrun2 <- function() {`dim<-`(unlist(foo(terms1)), c(n, n))}
akrun3 <- function() {tbl <- table(stack(terms1))
funCPr(tbl, tbl)[[1]]}
db <- function() {do.call(rbind, lapply(1:length(terms1), function(i)
sapply(terms1, function(a)
sum(unlist(terms1[i]) %in% unlist(a)))))}
lmo <- function() { setNames(data.frame(t(combn(names(terms1), 2)),
combn(seq_along(terms1), 2,
function(x) length(intersect(terms1[[x[1]]], terms1[[x[2]]])))),
c("col1", "col2", "counts"))}
and the benchmark output for n at 100
are
library(microbenchmark)
microbenchmark(alexis1(), alexis2(), akrun1(), akrun2(),akrun3(), db(), lmo(),
unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.035975 1.032101 1.031239 1.010472 1.044217 1.129092 10 a
# alexis2() 3.896928 3.656585 3.461980 3.386301 3.335469 3.288161 10 a
# akrun1() 218.456708 207.099841 198.391784 189.356065 188.542712 214.415661 10 d
# akrun2() 84.239272 79.073087 88.594414 75.719853 78.277769 129.731990 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 86.921164 82.201117 80.358097 75.113471 73.311414 105.761977 10 b
# lmo() 125.128109 123.203318 118.732911 113.271352 113.164333 138.075212 10 c
With a slightly higher n at 200
n <- 200
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(), alexis2(), akrun3(), db(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 1.117234 1.164198 1.181280 1.166070 1.230077 1.229899 10 a
# alexis2() 3.428904 3.425942 3.337112 3.379675 3.280729 3.164852 10 b
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
# db() 219.971285 219.577403 207.793630 213.232359 196.122420 187.433635 10 c
With n set at 9000
n <- 9000
set.seed(24)
terms1 <- setNames(replicate(n, sample(letters, sample(10),
replace = TRUE)), paste0("Item", seq_len(n)))
microbenchmark(alexis1(),alexis2(), akrun3(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.048708 2.021709 2.009396 2.085750 2.141060 1.767329 10 b
# alexis2() 3.520220 3.518339 3.419368 3.616512 3.515993 2.952927 10 c
# akrun3() 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 10 a
Checking the output
res1 <- alexis1()
res2 <- akrun3()
res3 <- alexis2()
all.equal(res1, res2, check.attributes = FALSE)
#[1] TRUE
all.equal(res1, as.matrix(res3), check.attributes = FALSE)
#[1] TRUE
Based on the comments from @alexis_laz included 3 more functions to replace the table/stack
part to compare the efficiency for n at 9000
alexis3 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
crossprod(tab, tab)
}
alexis4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = sparseMatrix(x = 1L, i = match(unlt, u),
j = rep(seq_along(terms1), lengths(terms1)), dimnames = list(u, names(terms1)))
Matrix::crossprod(tab, tab, sparse = TRUE)
}
akrun4 <- function() {
unlt = unlist(terms1, use.names = FALSE)
u = unique(unlt)
tab = matrix(0L, length(u), length(terms1), dimnames = list(u, names(terms1)))
tab[cbind(match(unlt, u), rep(seq_along(terms1), lengths(terms1)))] = 1L
funCPr(tab, tab)[[1]]
}
and the benchmarks are
microbenchmark(alexis1(),alexis2(), alexis3(), alexis4(),
akrun3(), akrun4(), unit = "relative", times = 10L)
#Unit: relative
# expr min lq mean median uq max neval cld
# alexis1() 2.1888254 2.2897883 2.204237 2.169618 2.162955 2.122552 10 b
# alexis2() 3.7651292 3.9178071 3.672550 3.616577 3.587886 3.426039 10 c
# alexis3() 2.1776887 2.2410663 2.197293 2.137106 2.192834 2.241645 10 b
# alexis4() 4.1640895 4.3431379 4.262192 4.187449 4.388335 4.172607 10 d
# akrun3() 1.0000000 1.0000000 1.000000 1.000000 1.000000 1.000000 10 a
# akrun4() 0.9364288 0.9692772 1.043292 1.063931 1.090301 1.171245 10 a
Get number of same individuals for different groups
Here's my version:
# size-1 IDs can't contribute; skip
DT[ , if (.N > 1)
# simplify = FALSE returns a list;
# transpose turns the 3-length list of 2-length vectors
# into a length-2 list of 3-length vectors (efficiently)
transpose(combn(Group, 2L, simplify = FALSE)), by = ID
][ , .(Sum = .N), keyby = .(Group.1 = V1, Group.2 = V2)]
With output:
# Group.1 Group.2 Sum
# 1: A B 2
# 2: A C 3
# 3: B C 3
R: find most frequent combinations within same id
For this one, I reached for data.table, but you could use tidyr
just as easily.
library(data.table)
set.seed(213) # set seed
d <- data.table(ID = rep(1:3, each = 3), drug = paste0("drug", sample(1:5, 9, rep = T)))
get_combs <- function(x, n = 2){
uniq_x <- sort(unique(x))
if(length(uniq_x) < n){
return(NULL)
} else {
return(as.data.frame(t(combn(uniq_x, n)), stringsAsFactors = FALSE))
}
}
combi <- d[, get_combs(drug), by = ID][order(V1, V2),]
combi[ , .N, by = .(V1, V2)]
V1 V2 N
1: drug1 drug2 2
2: drug1 drug4 2
3: drug2 drug4 2
4: drug3 drug5 1
Finding Number of Groups that Contain Specific Pairs in Data Frame
We merge
the dataset with itself by 'name', sort
the 'value' columns by 'row', convert the dataset to data.table
, remove the rows with the same 'value' elements, grouped by the 'value' columns, get the nrow (.N
) and divide by 2.
d1 <- merge(df, df, by.x='name', by.y='name')
d1[-1] <- t(apply(d1[-1], 1, sort))
library(data.table)
setDT(d1)[value.x!=value.y][,.N/2 ,.(value.x, value.y)]
# value.x value.y V1
#1: e f 2
#2: e g 1
#3: f g 1
Or using a similar method as in @jeremycg's post
setDT(df)[df, on='name', allow.cartesian=TRUE
][as.character(value)< as.character(i.value), .N, .(value, i.value)]
Related Topics
How R Formats Posixct With Fractional Seconds
Read All Files in Directory and Apply Multiple Functions to Each Data Frame
How to Perform Natural (Lexicographic) Sorting in R
Dummy Variables from a String Variable
Dplyr Mutate/Replace Several Columns on a Subset of Rows
How to Make Execution Pause, Sleep, Wait For X Seconds in R
Compute Mean and Standard Deviation by Group For Multiple Variables in a Data.Frame
Pass Arguments to Dplyr Functions
Split Date-Time Column into Date and Time Variables
Create a Data.Frame Where a Column Is a List
Horizontal/Vertical Line in Plotly
How to Order Data by Value Within Ggplot Facets
Editing Legend (Text) Labels in Ggplot
Unique on a Dataframe With Only Selected Columns
Windows 7, Update.Packages Problem: "Unable to Move Temporary Installation"