How to Calculate a Table of Pairwise Counts from Long-Form Data Frame

How to calculate a table of pairwise counts from long-form data frame

Here is a data.table approach similar to @mrdwab's.

It will work best if featureCode is a character column (not a factor).

library(data.table)

# Build a data.table from the input data
DT <- data.table(dat)
# The pairing below works best on character values, so convert first
DT[, featureCode := as.character(featureCode)]
# Keep only ids with more than one row: a single row cannot form a pair.
# Note that `:=` adds the helper column N to DT by reference.
DT2 <- DT[, N := .N, by = id][N > 1]
# For every id, build all combinations of 2 featureCode values and return
# them as a data.table with the pair members in columns `V1` and `V2`,
# then count how often each (V1, V2) pair occurs.
# `simplify = FALSE` is spelled out because `F` can be reassigned.
DT2[
  ,
  rbindlist(combn(
    featureCode, 2,
    FUN = function(x) as.data.table(as.list(x)),
    simplify = FALSE
  )),
  by = id
][, .N, by = list(V1, V2)]

     V1   V2 N
1: PPLC PCLI 3
2:  PPL PPLC 1
3:  PPL PCLI 1

Pairwise count data from long format

If the ordering in your data set is as in your example you can try this:

 library(dplyr)

 # Within each id, put every row next to the row that follows it (lead()),
 # drop rows containing NA (e.g. the last row of an id, which has no
 # successor), and count each (pos1, pos2, group1, group2) combination.
 df %>%
   group_by(id) %>%
   transmute(
     pos1 = pos, group1 = group,
     pos2 = lead(pos), group2 = lead(group)
   ) %>%
   na.omit() %>%
   ungroup() %>%
   count(pos1, pos2, group1, group2, name = "id.count")
# A tibble: 4 x 5
#   pos1  pos2 group1 group2 id.count
#  <dbl> <dbl>  <dbl>  <dbl>    <int>
#     1     2    100    200        2
#     2     3    200    100        2
#     3     4    100    200        1
#     3     4    100    300        1

Counting pairwise occurrences of characters in dataframe R

Here is a data.table approach. You can lag the V4 vector, sort and paste the lagged and original V4 to ensure that order between each pair does not matter (e.g. CD and DC are the same), then tabulate the count for each paired occurrence.

# Per V1 group: combine each V4 value with its predecessor (shift()),
# sorting the two labels first so that e.g. "CD" and "DC" give the same
# key. The first element only pairs the first row with NA, so it is
# dropped with [-1]. The second step counts each key within V1.
dat[
  ,
  list(Pairs = mapply(
    function(prev, cur) paste(sort(c(prev, cur)), collapse = ""),
    shift(V4), V4
  )[-1]),
  by = V1
][, .N, by = list(V1, Pairs)]

#     V1 Pairs N
#  1:  X   BE1 1
#  2:  X   DE1 1
#  3:  X   DE2 1
#  4:  X   BE2 1
#  5:  1  E1E2 1
#  6:  1   CE1 1
#  7:  1   CE2 1
#  8:  2   CE2 1
#  9:  2   CE3 1
# 10:  2   AE3 2
# 11:  3  E2E2 2
# 12:  3   CE2 1
# 13:  3    CC 2
# 14:  3    CD 2

data:

library(data.table)
# Sample input for the pairing code: five whitespace-separated fields per
# row and no header line. The pairing code refers to the first field as V1
# and the fourth as V4 — presumably fread()'s automatic column names.
dat <- fread("
X   73600000    73680000    B   43.6938
X   77160000    77290000    E1  38.7108
X   80450000    80630000    D   37.2528
X   8720000     8790000     E2  39.9114
X   99940000    100025000   B   41.2118
1   10000000    10100000    E2  46.975
1   100030000   100130000   E1  37.717
1   101080000   101330000   C   38.064
1   101390000   101755000   E2  37.9268
2   100430000   100870000   E2  41.8766
2   100870000   101000000   C   42.1769
2   101220000   101620000   E3  45.216
2   101345000   101440000   A   44.5705
2   101620000   101830000   E3  44.3948
3   100180000   100315000   E2  40.5067
3   100610000   100920000   E2  37.9716
3   101525000   101945000   E2  38.7479
3   101710000   101950000   C   38.2996
3   101945000   102260000   C   36.2844
3   10380000    10790000    D   49.1885
3   104320000   105090000   C   35.4865
3   106990000   107160000   C   38.3465")

How to find common element pairwise by rows from the same data frame in R

The trick is to convert to a long table that contains all name and element pairs:

Convert to long table containing columns name, variable, and element:

library(dplyr)
library(tidyr)

# Note that you can also use the native pipe |> on R >= 4.1
# The long table is stored as `longdata`, the name the duplicate search
# reads from, instead of overwriting the wide input `data`.
longdata <- data %>%
  # Add a row index identifying each original row
  mutate(name = row_number()) %>%
  # Convert from wide to long: one row per (name, variable, element)
  pivot_longer(!name, names_to = "variable", values_to = "element") %>%
  # Remove any NAs
  drop_na()

This gives an output that looks like:

  name variable element
1    1     col1      a1
2    2     col1      c2
3    3     col1      a3
4    4     col1      d4
5    1     col2      c2
6    2     col2      v5
7    3     col2      d4
8    4     col2      x6

We can then find all duplicates by grouping by element and filtering for duplicates:

# Keep only the elements that occur more than once. `variable` is dropped
# first because only name and element are needed afterwards. The result
# stays grouped by element.
dups <- longdata %>%
  select(-variable) %>%
  group_by(element) %>%
  filter(n() > 1)

All pairs can then be found with an inner join by element. This matches all entries with each other, so we have to filter out joins to the same name (e.g. (name1, name2) = (1, 1)); requiring name1 < name2 does that and also keeps only one ordering of each pair:

# Self-join on element to pair up all names sharing that element;
# name.x < name.y drops self-matches and keeps one ordering per pair.
inner_join(dups, dups, by = "element") %>%
  filter(name.x < name.y) %>%
  select(name1 = name.x, name2 = name.y, element)

How to create a new data table based on pairwise combinations of a subset of column names?

If you can use a data frame, the code below is currently the fastest and most memory-efficient approach (see the benchmark wiki).

I think the approach using combn() seems reasonable to me. And I don't really think it's iterating over the combinations 18 times, as has been purported. Moreover, I personally find this easier to read than the data table melt version, but this is probably because I'm not used to data.table syntax.

Note: using this on a data table is apparently not efficient. If you really need a data.table, r2evans's solution is better.

#' Build all pairwise combinations of the non-ID columns of a data frame
#'
#' Every pair of non-ID columns becomes one chunk of rows holding the two
#' columns side by side; the chunks are stacked and the ID columns are
#' appended (recycled once per pair).
#'
#' @param data A data frame. Columns are selected with `data[cols]`, so
#'   this is meant for plain data frames rather than data.tables.
#' @param ID.cols Character vector with the names of the ID columns. Names
#'   that do not occur in `data` are ignored.
#' @return A data frame with columns `value.left`, `value.right`, `ID.new`
#'   (the two source column names joined by " - "), followed by the ID
#'   columns of `data`.
fun2 <- function(data, ID.cols) {
  # A logical mask instead of which() + negative indexing: with no matching
  # ID column, `data[-integer(0)]` would select zero columns rather than all.
  is_id <- colnames(data) %in% ID.cols
  values <- data[!is_id]
  if (ncol(values) < 2L) {
    stop("need at least two non-ID columns to form a pair", call. = FALSE)
  }

  ## loop over the column combinations directly
  new_dat <- combn(values, 2, function(x) {
    new_x <- setNames(x, c("value.left", "value.right"))
    ## label the pair by its two source column names; rep() keeps the
    ## assignment valid for zero-row input
    new_x$ID.new <- rep(paste(names(x), collapse = " - "), nrow(x))
    new_x
  }, simplify = FALSE)

  ## bind with the original ID columns once, outside the loop (bit faster);
  ## cbind() recycles the ID columns across the stacked pairs
  cbind(do.call(rbind, new_dat), data[is_id])
}

# Example call. `DT` is not defined in this snippet — judging by the output
# below it presumably holds ID1, ID2 and NAME1..NAME3 columns (TODO confirm).
fun2(DT,ID.cols = c("ID1", "ID2"))
#>    value.left value.right        ID.new ID1 ID2
#> 1          10           7 NAME1 - NAME2   A   1
#> 2          11           9 NAME1 - NAME2   A   2
#> 3           9           8 NAME1 - NAME2   A   3
#> 4          22          20 NAME1 - NAME2   B   1
#> 5          25          22 NAME1 - NAME2   B   2
#> 6          22          21 NAME1 - NAME2   B   3
#> 7          10          10 NAME1 - NAME3   A   1
#> 8          11          12 NAME1 - NAME3   A   2
#> 9           9          11 NAME1 - NAME3   A   3
#> 10         22          15 NAME1 - NAME3   B   1
#> 11         25          19 NAME1 - NAME3   B   2
#> 12         22          30 NAME1 - NAME3   B   3
#> 13          7          10 NAME2 - NAME3   A   1
#> 14          9          12 NAME2 - NAME3   A   2
#> 15          8          11 NAME2 - NAME3   A   3
#> 16         20          15 NAME2 - NAME3   B   1
#> 17         22          19 NAME2 - NAME3   B   2
#> 18         21          30 NAME2 - NAME3   B   3

For a benchmark, please see the community wiki.

Python Pandas Pairwise Frequency Table with many columns

Clip positive values to 1 (`clip_upper(1)`, which pandas 1.0 and later spell `clip(upper=1)`), and then compute the dot product of the transposed frame with itself:

# Cap every value at 1 so positive counts become 0/1 indicators.
# DataFrame.clip_upper was removed in pandas 1.0; clip(upper=1) is the
# equivalent call and also works on older versions.
i = df.clip(upper=1)
# Dot product of the transposed indicators with themselves: entry (a, b)
# sums, over all rows, the product of the indicators of columns a and b.
j = i.T.dot(i)

j

    C1  C2  C3  C4
C1   3   1   1   2
C2   1   2   0   2
C3   1   0   2   1
C4   2   2   1   4

How to Calculate a Table of Pairwise Counts from Long-Form Data Frame