Constructing a Co-Occurrence Matrix in Python Pandas

Constructing a co-occurrence matrix in python pandas

It's simple linear algebra: you multiply the matrix by its transpose (your example contains strings, so don't forget to convert them to integers first):

>>> df_asint = df.astype(int)
>>> coocc = df_asint.T.dot(df_asint)
>>> coocc
       Dop  Snack  Trans
Dop      4      2      3
Snack    2      3      2
Trans    3      2      4
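
For context, here is a hypothetical input frame that reproduces the matrix above (the exact rows are an assumption; any 0/1 indicator frame with these pairwise overlaps would do):

>>> import pandas as pd
>>> df = pd.DataFrame({'Dop':   ['1', '1', '1', '1', '0'],
...                    'Snack': ['1', '1', '0', '0', '1'],
...                    'Trans': ['1', '0', '1', '1', '1']})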

If, as in the R answer, you want to zero out the diagonal, you can use numpy's fill_diagonal:

>>> import numpy as np
>>> np.fill_diagonal(coocc.values, 0)
>>> coocc
       Dop  Snack  Trans
Dop      0      2      3
Snack    2      0      2
Trans    3      2      0

How do I create a co-occurrence matrix in Python?

DataFrame.corr

We can pass a custom callable for calculating the "correlation" between the columns of the dataframe. The callable takes two 1D numpy arrays as its input arguments and returns the count of positions at which the elements of the two arrays are equal:

df.corr(method=lambda x, y: (x==y).sum())


     A    B    C
A  1.0  2.0  3.0
B  2.0  1.0  2.0
C  3.0  2.0  1.0
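
A minimal self-contained sketch (the frame below is an assumption chosen to reproduce the output above; note that pandas forces the diagonal of corr to 1.0 regardless of what the callable returns):

import pandas as pd

# Hypothetical data: A and C agree on all three rows,
# while A/B and B/C agree on two of them.
df = pd.DataFrame({'A': [1, 0, 1], 'B': [1, 0, 0], 'C': [1, 0, 1]})

# The callable is invoked once per column pair.
print(df.corr(method=lambda x, y: (x == y).sum()))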

Co-occurrence matrix from pandas dataframe

We can stack the frame, one-hot encode the values with get_dummies, collapse back to one row per original row, and then take the dot product with the transpose:

import numpy as np
import pandas as pd

# (the original used .sum(level=0); groupby(level=0).sum() is the
# non-deprecated equivalent in current pandas)
s = df.stack().str.get_dummies().groupby(level=0).sum().ne(0).astype(int)
s = s.T.dot(s).astype(float)
np.fill_diagonal(s.values, np.nan)
s
Out[33]:
     A    B    C    D
A  NaN  2.0  2.0  1.0
B  2.0  NaN  2.0  0.0
C  2.0  2.0  NaN  1.0
D  1.0  0.0  1.0  NaN
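
For reference, a hypothetical input that reproduces the matrix above (the column names are assumptions; stack drops the NaN cells in rows with fewer items):

import numpy as np
import pandas as pd

# Each row lists items that occur together; rows may have empty cells.
df = pd.DataFrame({'c1': ['A', 'A', 'B', 'A'],
                   'c2': ['B', 'B', 'C', 'C'],
                   'c3': [np.nan, 'C', np.nan, 'D']})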

Construct co-occurrence matrix from a list of lists in python

Here's a way to do it:

l1=['a', 'b']
l2=['b', 'c', 'd', 'e']
l3=['a', 'd', 'e']
l4=['b', 'e']

Get a nested list from the lists:

l = [l1, l2, l3, l4]

Get all the combinations within each list using itertools.combinations:

import itertools
c = [list(itertools.combinations(i, 2)) for i in l]
#[[('a', 'b')],
#[('b', 'c'), ('b', 'd'), ('b', 'e'), ('c', 'd'), ('c', 'e'), ('d', 'e')],
#[('a', 'd'), ('a', 'e'), ('d', 'e')],
#[('b', 'e')]]

Flatten the nested lists. Note that each pair is added in both its original and reversed order using chain.from_iterable((i, i[::-1]) for ...), so the final matrix is symmetric.

from itertools import chain
a = list(chain.from_iterable((i, i[::-1]) for c_ in c for i in c_))

Use pivot_table, aggregating by size, to generate a co-occurrence matrix from the result:

import pandas as pd
df = pd.DataFrame(a)
pd.pivot_table(df, index=0, columns=1, aggfunc='size', fill_value=0)

1    a    b    c    d    e
0
a  0.0  1.0  0.0  1.0  1.0
b  1.0  0.0  1.0  1.0  2.0
c  0.0  1.0  0.0  1.0  1.0
d  1.0  1.0  1.0  0.0  2.0
e  1.0  2.0  1.0  2.0  0.0
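
Putting the steps together, a self-contained version of this answer that can be run as-is:

import itertools
from itertools import chain
import pandas as pd

l1 = ['a', 'b']
l2 = ['b', 'c', 'd', 'e']
l3 = ['a', 'd', 'e']
l4 = ['b', 'e']

l = [l1, l2, l3, l4]
c = [list(itertools.combinations(i, 2)) for i in l]
a = list(chain.from_iterable((i, i[::-1]) for c_ in c for i in c_))
df = pd.DataFrame(a)
print(pd.pivot_table(df, index=0, columns=1, aggfunc='size', fill_value=0))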

How to create a co-occurrence matrix of product orders in python?

We start by grouping the df by order_id, and within each group calculate all possible pairs. Note that we sort by product_id first, so that the same pair occurring in different groups always appears in the same order.
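
For concreteness, here is a hypothetical df that reproduces the pair list shown below (the original question's data is not available, so these order and product ids are assumptions):

import pandas as pd

df = pd.DataFrame({
    'order_id':   [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3],
    'product_id': ['3333', '365', '48750', '9877',
                   '32001', '3333', '48750',
                   '11202', '3333', '365', '365'],
})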

import itertools

all_pairs = []
for _, group in df.sort_values('product_id').groupby('order_id'):
    all_pairs += list(itertools.combinations(group['product_id'], 2))

all_pairs

We get a list of all pairs from all orders:

[('3333', '365'),
('3333', '48750'),
('3333', '9877'),
('365', '48750'),
('365', '9877'),
('48750', '9877'),
('32001', '3333'),
('32001', '48750'),
('3333', '48750'),
('11202', '3333'),
('11202', '365'),
('11202', '365'),
('3333', '365'),
('3333', '365'),
('365', '365')]

Now we count the duplicates:

from collections import Counter

count_dict = dict(Counter(all_pairs))
count_dict

So we get the count of each pair, which is basically what you are after:

{('3333', '365'): 3,
('3333', '48750'): 2,
('3333', '9877'): 1,
('365', '48750'): 1,
('365', '9877'): 1,
('48750', '9877'): 1,
('32001', '3333'): 1,
('32001', '48750'): 1,
('11202', '3333'): 1,
('11202', '365'): 2,
('365', '365'): 1}

Putting this back into a cross-product table is a bit of work. The key bit is splitting the tuples into columns by calling .apply(pd.Series) and eventually moving one of the columns to the column names via unstack:

(pd.DataFrame.from_dict(count_dict, orient='index')
   .reset_index(0)
   .set_index(0)['index']
   .apply(pd.Series)
   .rename(columns={0: 'pid1', 1: 'pid2'})
   .reset_index()
   .rename(columns={0: 'count'})
   .set_index(['pid1', 'pid2'])
   .unstack()
   .fillna(0))

This produces a 'compact' form of the table you are after that only includes products that appeared in at least one pair:


       count
pid2    3333  365  48750  9877
pid1
11202    1.0  2.0    0.0   0.0
32001    1.0  0.0    1.0   0.0
3333     0.0  3.0    2.0   1.0
365      0.0  1.0    1.0   1.0
48750    0.0  0.0    0.0   1.0

UPDATE

Here is a rather simplified version of the above, following various discussions in the comments:

import itertools
import numpy as np
import pandas as pd
from collections import Counter

# we start as in the original solution but use permutations, not combinations
all_pairs = []
for _, group in df.sort_values('product_id').groupby('order_id'):
    all_pairs += list(itertools.permutations(group['product_id'], 2))
count_dict = dict(Counter(all_pairs))

# We create permutations for _all_ product_ids ... note we use unique() but
# also product(...) to allow for (365, 365) style self-pairs
total_pairs = list(itertools.product(df['product_id'].unique(), repeat=2))

# pull out first and second elements separately
pid1 = [p[0] for p in total_pairs]
pid2 = [p[1] for p in total_pairs]

# and get the count for those permutations that exist from count_dict. Use 0
# for those that do not
count = [count_dict.get(p,0) for p in total_pairs]

# Now a bit of DataFrame magic
df_cross = pd.DataFrame({'pid1':pid1, 'pid2':pid2, 'count':count})
df_cross.set_index(['pid1','pid2']).unstack()

And we are done. df_cross is shown below:


       count
pid2   11202  32001  3333  365  48750  9877
pid1
11202      0      0     1    2      0     0
32001      0      0     1    0      1     0
3333       1      1     0    3      2     1
365        2      0     3    2      1     1
48750      0      1     2    1      0     1
9877       0      0     1    1      1     0

Compute co-occurrences in a pandas dataframe for column values grouped by another column's values

Optimized solution

In the end, I managed to compute cross-occurrences in a memory-friendly way using scipy sparse matrices for the intermediate computations:

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

def df_compute_cooccurrences(df: pd.DataFrame, column1: str, column2: str) -> pd.DataFrame:

    # pd.factorize encodes the object as an enumerated type or categorical
    # variable, returning:
    # - `codes` (ndarray): an integer ndarray that's an indexer into `uniques`.
    # - `uniques` (ndarray, Index, or Categorical): the unique valid values
    # see more at https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.factorize.html

    i, rows = pd.factorize(df[column1])
    # i -> array([ 0, 0, 0, ..., 449054, 0, 1])
    # rows -> Index(['column1_label1', 'column1_label2', ...])

    j, cols = pd.factorize(df[column2])
    # j -> array([ 0, 1, 2, ..., 28544, -1, -1])
    # cols -> Float64Index([column2_label1, column2_label2, ...])

    ij, tups = pd.factorize(list(zip(i, j)))
    # ij -> array([ 0, 1, 2, ..., 2878026, 2878027, 2878028])
    # tups -> array([(0, 0), (0, 1), (0, 2), ..., (449054, 28544), (0, -1), (1, -1)])

    # Then we can finally compute the cross-tabulation matrix
    crosstab = csr_matrix((np.bincount(ij), tuple(zip(*tups))))
    # Converting this directly into a DataFrame with
    # pd.DataFrame.sparse.from_spmatrix(crosstab, rows, cols)
    # gives the same result as
    # https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.crosstab.html
    # but we obtained it in a memory-friendly way (allowing big data processing)

    # In order to obtain the co-occurrence matrix for column1,
    # we multiply the crosstab matrix by its transpose
    coocc = crosstab.dot(crosstab.transpose())

    # Then we can finally return the co-occurrence matrix in DataFrame form
    return pd.DataFrame.sparse.from_spmatrix(coocc, rows, rows)

Here a small example is provided:

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

def df_compute_cooccurrences(df: pd.DataFrame, column1: str, column2: str) -> pd.DataFrame:
    i, rows = pd.factorize(df[column1])
    j, cols = pd.factorize(df[column2])
    ij, tups = pd.factorize(list(zip(i, j)))
    crosstab = csr_matrix((np.bincount(ij), tuple(zip(*tups))))
    coocc = crosstab.dot(crosstab.transpose())
    return pd.DataFrame.sparse.from_spmatrix(coocc, rows, rows)

df = pd.DataFrame(zip([1,1,1,2,2,3,4],["a","a","a","a","a","b","b"]), columns=list('xy'))
"""
df:
+-----+-----+
¦ x ¦ y ¦
+-----+-----+
| 1 | a |
| 1 | a |
| 1 | a |
| 2 | a |
| 2 | a |
| 3 | b |
| 4 | b |
+-----+-----+
"""
cooc_df = df_compute_cooccurrences(df, "x", "y")
"""
cooc_df:
+---+---+---+---+
¦ 1 | 2 | 3 | 4 |
+---+---+---+---+---+
¦ 1 ¦ 9 | 6 | 0 | 0 |
¦ 2 ¦ 6 | 4 | 0 | 0 |
¦ 3 ¦ 0 | 0 | 1 | 1 |
¦ 4 ¦ 0 | 0 | 1 | 1 |
+---+---+---+---+---+
"""
cooc_df2 = df_compute_cooccurrences(df, "y", "x")
"""
cooc_df2:
+----+----+
¦ a ¦ b ¦
+---+----+----+
¦ a ¦ 13 | 0 |
¦ b ¦ 0 | 2 |
+---+----+----+
"""

Calculating co-occurrence between pairs of columns in pandas

You can use the following:

from itertools import combinations

df = df.astype(int)

co_occurrence = (pd.Series({(c1, c2): (df[c1] & df[c2]).sum()
                            for c1, c2 in combinations(df.columns, 2)})
                   .unstack(-1))

output:

     M2   M3   M4
M1  2.0  1.0  1.0
M2  NaN  1.0  2.0
M3  NaN  NaN  1.0
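
For reference, a hypothetical boolean input that yields the triangle above (the frame is an assumption; each row is an observation and each column a 0/1 flag):

import pandas as pd
from itertools import combinations

# Row 1 activates M1 and M2; row 2 all four; row 3 M2 and M4.
df = pd.DataFrame({'M1': [1, 1, 0],
                   'M2': [1, 1, 1],
                   'M3': [0, 1, 0],
                   'M4': [0, 1, 1]})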

Pandas/SQL co-occurrence count

Only because the question is well written and it seemed like a nice puzzle, here's some magic.

Potentially you'll have to store a lot of data, so you need to compress the frame as much as possible and do several passes through the base. If the dataframe contains non-primitive objects, convert them into integers. If you do multiprocessing, the dataframe will be copied into the subprocesses, so keeping its contents small helps.

The runtime depends on the length of the dataframe, but also on the number of unique stores, the number of unique products, and the size of a chunk of pairs to count. Spreading the work over many subprocesses can speed things up, but there is a constant cost to all the functions which will accumulate. For example, pandas' own methods will run faster on a single ten-thousand-row dataframe than on a dozen thousand-row frames. And when you're running nested calls on sub-dataframes of unpredictable size, things get complicated. You'll probably have to experiment a bit to find a chunksize with optimal speed/memory usage.

Test runtimes with smaller numbers first, including fewer stores and products. That being said, this is not a quick task. On a high-end machine it completes in about ten minutes.

import pandas as pd, numpy as np

df = pd.DataFrame({
    'store':   np.random.randint(0, int(2e4), int(5e6)),
    'product': np.random.randint(0, int(5e4), int(5e6)),
}).sort_values('store')

products = df['product'].unique()
N, chunksize, Ntop = len(products), int(1e4), 200
dtype = np.min_scalar_type(max(products.max(),N))
df = df.astype(dtype)

def store_cats(df):
    df = df.astype('category')
    cats = [df[x].cat.categories for x in df.columns]
    for col in df.columns:
        df[col] = df[col].cat.codes
    return df, cats

def restore_cats(summary, cats):
    for col in ['product_x', 'product_y']:
        summary[col] = pd.Categorical.from_codes(summary[col], cats)

def subsets(n=chunksize):
    n = int(n)
    res = [frozenset(products[i:i+n]) for i in range(0, N, n)]
    info = "In total there will be {:.1E} pairs, per pass {:.1E} will be checked, that's up to around {} mb per pass, {} passes"
    print(info.format((N**2), (n*N), (n*N*3*8/1e6), len(res)))
    return res

def count(df, subset):
    res = df.merge(df, on='store')\
            .query('(product_x < product_y) and product_x in @subset')\
            .groupby(['product_x', 'product_y'])\
            .count()\
            .astype(dtype)\
            .reset_index()
    return res

def one_pass(gr, subset):
    per_group = gr.apply(count, subset)
    total_counts = per_group.sort_values(['product_x', 'product_y'])\
                            .groupby(['product_x', 'product_y'])\
                            .agg('sum')\
                            .sort_values('store', ascending=False)[:Ntop]\
                            .copy().reset_index()
    return total_counts

def merge_passes(dfs):
    res = pd.concat(dfs, ignore_index=True)
    # mirror each pair so both (x, y) and (y, x) are present
    # (the original used DataFrame.append, which current pandas has removed)
    res = pd.concat([res, res.rename(columns={'product_x': 'product_y',
                                              'product_y': 'product_x'})],
                    ignore_index=True)
    res = res.sort_values('store', ascending=False)[:Ntop]
    return res

from concurrent.futures import as_completed, ProcessPoolExecutor as Pool

gr = df.groupby('store', as_index=False)

def worker(subset):
    return one_pass(gr, subset)

def run_progress(max_workers=2, chunksize=chunksize):
    from tqdm.auto import tqdm
    with Pool(max_workers=max_workers) as p:
        futures = [p.submit(worker, subset) for subset in subsets(chunksize)]
        summaries = [x.result() for x in tqdm(as_completed(futures), total=len(futures))]
    return merge_passes(summaries)
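
A usage sketch (an assumption, not part of the original answer; the __main__ guard matters on platforms where ProcessPoolExecutor spawns rather than forks worker processes):

if __name__ == '__main__':
    # two workers, default chunk size; returns the Ntop most frequent pairs
    top_pairs = run_progress(max_workers=2)
    print(top_pairs.head(10))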

