How to Use R to Create a Word Co-Occurrence Matrix

Creating a co-occurrence matrix using R

This gives a list of pair-count data frames, one for each patient (row):

# fake data
set.seed(47)
m = matrix(round(runif(15)), nrow = 3)
colnames(m) = paste0("OTU", 1:ncol(m))
m
#      OTU1 OTU2 OTU3 OTU4 OTU5
# [1,]    1    1    0    1    0
# [2,]    0    1    0    0    1
# [3,]    1    1    1    1    1

# template of all unique OTU pairs, with the counter initialized to zero
template = as.data.frame(t(combn(colnames(m), 2)))
names(template) = c("otu1", "otu2")
template$counts = 0

# for each patient (row), set counts to 1 where both OTUs of a pair are present
result = apply(m, 1, function(x) {
  ones = names(x)[x == 1]
  result = template
  result[result$otu1 %in% ones & result$otu2 %in% ones, "counts"] = 1
  return(result)
})

result
# [[1]]
#    otu1 otu2 counts
# 1  OTU1 OTU2      1
# 2  OTU1 OTU3      0
# 3  OTU1 OTU4      1
# 4  OTU1 OTU5      0
# 5  OTU2 OTU3      0
# 6  OTU2 OTU4      1
# 7  OTU2 OTU5      0
# 8  OTU3 OTU4      0
# 9  OTU3 OTU5      0
# 10 OTU4 OTU5      0
#
# [[2]]
#    otu1 otu2 counts
# 1  OTU1 OTU2      0
# 2  OTU1 OTU3      0
# 3  OTU1 OTU4      0
# 4  OTU1 OTU5      0
# 5  OTU2 OTU3      0
# 6  OTU2 OTU4      0
# 7  OTU2 OTU5      1
# 8  OTU3 OTU4      0
# 9  OTU3 OTU5      0
# 10 OTU4 OTU5      0
#
# [[3]]
#    otu1 otu2 counts
# 1  OTU1 OTU2      1
# 2  OTU1 OTU3      1
# 3  OTU1 OTU4      1
# 4  OTU1 OTU5      1
# 5  OTU2 OTU3      1
# 6  OTU2 OTU4      1
# 7  OTU2 OTU5      1
# 8  OTU3 OTU4      1
# 9  OTU3 OTU5      1
# 10 OTU4 OTU5      1
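
If you want a single table of counts across all patients, rather than one table per patient, you can sum the per-patient counts. A minimal sketch, reusing the template and result objects from above:

# total number of patients in which each OTU pair co-occurs
totals = template
totals$counts = Reduce(`+`, lapply(result, function(d) d$counts))
totals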

Creating a co-occurrence matrix

I'd use a combination of the reshape2 package and matrix algebra:

#read in your data
dat <- read.table(text="TrxID Items Quant
Trx1 A 3
Trx1 B 1
Trx1 C 1
Trx2 E 3
Trx2 B 1
Trx3 B 1
Trx3 C 4
Trx4 D 1
Trx4 E 1
Trx4 A 1
Trx5 F 5
Trx5 B 3
Trx5 C 2
Trx5 D 1", header=T)

#making the boolean matrix
library(reshape2)
dat2 <- melt(dat)
w <- dcast(dat2, Items~TrxID)
x <- as.matrix(w[,-1])
x[is.na(x)] <- 0
x <- apply(x, 2, function(x) as.numeric(x > 0)) #recode as 0/1
v <- x %*% t(x) #the magic matrix
diag(v) <- 0 #replace diagonal
dimnames(v) <- list(w[, 1], w[,1]) #name the dimensions
v
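
Why this works: each row of x is an item's 0/1 presence vector across transactions, so entry (i, j) of x %*% t(x) counts the transactions containing both item i and item j. A quick sanity check on one pair (B and C appear together in Trx1, Trx3 and Trx5):

rownames(x) <- w[, 1] #label rows by item so we can index by name
sum(x["B", ] & x["C", ]) #3, matching v["B", "C"]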

For the graphing, maybe something like this with igraph:

library(igraph)
g <- graph.adjacency(v, weighted=TRUE, mode='undirected')
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
plot(g)
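
Because the graph was built with weighted=TRUE, each edge carries its co-occurrence count in E(g)$weight. One optional tweak (purely cosmetic, and an assumption about what you want the plot to emphasize) is to scale edge width by that weight:

# thicker edges = items that co-occur more often
plot(g, edge.width = E(g)$weight)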

Create a co-occurrence matrix with bigrams

Just specify your bigrams and create the co-occurrence matrices. Below are some (really) simple examples. Choose one package and do everything with it; both quanteda and text2vec can use multiple cores/threads. Traversing the resulting co-occurrence matrices can be done with reshape2::melt, like this: reshape2::melt(as.matrix(my_cooccurence_matrix)).

txt <- c("The quick brown fox jumped over the lazy dog.",
"The dog jumped and ate the fox.")

using quanteda to create a feature co-occurrence matrix:

library(quanteda)
toks <- tokens(char_tolower(txt), remove_punct = TRUE)
toks <- tokens_ngrams(toks, n = 2) # in older quanteda: tokens(..., ngrams = 2)
f <- fcm(toks, context = "document")

Feature co-occurrence matrix of: 14 by 14 features.
14 x 14 sparse Matrix of class "fcm"
              features
features      the_quick quick_brown brown_fox fox_jumped jumped_over over_the the_lazy lazy_dog the_dog dog_jumped jumped_and and_ate
  the_quick           0           1         1          1           1        1        1        1       0          0          0       0
  quick_brown         0           0         1          1           1        1        1        1       0          0          0       0
  brown_fox           0           0         0          1           1        1        1        1       0          0          0       0
  fox_jumped          0           0         0          0           1        1        1        1       0          0          0       0
  jumped_over         0           0         0          0           0        1        1        1       0          0          0       0
  over_the            0           0         0          0           0        0        1        1       0          0          0       0
  the_lazy            0           0         0          0           0        0        0        1       0          0          0       0
  lazy_dog            0           0         0          0           0        0        0        0       0          0          0       0
  the_dog             0           0         0          0           0        0        0        0       0          1          1       1
  dog_jumped          0           0         0          0           0        0        0        0       0          0          1       1
  jumped_and          0           0         0          0           0        0        0        0       0          0          0       1
  and_ate             0           0         0          0           0        0        0        0       0          0          0       0
  ate_the             0           0         0          0           0        0        0        0       0          0          0       0
  the_fox             0           0         0          0           0        0        0        0       0          0          0       0
              features
features      ate_the the_fox
  the_quick         0       0
  quick_brown       0       0
  brown_fox         0       0
  fox_jumped        0       0
  jumped_over       0       0
  over_the          0       0
  the_lazy          0       0
  lazy_dog          0       0
  the_dog           1       1
  dog_jumped        1       1
  jumped_and        1       1
  and_ate           1       1
  ate_the           0       1
  the_fox           0       0
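
As mentioned above, reshape2::melt turns the co-occurrence matrix into a long table of feature pairs, which is often easier to traverse or filter. For example:

library(reshape2)
pairs <- melt(as.matrix(f)) # one row per feature pair: Var1, Var2, value
head(pairs[pairs$value > 0, ]) # keep only pairs that actually co-occur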

using text2vec to create a feature co-occurrence matrix:

library(text2vec)
i <- itoken(txt) # no lowercasing here, so "The_quick" keeps its capital in the output
v <- create_vocabulary(i, ngram = c(2L, 2L)) # bigram vocabulary
vectorizer <- vocab_vectorizer(v)
f2 <- create_tcm(i, vectorizer) # term co-occurrence matrix

14 x 14 sparse Matrix of class "dgTMatrix"
[[ suppressing 14 column names ‘the_lazy’, ‘and_ate’, ‘The_quick’ ... ]]

the_lazy    . . . 0.25 1.0 . 0.2 0.3333333         .   . 1.0000000         . 0.5000000         .
and_ate     . . .    .   . 1   .         . 0.5000000 1.0         . 0.3333333         . 0.5000000
The_quick   . . . 0.50   . . 1.0 0.3333333         .   . 0.2000000         . 0.2500000         .
brown_fox   . . .    . 0.2 . 1.0 1.0000000         .   . 0.3333333         . 0.5000000         .
lazy_dog    . . .    .   . .   . 0.2500000         .   . 0.5000000         . 0.3333333         .
jumped_and  . . .    .   . .   .         . 0.3333333 0.5         . 0.5000000         . 1.0000000
quick_brown . . .    .   . .   . 0.5000000         .   . 0.2500000         . 0.3333333         .
fox_jumped  . . .    .   . .   .         .         .   . 0.5000000         . 1.0000000         .
the_fox     . . .    .   . .   .         .         . 1.0         . 0.2000000         . 0.2500000
ate_the     . . .    .   . .   .         .         .   .         . 0.2500000         . 0.3333333
over_the    . . .    .   . .   .         .         .   .         .         . 1.0000000         .
The_dog     . . .    .   . .   .         .         .   .         .         .         . 1.0000000
jumped_over . . .    .   . .   .         .         .   .         .         .         .         .
dog_jumped  . . .    .   . .   .         .         .   .         .         .         .         .

Note that text2vec weights co-occurrences by their distance within the skip-gram window by default, which is why the entries are fractions rather than integer counts.

R - Word co-occurrence frequency within paragraph

The answer is to first reshape the corpus into paragraphs, so that the new "documents" are the paragraphs of the original documents, and then to compute the fcm with a "document" co-occurrence context.

Here's an example you can adapt, using the first three documents from the built-in inaugural address corpus.

library("quanteda")
## Package version: 2.0.1

data_corpus_inauguralpara <-
  corpus_reshape(data_corpus_inaugural[1:3], to = "paragraphs")
summary(data_corpus_inauguralpara)
## Corpus consisting of 23 documents, showing 23 documents:
##
##                Text Types Tokens Sentences Year  President FirstName      Party
##   1789-Washington.1     8     11         1 1789 Washington    George       none
##   1789-Washington.2   184    341         5 1789 Washington    George       none
##   1789-Washington.3   192    328         6 1789 Washington    George       none
##   1789-Washington.4   214    391         5 1789 Washington    George       none
##   1789-Washington.5   120    182         2 1789 Washington    George       none
##   1789-Washington.6   102    164         4 1789 Washington    George       none
##   1789-Washington.7    88    120         1 1789 Washington    George       none
##   1793-Washington.1    47     64         2 1793 Washington    George       none
##   1793-Washington.2    61     83         2 1793 Washington    George       none
##        1797-Adams.1   114    180         2 1797      Adams      John Federalist
##        1797-Adams.2    88    137         3 1797      Adams      John Federalist
##        1797-Adams.3    63    101         1 1797      Adams      John Federalist
##        1797-Adams.4    60     82         3 1797      Adams      John Federalist
##        1797-Adams.5   145    277         6 1797      Adams      John Federalist
##        1797-Adams.6    62    108         2 1797      Adams      John Federalist
##        1797-Adams.7    16     17         1 1797      Adams      John Federalist
##        1797-Adams.8   158    303         8 1797      Adams      John Federalist
##        1797-Adams.9    97    184         4 1797      Adams      John Federalist
##       1797-Adams.10    80    128         1 1797      Adams      John Federalist
##       1797-Adams.11    74    119         3 1797      Adams      John Federalist
##       1797-Adams.12   329    808         1 1797      Adams      John Federalist
##       1797-Adams.13    51     75         1 1797      Adams      John Federalist
##       1797-Adams.14    41     58         1 1797      Adams      John Federalist

You can see here how the documents are now paragraphs. Now, tokenize the reshaped corpus, add your own manipulations to the tokens (you had several in your question), and then compute the fcm.

# add your own additional manipulation of tokens here: compounding, etc
toks <- data_corpus_inauguralpara %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("en"))

# this creates the fcm within paragraph
fcmat <- fcm(toks, context = "document")
fcmat
## Feature co-occurrence matrix of: 1,093 by 1,093 features.
##                   features
## features           Fellow-Citizens Senate House Representatives Among
##   Fellow-Citizens                0      1     1               1     0
##   Senate                         0      0     1               1     0
##   House                          0      0     0               2     0
##   Representatives                0      0     0               0     0
##   Among                          0      0     0               0     0
##   vicissitudes                   0      0     0               0     0
##   incident                       0      0     0               0     0
##   life                           0      0     0               0     0
##   event                          0      0     0               0     0
##   filled                         0      0     0               0     0
##                   features
## features           vicissitudes incident life event filled
##   Fellow-Citizens             0        0    0     0      0
##   Senate                      0        0    0     0      0
##   House                       0        0    0     0      0
##   Representatives             0        0    0     0      0
##   Among                       1        1    1     1      1
##   vicissitudes                0        1    1     1      1
##   incident                    0        0    1     1      1
##   life                        0        0    1     1      1
##   event                       0        0    0     0      1
##   filled                      0        0    0     0      0
## [ reached max_feat ... 1,083 more features, reached max_nfeat ... 1,083 more features ]
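
If you only care about a handful of terms, you can subset the fcm instead of working with all 1,093 features. A small sketch using quanteda's fcm_select (the terms here are just illustrative):

# keep only the co-occurrences among selected features
fcm_select(fcmat, pattern = c("Senate", "House", "Representatives"))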

How to represent each word occurrence as a separate tcm vector in R?

quanteda's fcm is a very efficient way to create feature co-occurrence matrices, either at the document level or within a user-defined context. This results in a sparse, symmetric feature-by-feature matrix. But it sounds like you want each unique feature to be its own row, and to have its target words around it.

From the example, it looks like you want a context window of ±2 words, so I have done that for the target word "short".

First, we get the context using keywords-in-context:

library("quanteda")
txt <- c("here is a short document", "here is a different short document")

(shortkwic <- kwic(txt, "short", window = 2))
#
# [text1, 4]        is a | short | document
# [text2, 5] a different | short | document

Then create a corpus from the context, with the keyword as a unique document name:

shortcorp <- corpus(shortkwic, split_context = FALSE, extract_keyword = TRUE)
docnames(shortcorp) <- make.unique(docvars(shortcorp, "keyword"))
texts(shortcorp)
#                 short                      short.1
# "is a short document" "a different short document"

Then create a dfm, selecting all words, but removing the target:

dfm(shortcorp) %>%
  dfm_select(dfm(txt)) %>%
  dfm_remove("short")
# Document-feature matrix of: 2 documents, 5 features (40% sparse).
# 2 x 5 sparse Matrix of class "dfm"
#           features
# docs      here is a document different
#   short      0  1 1        1         0
#   short.1    0  0 1        1         1
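
The same steps generalize to more than one target word: pass several patterns to kwic() and each occurrence still becomes its own document. A hedged sketch on the same toy txt:

# one context document per occurrence of each target word
kw <- kwic(txt, c("short", "document"), window = 2)
ctx <- corpus(kw, split_context = FALSE, extract_keyword = TRUE)
docnames(ctx) <- make.unique(docvars(ctx, "keyword"))
dfm(ctx)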

