Build Word Co-Occurence Edge List in R

build word co-occurence edge list in R

It's convoluted so there's got to be a better approach:

dat <- read.csv(text="sentence_id, text
1,           a b c d e
2,           a b b e
3,           b c d
4,           a e", header=TRUE)

library(qdapTools); library(tidyr)
x <- t(mtabulate(with(dat, by(text, sentence_id, bag_o_words))) > 0)
out <- x %*% t(x)
out[upper.tri(out, diag=TRUE)] <- NA

out2 <- matrix2df(out, "word1") %>%
    gather(word2, freq, -word1) %>%
    na.omit() 

rownames(out2) <- NULL
out2

##    word1 word2 freq
## 1      b     a    2
## 2      c     a    1
## 3      d     a    1
## 4      e     a    3
## 5      c     b    2
## 6      d     b    2
## 7      e     b    2
## 8      d     c    2
## 9      e     c    1
## 10     e     d    1

Base only solution

out <- lapply(with(dat, split(text, sentence_id)), function(x) {
    strsplit(gsub("^\\s+|\\s+$", "", as.character(x)), "\\s+")[[1]]
})

nms <- sort(unique(unlist(out)))

out2 <- lapply(out, function(x) {
    as.data.frame(table(x), stringsAsFactors = FALSE)
})

dat2 <- data.frame(x = nms)

for(i in seq_along(out2)) {
    m <- merge(dat2, out2[[i]], all.x = TRUE)
    names(m)[i + 1] <- dat[["sentence_id"]][i]
    dat2 <- m
}

dat2[is.na(dat2)] <- 0
x <- as.matrix(dat2[, -1]) > 0

out3 <- x %*% t(x)
out3[upper.tri(out3, diag=TRUE)] <- NA
dimnames(out3) <- list(dat2[[1]], dat2[[1]])

out4 <- na.omit(data.frame( 
        word1 = rep(rownames(out3), ncol(out3)),  
        word2 = rep(colnames(out3), each = nrow(out3)),
        freq = c(unlist(out3)),
        stringsAsFactors = FALSE)
)

row.names(out4) <- NULL

out4

Creating co-occurrence matrix

I'd use a combination of the reshape2 package and matrix algebra:

#read in your data
dat <- read.table(text="TrxID Items Quant
Trx1 A 3
Trx1 B 1
Trx1 C 1
Trx2 E 3
Trx2 B 1
Trx3 B 1
Trx3 C 4
Trx4 D 1
Trx4 E 1
Trx4 A 1
Trx5 F 5
Trx5 B 3
Trx5 C 2
Trx5 D 1", header=T)

#making the boolean matrix   
library(reshape2)
dat2 <- melt(dat)
w <- dcast(dat2, Items~TrxID)
x <- as.matrix(w[,-1])
x[is.na(x)] <- 0
x <- apply(x, 2,  function(x) as.numeric(x > 0))  #recode as 0/1
v <- x %*% t(x)                                   #the magic matrix 
diag(v) <- 0                                      #repalce diagonal
dimnames(v) <- list(w[, 1], w[,1])                #name the dimensions
v

For the graphing maybe...

g <- graph.adjacency(v, weighted=TRUE, mode ='undirected')
g <- simplify(g)
# set labels and degrees of vertices
V(g)$label <- V(g)$name
V(g)$degree <- degree(g)
plot(g)

Count co-occurrences of two words but the order is not important in r

We may use pmin/pmax to sort the columns by row before applying the count

library(tidytext)
library(dplyr)
library(stringr)
library(tidyr)
enframe(c("a b c a d e")) %>% 
  unnest_tokens(skipgram, value, token = "skip_ngrams", n = 5) %>% 
  mutate(n_words = str_count(skipgram, pattern = "\\S+")) %>%
  filter(n_words == 2) %>% 
  separate(col = skipgram, into = c("word1", "word2"), 
      sep = "\\s+") %>%
  transmute(word11 = pmin(word1, word2), word22 = pmax(word1, word2)) %>%
  count(word11, word22)

-output

# A tibble: 7 × 3
  word11 word22     n
  <chr>  <chr>  <int>
1 a      b          2
2 a      c          2
3 a      d          1
4 a      e          1
5 b      c          1
6 c      d          1
7 d      e          1

R- Word co-occurrence frequency within paragraph

The answer is first to reshape the corpus into paragraphs, so that the new "documents" are then paragraphs from the original documents, and then compute the fcm with a "document" co-occurrence context.

Here's an example you can adapt, using the first three documents from the built-in inaugural address corpus.

library("quanteda")
## Package version: 2.0.1

data_corpus_inauguralpara <-
  corpus_reshape(data_corpus_inaugural[1:3], to = "paragraphs")
summary(data_corpus_inauguralpara)
## Corpus consisting of 23 documents, showing 23 documents:
## 
##               Text Types Tokens Sentences Year  President FirstName      Party
##  1789-Washington.1     8     11         1 1789 Washington    George       none
##  1789-Washington.2   184    341         5 1789 Washington    George       none
##  1789-Washington.3   192    328         6 1789 Washington    George       none
##  1789-Washington.4   214    391         5 1789 Washington    George       none
##  1789-Washington.5   120    182         2 1789 Washington    George       none
##  1789-Washington.6   102    164         4 1789 Washington    George       none
##  1789-Washington.7    88    120         1 1789 Washington    George       none
##  1793-Washington.1    47     64         2 1793 Washington    George       none
##  1793-Washington.2    61     83         2 1793 Washington    George       none
##       1797-Adams.1   114    180         2 1797      Adams      John Federalist
##       1797-Adams.2    88    137         3 1797      Adams      John Federalist
##       1797-Adams.3    63    101         1 1797      Adams      John Federalist
##       1797-Adams.4    60     82         3 1797      Adams      John Federalist
##       1797-Adams.5   145    277         6 1797      Adams      John Federalist
##       1797-Adams.6    62    108         2 1797      Adams      John Federalist
##       1797-Adams.7    16     17         1 1797      Adams      John Federalist
##       1797-Adams.8   158    303         8 1797      Adams      John Federalist
##       1797-Adams.9    97    184         4 1797      Adams      John Federalist
##      1797-Adams.10    80    128         1 1797      Adams      John Federalist
##      1797-Adams.11    74    119         3 1797      Adams      John Federalist
##      1797-Adams.12   329    808         1 1797      Adams      John Federalist
##      1797-Adams.13    51     75         1 1797      Adams      John Federalist
##      1797-Adams.14    41     58         1 1797      Adams      John Federalist

You can see here how the documents are now paragraphs. Now, tokenize it and add your own manipulations to the tokens (you had several in your question), and then compute the fcm.

# add your own additional manipulation of tokens here: compounding, etc
toks <- data_corpus_inauguralpara %>%
  tokens(remove_punct = TRUE) %>%
  tokens_remove(stopwords("en"))

# this creates the fcm within paragraph
fcmat <- fcm(toks, context = "document")
fcmat
## Feature co-occurrence matrix of: 1,093 by 1,093 features.
##                  features
## features          Fellow-Citizens Senate House Representatives Among
##   Fellow-Citizens               0      1     1               1     0
##   Senate                        0      0     1               1     0
##   House                         0      0     0               2     0
##   Representatives               0      0     0               0     0
##   Among                         0      0     0               0     0
##   vicissitudes                  0      0     0               0     0
##   incident                      0      0     0               0     0
##   life                          0      0     0               0     0
##   event                         0      0     0               0     0
##   filled                        0      0     0               0     0
##                  features
## features          vicissitudes incident life event filled
##   Fellow-Citizens            0        0    0     0      0
##   Senate                     0        0    0     0      0
##   House                      0        0    0     0      0
##   Representatives            0        0    0     0      0
##   Among                      1        1    1     1      1
##   vicissitudes               0        1    1     1      1
##   incident                   0        0    1     1      1
##   life                       0        0    1     1      1
##   event                      0        0    0     0      1
##   filled                     0        0    0     0      0
## [ reached max_feat ... 1,083 more features, reached max_nfeat ... 1,083 more features ]

How to generate an igraph-compatible edge set in R from data

I am not quite sure what you mean when you say " I really need the edges to be the actual number of co-occurrences rather than a correlation coefficient". However, " I'm trying to generate an edge list for an igraph from it that connects each word based on its co-occurrence in a paragraph" seems pretty clear. I interpret that to mean that if two words are in the same paragraph, they get linked. You can make that kind of edgelist using combn like this:

Edges = c()
for(p in unique(data$paragraph)) { 
    Edges = c(Edges, word[combn(which(data$paragraph == p), 2)]) }
EL = matrix(Edges, ncol=2, byrow=T)

library(igraph)

g = graph_from_edgelist(EL, directed=FALSE)
plot(g)

Graph from paragraphs

Compare feature co-Occurrence against significant co-occurrences

Like this? Remove the select() command if you prefer to keep all of the columns.

library("quanteda")
## Package version: 2.1.2

colls <- textstat_collocations(data_corpus_inaugural[1:5], size = 2)
head(colls)
##   collocation count count_nested length   lambda        z
## 1      of the    98            0      2 1.494207 11.89704
## 2    has been     9            0      2 5.691667 11.61596
## 3      i have    15            0      2 3.754144 11.51091
## 4      may be    14            0      2 4.072366 11.43632
## 5   have been    10            0      2 4.679873 10.94315
## 6     we have     9            0      2 4.458284 10.35023

as.data.frame(colls) %>%
  tidyr::separate("collocation", into = c("word1", "word2"), sep = " ") %>%
  dplyr::select(word1, word2, lambda) %>%
  tibble::tibble()
## # A tibble: 678 x 3
##    word1   word2   lambda
##    <chr>   <chr>    <dbl>
##  1 of      the       1.49
##  2 has     been      5.69
##  3 i       have      3.75
##  4 may     be        4.07
##  5 have    been      4.68
##  6 we      have      4.46
##  7 foreign nations   6.32
##  8 it      is        3.50
##  9 my      country   4.49
## 10 united  states    7.22
## # … with 668 more rows

Build Word Co-Occurence Edge List in R