Identify Groups of Linked Episodes Which Chain Together

identify groups of linked episodes which chain together

The Bioconductor package RBGL (an R interface to the BOOST graph library) contains
a function, connectedComp(), which identifies the connected components in a graph --
just what you are wanting.

(To use the function, you will first need to install the graph and RBGL packages from Bioconductor, for example with BiocManager::install(c("graph", "RBGL")).)

library(RBGL)
# Example 'from-to' data: every row links one id1 to one id2.
test <- data.frame(
  id1 = c(10, 10, 1, 1, 24, 8),
  id2 = c(1, 36, 24, 45, 300, 11)
)

# The 'graph' and 'RBGL' packages work on a node-and-edge-list object, so the
# two-column 'from-to' table is converted to one first.
g <- ftM2graphNEL(as.matrix(test))

# Connected components: a list with one vector of node ids per component.
cc <- connectedComp(g)

# Turn component number k into a two-column data frame (group label, node id).
component_frame <- function(k) {
  data.frame(group = names(cc)[k], id = cc[[k]])
}

# Stack the per-component frames into a single long table.
ld <- lapply(seq_along(cc), component_frame)
do.call(rbind, ld)
# group id
# 1 1 10
# 2 1 1
# 3 1 24
# 4 1 36
# 5 1 45
# 6 1 300
# 7 2 8
# 8 2 11

Search string sequentially in R

You can do this with a simple recursive function that jumps to the next id2 for each entry. Be careful with circular references between id1 and id2: followed naively, they lead to endless recursion:

#' Collect the dates along a chain of linked records
#'
#' Starting at `init`, gathers the dates of every row whose `id1` equals the
#' current id, then moves on to the `id1` of the first row whose `id2` equals
#' the current id, and repeats until no such row exists.
#'
#' @param df A data frame with columns `id1`, `id2` and `date`.
#' @param init The id to start from.
#' @return The dates found along the chain, in visiting order.
dscan <- function(df, init = 101) {
  current <- init
  visited <- current
  # which() drops NA comparisons, so rows with a missing id never contribute
  # spurious NA dates.
  collected <- list(df$date[which(df$id1 == current)])

  repeat {
    hit <- which(df$id2 == current)
    if (length(hit) == 0L) {
      break
    }
    successor <- df$id1[hit[1L]]
    # A circular reference between id1 and id2 would otherwise loop forever;
    # stop as soon as the chain returns to an id that was already visited.
    if (successor %in% visited) {
      break
    }
    visited <- c(visited, successor)
    collected[[length(collected) + 1L]] <- df$date[which(df$id1 == successor)]
    current <- successor
  }

  # c() rather than unlist() so that classed date columns keep their class.
  do.call(c, collected)
}

The output would be:

> dscan(df,101)
[1] "01.1.2021" "12.1.2021" "17.1.2021" "18.1.2021"
> dscan(df,107)
[1] "10.1.2021" "11.1.2021"
> dscan(df,108)
[1] "11.1.2021"

R tidyverse: unique identifier for union of the sets of two columns

Perhaps, we can use igraph

library(dplyr)
library(igraph)
# Treat each (class1, class2) pair as a graph edge and look up the connected
# component every vertex belongs to. The result is a vector of component ids
# named by vertex.
# getElement() is base R, so no package beyond dplyr and igraph is needed to
# pull out the membership vector; components() is the current igraph name for
# clusters().
cls <- df %>%
  select(-id) %>%
  graph_from_data_frame() %>%
  components() %>%
  getElement("membership")

# Look up each row's component through its class1 value. Despite the column
# name, group_size holds the component id, not a count.
df %>%
  mutate(group_size = cls[class1])
# A tibble: 6 x 4
# id class1 class2 group_size
# <dbl> <chr> <chr> <dbl>
#1 1 A L1 1
#2 2 A L1 1
#3 3 B L1 1
#4 4 B L2 1
#5 5 C L3 2
#6 6 D L4 3

Grouping similar elements together

This is a long one but you could do:

library(tidyverse)
library(igraph)

# Give every row of df a group id so that rows whose Names share an individual
# name, directly or through a chain of other rows, end up in the same group.
# NOTE(review): assumes df has a character column Names holding
# separator-delimited names -- confirm against the actual data.
df %>%
# Only the distinct Names strings are needed to build the graph.
select(Names)%>%
distinct() %>%
# Split off the first name; extra = 'merge' keeps the remainder together in
# `second`, and fill = 'right' leaves `second` as NA for single-name strings.
separate(Names, c('first', 'second'), extra = 'merge', fill = 'right')%>%
# One row per remaining name, giving (first, second) pairs.
separate_rows(second) %>%
# A missing `second` is replaced by a running counter (as text) so that
# single-name strings still yield a pair for the graph step.
mutate(second = coalesce(second, as.character(cumsum(is.na(second)))))%>%
# Treat the pairs as edges and take the component id of every vertex.
graph_from_data_frame()%>%
components()%>%
getElement('membership')%>%
# For each vertex (.y = vertex name, .x = its component id) build a vector over
# the rows of df: the component id where Names contains the name, 0 elsewhere.
imap(~str_detect(df$Names, .y)*.x) %>%
# The element-wise maximum over all vertices leaves one component id per row.
invoke(pmax, .)%>%
# Append the id as a letter (value) and as a number (value1). LETTERS has 26
# entries, so ids above 26 would give NA in `value`.
cbind(df, value = LETTERS[.], value1 = .)

Names Initial_Group Final_Group value value1
1 James,Gordon 6 A A 1
2 James,Gordon 6 A A 1
3 James,Gordon 6 A A 1
4 James,Gordon 6 A A 1
5 James,Gordon 6 A A 1
6 James,Gordon 6 A A 1
7 Amanda 1 A A 1
8 Amanda 1 A A 1
9 Amanda 1 A A 1
10 Gordon,Amanda 5 A A 1
11 Gordon,Amanda 5 A A 1
12 Gordon,Amanda 5 A A 1
13 Gordon,Amanda 5 A A 1
14 Gordon,Amanda 5 A A 1
15 Gordon,Amanda 5 A A 1
16 Gordon,Amanda 5 A A 1
17 Gordon,Amanda 5 A A 1
18 Edward,Gordon,Amanda 4 A A 1
19 Edward,Gordon,Amanda 4 A A 1
20 Edward,Gordon,Amanda 4 A A 1
21 Anna 2 B B 2
22 Anna 2 B B 2
23 Anna 2 B B 2
24 Anna,Leonard 3 B B 2
25 Anna,Leonard 3 B B 2
26 Anna,Leonard 3 B B 2

Check the column called value

R: Identifying Data Frame Rows Connected By Shared Values In Two Columns

## helper function for merging vector elements of a list
## Combine elements i and j of list x into a single element.
##
## x: a list of vectors
## i, j: positions in x, with i < j
##
## Returns a list one element shorter than x: the de-duplicated union of
## x[[i]] and x[[j]] sits at position i, position j is dropped, and every other
## element keeps its relative order.
merge.elems <- function(x, i, j) {
  leading <- x[seq_len(i - 1L)]                 # everything before i
  combined <- list(unique(c(x[[i]], x[[j]])))   # union of i and j
  middle <- x[i + seq_len(j - i - 1L)]          # strictly between i and j
  trailing <- x[j + seq_len(length(x) - j)]     # everything after j
  c(leading, combined, middle, trailing)
}

## initialize row groups and value groups
## Group the rows of df so that rows sharing a value in either of the first two
## columns, directly or through other rows, end up in the same group.
## rgs: list of row-index groups, starting with one group per row
## vgs: list of the values held by each group, starting with the pair of
##      values from the first two columns of each row
rgs <- as.list(seq_len(nrow(df)));
vgs <- do.call(Map,c(c,unname(df[1:2])));

## if there are 2 or more groups, exhaustively merge overlapping value group pairs
if (length(rgs)>1L) {
## i is the group currently being grown, j the later group compared against it
i <- 1L;
j <- 2L;
repeat {
if (any(vgs[[i]]%in%vgs[[j]])) {
## groups i and j share a value: fold j into i in both lists
rgs <- merge.elems(rgs,i,j);
vgs <- merge.elems(vgs,i,j);
## group i has grown, so rescan every later group against it
j <- i+1L;
## nothing left after i: all groups are final
if (j>length(rgs)) break;
} else {
j <- j+1L;
if (j>length(rgs)) {
## group i overlaps none of the later groups; move on to the next one
i <- i+1L;
## the last group has nothing after it to compare with
if (i==length(rgs)) break;
j <- i+1L;
}; ## end if
}; ## end if
}; ## end repeat
}; ## end if

## results
rgs;
## [[1]]
## [1] 1 2 3 4
##
## [[2]]
## [1] 5 6
##
## [[3]]
## [1] 7
##
vgs;
## [[1]]
## [1] 1 2 3 4 5
##
## [[2]]
## [1] 6 8 7
##
## [[3]]
## [1] 9 10
##

Convert transitive connections of elements into groups in R or SQL

Your data is a graph, defined by the list of its edges,
and you want its connected components.
This is what the clusters function in the igraph package computes.

# Sample data: a two-column character matrix with one edge (e1, e2) per row
d <- structure(
  c("A", "B", "C", "C", "E", "I", "H", "J", "K",
    "B", "C", "D", "G", "F", "E", "G", "K", "L"),
  .Dim = c(9L, 2L),
  .Dimnames = list(NULL, c("e1", "e2"))
)

library(igraph)
# Build the graph from the edge list.
g <- graph_from_edgelist(as.matrix(d))
# The components must be computed on the graph object g, not on the raw
# matrix d. components() is the current igraph name for clusters().
components(g)
# $membership
# [1] 1 1 1 1 1 2 2 2 1 3 3 3


Related Topics



Leave a reply



Submit