Processing The Input File Based on Range Overlap

Processing the input file based on range overlap

To do this requires many steps, and a number of concepts from the data.table package, most notably non-equi joins. I've commented the code throughout, and recommend running it step by step if you want more insight:

library(data.table)

input <- structure(list(CT1 = structure(1:3, .Label =
c("chr1:200-400", "chr1:800-970", "chr2:300-700"), class =
"factor"), CT2 = structure(1:3, .Label = c("chr1:250-450",
"chr2:200-500", "chr2:600-1000"), class = "factor"), CT3 =
structure(1:3, .Label = c("chr1:400-800", "chr1:700-870",
"chr2:700-1400"), class = "factor")), .Names = c("CT1",
"CT2", "CT3"), class = "data.frame", row.names = c(NA, -3L))

output <- structure(list(CT1 = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L),
CT2 = c(1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L), CT3 = c(0L, 0L, 0L,
0L, 0L, 1L, 1L, 1L, 1L)), .Names = c("CT1", "CT2", "CT3"), class =
"data.frame", row.names = c("chr1:200-400", "chr1:800-970",
"chr2:300-700", "chr1:250-450", "chr2:200-500", "chr2:600-1000",
"chr1:400-800", "chr1:700-870", "chr2:700-1400"))

# Builds a data.table by breaking a string like "chr1:300-700" into
# three columns: chr, start, and end.
split_genomic_range <- function(str) {
chr <- gsub(":.*", "", str)
start <- gsub("-.*", "", gsub(".*:", "", str))
end <- gsub(".*-", "", str)

start <- as.numeric(start)
end <- as.numeric(end)

return(data.table(chr=chr, start=start, end=end))
}

# First break the input data.table into three new tables - we will need
# to perform non-equi joins of the index table (column CT1 in input) to
# the tables built from the other two columns.
ct1 <- split_genomic_range(input$CT1)
ct2 <- split_genomic_range(input$CT2)
ct3 <- split_genomic_range(input$CT3)

# Create an index table with all genomic ranges, then check for
# overlaps in each of the three tables created from the input
# columns:
index_table <- unique(rbind(ct1, ct2, ct3))

# Returns entries from the index_table if they overlap > 50% any
# entries in the lookup table or vice-versa
get_overlapping_ranges <- function(index_table, lookup_table) {
# This function does two non-equi joins. First, it checks whether
# any entries in the index_table have a 50% overlap with any
# entries in the lookup table. Second, it does the reverse, and
# checks whether any entries in the lookup_table have a 50% overlap
# with any entries in the index_table. This is required due to
# differing window sizes:
# e.g. 0-20 significantly overlaps 10-100, but 10-100 does not
# significantly overlap 0-20.

# We will need to create a "middle" column for each genomic range.
# We will need to create copies of each table to do this, otherwise
# they will end up with this new column as a side effect of the
# function call.
index_copy <- copy(index_table)
lookup_copy <- copy(lookup_table)

index_copy[, middle := start + (end - start) * 0.5]
lookup_copy[, middle := start + (end - start) * 0.5]

# In the index_copy we will also need to create dummy columns for
# the check. We need to do this so we can find the appropriate
# genomic window from the index table when we do the second
# non-equi join, otherwise the start and end columns will be
# clobbered.
index_copy[, start_index := start]
index_copy[, end_index := end]

# If the middle of a genomic range in the index table falls within
# a genomic range in the lookup table, then that genomic entry from
# the index table has a significant overlap with the corresponding
# in the lookup table
index_overlaps <- index_copy[lookup_table,
on=.(chr, middle >= start, middle <= end),
nomatch=0]

# Test the reverse: does any entry in the lookup table
# significantly overlap with any of the genomic windows in the
# index table?
lookup_overlaps <- index_copy[lookup_copy,
on=.(chr, start_index <= middle, end_index >= middle),
nomatch=0]

# Remove extra columns created by the non-equi join:
index_overlaps <- index_overlaps[,.(chr, start, end)]
lookup_overlaps <- lookup_overlaps[,.(chr, start, end)]

# Combine results and remove any duplicates that arise, e.g.
# because a genomic window in the index_table significantly
# overlaps with multiple genomic windows in the lookup table, or
# because the overlap is significant in both comparisons (i.e.
# where both windows are the same size).
overlaps <- unique(rbind(index_overlaps, lookup_overlaps))

return(overlaps)
}

ranges_in_ct1 <- get_overlapping_ranges(index_table, ct1)
ranges_in_ct2 <- get_overlapping_ranges(index_table, ct2)
ranges_in_ct3 <- get_overlapping_ranges(index_table, ct3)

# Combine results, indicating which column each genomic range was
# found to overlap:
overlaps <- rbind(
CT1=ranges_in_ct1, CT2=ranges_in_ct2, CT3=ranges_in_ct3,
idcol="input_column"
)

# Recombine the chr, start, and end columns to the original format:
overlaps[, genomic_window := paste0(chr, ":", start, "-", end)]
overlaps[, c("chr", "start", "end") := NULL]

# Convert to the wide format, so that each input column either has a
# 1 or 0 if the genomic window overlaps with 50% any other found in
# that column
overlaps <- dcast(overlaps, genomic_window ~ input_column,
fun.aggregate = length)

# Convert back to a data.frame:
overlaps <- as.data.frame(overlaps)
rownames(overlaps) <- overlaps$genomic_window
overlaps <- overlaps[,-1]

# Reorder so we can double check against the desired output:
overlaps <- overlaps[rownames(output),]
print(overlaps)

This will generate (almost) the same output you've provided:

              CT1 CT2 CT3
chr1:200-400 1 1 0
chr1:800-970 1 0 0
chr2:300-700 1 1 0
chr1:250-450 1 1 0
chr2:200-500 1 1 0
chr2:600-1000 0 1 1
chr1:400-800 0 0 1
chr1:700-870 0 0 1
chr2:700-1400 0 1 1

The only difference is that chr1:700-870 has a 0 in the CT2 column: this is because it actually doesn't overlap any of the genomic windows in CT2, the only other window on chromosome 1 was chr1:250-450.

Find overlapping ranges in a dataframe and assign them values

A possible solution using the data.table-package:

# load the 'data.table'-package and convert 'input' to a data.table with 'setDT'
library(data.table)
setDT(input)

# reshape 'input' to long format and split the strings in 3 columns
DT <- melt(input, measure.vars = 1:3)[, c('chr','low','high') := tstrsplit(value, split = ':|-', type.convert = TRUE)
, by = variable][]

# create aggregation function; needed in the ast reshape step
f <- function(x) as.integer(length(x) > 0)

# cartesian self join & reshape result back to wide format with aggregation function
DT[DT, on = .(chr, low < high, high > low), allow.cartesian = TRUE
][, dcast(.SD, value ~ i.variable, fun = f)]

which gives:

           value CT1 CT2 CT3
1: chr1:200-400 1 1 0
2: chr1:250-450 1 1 1
3: chr1:400-800 0 1 1
4: chr1:700-870 1 0 1
5: chr1:800-970 1 0 1
6: chr2:200-500 1 1 0
7: chr2:300-700 1 1 0
8: chr2:600-1000 1 1 1
9: chr2:700-1400 0 1 1

R obtain matrix with overlap in ranges

Base R

out <- 100 * with(df, t((outer(stop, stop, pmin) - outer(start, start, pmax)) / (stop - start)))
dimnames(out) <- list(df$label, df$label)
out
# A B C
# A 100.00000 76.47059 100
# B 74.28571 100.00000 100
# C 20.00000 20.58824 100

tidyverse

library(dplyr)
library(tidyr)
expand_grid(Var1 = df$label, Var2 = df$label) %>%
left_join(df, by = c("Var1" = "label")) %>%
left_join(df, by = c("Var2" = "label")) %>%
mutate(
start = pmax(start.y, start.x),
stop = pmin(stop.x, stop.y),
overlap = 100 * (stop - start) / (stop.y - start.y)
) %>%
pivot_wider(Var1, names_from = Var2, values_from = overlap)
# # A tibble: 3 x 4
# Var1 A B C
# <chr> <dbl> <dbl> <dbl>
# 1 A 100 76.5 100
# 2 B 74.3 100 100
# 3 C 20 20.6 100

Aggregate column values of rows that have overlapping range of dates

We can group_by Person and do sum of total_amount which lies in between date1 and date2.

library(dplyr)

df %>%
mutate_at(vars(starts_with("date")), as.Date) %>%
group_by(person) %>%
mutate(overlap = purrr::map2_dbl(date1, date2,
~sum(total_amount[between(date1, .x, .y) | between(date2, .x, .y)])))

# person date1 date2 total_amount overlap
# <fct> <date> <date> <int> <dbl>
#1 A 2019-03-01 2019-03-16 50 150
#2 A 2019-03-10 2019-03-31 100 220
#3 A 2019-03-20 2019-03-31 70 170
#4 B 2019-03-01 2019-03-12 200 330
#5 B 2019-03-01 2019-03-20 130 430
#6 B 2019-03-16 2019-03-31 100 230

data

df <- structure(list(person = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("A", 
"B"), class = "factor"), date1 = structure(c(1L, 2L, 4L, 1L,
1L, 3L), .Label = c("2019-03-01", "2019-03-10", "2019-03-16",
"2019-03-20"), class = "factor"), date2 = structure(c(2L, 4L,
4L, 1L, 3L, 4L), .Label = c("2019-03-12", "2019-03-16", "2019-03-20",
"2019-03-31"), class = "factor"), total_amount = c(50L, 100L,
70L, 200L, 130L, 100L)), class = "data.frame", row.names = c(NA, -6L))

comparing and finding overlap range in R

You may also try foverlaps from data.table

library(data.table)
setkey(setDT(df1), start1, end1)
setkey(setDT(df2), start2, end2)
df1[,overlap:=foverlaps(df1, df2, which=TRUE)[, !is.na(yid),]+0]
df1
# start1 end1 overlap
#1: 1 6 1
#2: 6 8 0
#3: 9 12 1
#4: 13 15 1
#5: 15 19 1
#6: 19 20 0

How to remove overlap in numeric ranges (AWK)

Ok, since OP confirmed that the B 501 540 is typo, I post my answer :)

awk -v OFS="\t" '/^A/{s[NR]=$2;e[NR]=$3;l=NR}
/^B/{
for(i=1;i<=l;i++){
if(s[i]==$2){
s[i]=$3+1
break
}else if(e[i]==$3){
e[i]=$2-1
break
}
}
s[NR] = $2; e[NR]=$3
}
END{for(i=1;i<=NR;i++)print ((i<=l)?"A":"B"),s[i],e[i]}
' file

test with your file (the typo was fixed):

kent$  awk -v OFS="\t" '/^A/{s[NR]=$2;e[NR]=$3;l=NR}
/^B/{
for(i=1;i<=l;i++){
if(s[i]==$2){
s[i]=$3+1
break
}else if(e[i]==$3){
e[i]=$2-1
break
}
}
s[NR] = $2; e[NR]=$3
}
END{for(i=1;i<=NR;i++)print ((i<=l)?"A":"B"),s[i],e[i]}
' file
A 33 100
A 101 160
A 200 300
A 541 1100
A 1200 1249
A 1301 1318
A 1810 1919
B 0 32
B 500 540
B 1250 1300
B 1319 1340
B 1920 2000

EDIT for 6 columns:

dirty and quick, pls check the below example:

file:

kent$  cat file
A 0 100 1 2 3
A 101 160 4 5 6
A 200 300 7 8 9
A 500 1100 10 11 12
A 1200 1300 13 14 15
A 1301 1340 16 17 18
A 1810 2000 19 20 21
B 0 32 22 23 24
B 500 540 22 23 24
B 1250 1300 22 23 24
B 1319 1340 22 23 24
B 1920 2000 22 23 24

awk :

kent$  awk -v OFS="\t" '{s[NR]=$2;e[NR]=$3}
/^A/{l=NR}
/^B/{
for(i=1;i<=l;i++){
if(s[i]==$2){
s[i]=$3+1
break
}else if(e[i]==$3){
e[i]=$2-1
break
}
}
}
{r[NR]=$4OFS$5OFS$6}
END{for(i=1;i<=NR;i++)print ((i<=l)?"A":"B"),s[i],e[i],r[i]} ' file
A 33 100 1 2 3
A 101 160 4 5 6
A 200 300 7 8 9
A 541 1100 10 11 12
A 1200 1249 13 14 15
A 1301 1318 16 17 18
A 1810 1919 19 20 21
B 0 32 22 23 24
B 500 540 22 23 24
B 1250 1300 22 23 24
B 1319 1340 22 23 24
B 1920 2000 22 23 24

awk to find overlaps

Using Gnu Awk version 4, you could try:

gawk -f a.awk file file

where a.awk is:

NR==FNR {
if (FNR>1) {
a[$1][++i]=$2
b[$1][i]=$3
}
next
}
FNR==1 {
fmt="%-7s%-10s%-10s%-10s%-10s\n"
printf fmt,"Group","Start","End","NewStart","NewEnd"
}
FNR>1{
$4=$2; $5=$3
n=checkInside($1,$2,$3)
if (n>0) {
ff=0; x=$2; y=$3
for (i=1; i<=n; i++) {
ar=a[$1][R[i]]; br=b[$1][R[i]];
getIntersect($2,$3,ar,br)
getLargest($2,$3,ar,br)
ovl=((i2-i1)/($3-$2))*100;
ovr=((i2-i1)/(br-ar))*100;
if (ovl>50 && ovr>50) {
if (r1<x) x=r1
if (r2>y) y=r2
ff=1
}
}
if (ff) {
$4=x; $5=y
}
}
printf fmt,$1,$2,$3,$4,$5
}

function getLargest(x1,y1,x2,y2) {
r1=(x1<=x2)?x1:x2
r2=(y1>=y2)?y1:y2
}

function getIntersect(x1,y1,x2,y2) {
if (x1>=x2 && x1<=y2) {
i1=x1;
} else {
i1=x2;
}
i2=(y1<=y2)?y1:y2
}

function checkInside(g,x,y,i,j,x1,y1) {
R["x"]=0
for (i in a[g]) {
x1=a[g][i]; y1=b[g][i];
if ((x>=x1 && x<=y1) || (y>=x1 && y<=y1)) {
if (!(x==x1 && y==y1))
R[++j]=i
}
}
return j
}

Output:

Group  Start     End       NewStart  NewEnd    
chr1 117132092 118875009 117027758 119458215
chr1 117027758 119458215 117027758 119458215
chr1 103756473 104864582 103354114 104864582
chr1 105093795 106219211 105093795 106219211
chr1 103354114 104747251 102741437 105235140
chr1 102741437 105235140 102741437 105235140
chr1 100090254 101094139 100090254 101614730
chr1 100426977 101614730 100090254 101614730
chr2 86644663 87767193 86644663 87767193
chr2 82473711 83636545 82473711 83636545
chr2 83896702 85079032 83876122 85091910
chr2 83876122 85091910 83876122 85091910
chr2 82943211 84350917 82943211 84350917
chr3 89410051 90485635 89405753 90485635
chr3 89405753 90485635 89405753 90485635
chr3 86491492 87593215 86491492 87593215
chr3 82507157 83738004 82507157 83738004
chr3 85059618 86362254 85059618 86362254

Is there any way to do clustering or sorting of file according to two numeric range value columns?

Using a dedicated package - GenomicRanges. Notice: output ranges and widths are not as expected, as we are collapsing connected/overlapping regions into one, but the result is 4 rows as expected:

library(GenomicRanges)

#convert to Granges object
g <- makeGRangesFromDataFrame(df, seqnames.field = "name")

#reduce overlapping regions
gReduce <- reduce(g)

#if needed convert back to dataframe.
as.data.frame(gReduce)
# seqnames start end width strand
# 1 chr1 8480001 8480500 500 *
# 2 chr1 10006251 10006750 500 *
# 3 chr1 13910501 13910750 250 *
# 4 chr1 14841751 14842000 250 *


Related Topics



Leave a reply



Submit