Processing The Input File Based on Range Overlap

Processing the input file based on range overlap

To do this requires many steps, and a number of concepts from the data.table package, most notably non-equi joins. I've commented the code throughout, and recommend running it step by step if you want more insight:

library(data.table)

input <- structure(list(CT1 = structure(1:3, .Label = 
  c("chr1:200-400", "chr1:800-970", "chr2:300-700"), class = 
  "factor"), CT2 = structure(1:3, .Label = c("chr1:250-450", 
  "chr2:200-500", "chr2:600-1000"), class = "factor"), CT3 = 
  structure(1:3, .Label = c("chr1:400-800", "chr1:700-870", 
  "chr2:700-1400"), class = "factor")), .Names = c("CT1", 
  "CT2", "CT3"), class = "data.frame", row.names = c(NA, -3L))

output <- structure(list(CT1 = c(1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L),
  CT2 = c(1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L), CT3 = c(0L, 0L, 0L, 
  0L, 0L, 1L, 1L, 1L, 1L)), .Names = c("CT1", "CT2", "CT3"), class = 
  "data.frame", row.names = c("chr1:200-400", "chr1:800-970", 
  "chr2:300-700", "chr1:250-450", "chr2:200-500", "chr2:600-1000",
  "chr1:400-800", "chr1:700-870", "chr2:700-1400"))

# Builds a data.table by breaking a string like "chr1:300-700" into 
# three columns: chr, start, and end.
split_genomic_range <- function(str) {
  chr <- gsub(":.*", "", str)
  start <- gsub("-.*", "", gsub(".*:", "", str))
  end <- gsub(".*-", "", str)

  start <- as.numeric(start)
  end <- as.numeric(end)

  return(data.table(chr=chr, start=start, end=end))
}

# First break the input data.table into three new tables - we will need
# to perform non-equi joins of the index table (column CT1 in input) to
# the tables built from the other two columns.
ct1 <- split_genomic_range(input$CT1)
ct2 <- split_genomic_range(input$CT2)
ct3 <- split_genomic_range(input$CT3)

# Create an index table with all genomic ranges, then check for 
# overlaps in each of the three tables created from the input
# columns:
index_table <- unique(rbind(ct1, ct2, ct3))

# Returns entries from the index_table if they overlap > 50% any 
# entries in the lookup table or vice-versa
get_overlapping_ranges <- function(index_table, lookup_table) {
  # This function does two non-equi joins. First, it checks whether 
  # any entries in the index_table have a 50% overlap with any 
  # entries in  the lookup table. Second, it does the reverse, and  
  # checks whether any entries in the lookup_table have a 50% overlap 
  # with any entries in the index_table. This is required due to 
  # differing window sizes:
  # e.g. 0-20 significantly overlaps 10-100, but 10-100 does not 
  # significantly overlap 0-20.

  # We will need to create a "middle" column for each genomic range.
  # We will need to create copies of each table to do this, otherwise
  # they will end up with this new column as a side effect of the 
  # function call.
  index_copy <- copy(index_table)
  lookup_copy <- copy(lookup_table)

  index_copy[, middle := start + (end - start) * 0.5]
  lookup_copy[, middle := start + (end - start) * 0.5]

  # In the index_copy we will also need to create dummy columns for
  # the check. We need to do this so we can find the appropriate 
  # genomic window from the index table when we do the second  
  # non-equi join, otherwise the start and end columns will be 
  # clobbered. 
  index_copy[, start_index := start]
  index_copy[, end_index := end]

  # If the middle of a genomic range in the index table falls within 
  # a genomic range in the lookup table, then that genomic entry from 
  # the index table has a significant overlap with the corresponding 
  # in the lookup table
  index_overlaps <- index_copy[lookup_table, 
    on=.(chr, middle >= start, middle <= end),
    nomatch=0]

  # Test the reverse: does any entry in the lookup table 
  # significantly  overlap with any of the genomic windows in the 
  # index table?
  lookup_overlaps <- index_copy[lookup_copy,
    on=.(chr, start_index <= middle, end_index >= middle),
    nomatch=0]

  # Remove extra columns created by the non-equi join:
  index_overlaps <- index_overlaps[,.(chr, start, end)]
  lookup_overlaps <- lookup_overlaps[,.(chr, start, end)]

  # Combine results and remove any duplicates that arise, e.g. 
  # because a genomic window in the index_table significantly 
  # overlaps with multiple genomic windows in the lookup table, or 
  # because the overlap is significant in both comparisons (i.e. 
  # where both windows are the same size).
  overlaps <- unique(rbind(index_overlaps, lookup_overlaps))

  return(overlaps)
}

ranges_in_ct1 <- get_overlapping_ranges(index_table, ct1)
ranges_in_ct2 <- get_overlapping_ranges(index_table, ct2)
ranges_in_ct3 <- get_overlapping_ranges(index_table, ct3)

# Combine results, indicating which column each genomic range was 
# found to overlap:
overlaps <- rbind(
  CT1=ranges_in_ct1, CT2=ranges_in_ct2, CT3=ranges_in_ct3,
  idcol="input_column"
) 

# Recombine the chr, start, and end columns to the original format:
overlaps[, genomic_window := paste0(chr, ":", start, "-", end)]
overlaps[, c("chr", "start", "end") := NULL]

# Convert to the wide format, so that each input column either has a  
# 1 or 0 if the genomic window overlaps with 50% any other found in 
# that column
overlaps <- dcast(overlaps, genomic_window ~ input_column, 
                  fun.aggregate = length)

# Convert back to a data.frame:
overlaps <- as.data.frame(overlaps)
rownames(overlaps) <- overlaps$genomic_window
overlaps <- overlaps[,-1]

# Reorder so we can double check against the desired output:
overlaps <- overlaps[rownames(output),]
print(overlaps)

This will generate (almost) the same output you've provided:

              CT1 CT2 CT3
chr1:200-400    1   1   0
chr1:800-970    1   0   0
chr2:300-700    1   1   0
chr1:250-450    1   1   0
chr2:200-500    1   1   0
chr2:600-1000   0   1   1
chr1:400-800    0   0   1
chr1:700-870    0   0   1
chr2:700-1400   0   1   1

The only difference is that chr1:700-870 has a 0 in the CT2 column: this is because it actually doesn't overlap any of the genomic windows in CT2, the only other window on chromosome 1 was chr1:250-450.

Find overlapping ranges in a dataframe and assign them values

A possible solution using the data.table-package:

# load the 'data.table'-package and convert 'input' to a data.table with 'setDT'
library(data.table)
setDT(input)

# reshape 'input' to long format and split the strings in 3 columns
DT <- melt(input, measure.vars = 1:3)[, c('chr','low','high') := tstrsplit(value, split = ':|-', type.convert = TRUE)
                                      , by = variable][]

# create aggregation function; needed in the ast reshape step
f <- function(x) as.integer(length(x) > 0)

# cartesian self join & reshape result back to wide format with aggregation function
DT[DT, on = .(chr, low < high, high > low), allow.cartesian = TRUE
   ][, dcast(.SD, value ~ i.variable, fun = f)]

which gives:

           value CT1 CT2 CT3
1:  chr1:200-400   1   1   0
2:  chr1:250-450   1   1   1
3:  chr1:400-800   0   1   1
4:  chr1:700-870   1   0   1
5:  chr1:800-970   1   0   1
6:  chr2:200-500   1   1   0
7:  chr2:300-700   1   1   0
8: chr2:600-1000   1   1   1
9: chr2:700-1400   0   1   1

R obtain matrix with overlap in ranges

Base R

out <- 100 * with(df, t((outer(stop, stop, pmin) - outer(start, start, pmax)) / (stop - start)))
dimnames(out) <- list(df$label, df$label)
out
#           A         B   C
# A 100.00000  76.47059 100
# B  74.28571 100.00000 100
# C  20.00000  20.58824 100

tidyverse

library(dplyr)
library(tidyr)
expand_grid(Var1 = df$label, Var2 = df$label) %>%
  left_join(df, by = c("Var1" = "label")) %>%
  left_join(df, by = c("Var2" = "label")) %>%
  mutate(
    start = pmax(start.y, start.x),
    stop  = pmin(stop.x, stop.y),
    overlap = 100 * (stop - start) / (stop.y - start.y)
  ) %>%
  pivot_wider(Var1, names_from = Var2, values_from = overlap)
# # A tibble: 3 x 4
#   Var1      A     B     C
#   <chr> <dbl> <dbl> <dbl>
# 1 A     100    76.5   100
# 2 B      74.3 100     100
# 3 C      20    20.6   100

Aggregate column values of rows that have overlapping range of dates

We can group_by Person and do sum of total_amount which lies in between date1 and date2.

library(dplyr)

df %>%
  mutate_at(vars(starts_with("date")),  as.Date) %>%
  group_by(person) %>%
  mutate(overlap = purrr::map2_dbl(date1, date2, 
             ~sum(total_amount[between(date1, .x, .y) | between(date2, .x, .y)])))

#  person date1      date2      total_amount overlap
#  <fct>  <date>     <date>            <int>   <dbl>
#1 A      2019-03-01 2019-03-16           50     150
#2 A      2019-03-10 2019-03-31          100     220
#3 A      2019-03-20 2019-03-31           70     170
#4 B      2019-03-01 2019-03-12          200     330
#5 B      2019-03-01 2019-03-20          130     430
#6 B      2019-03-16 2019-03-31          100     230

data

df <- structure(list(person = structure(c(1L, 1L, 1L, 2L, 2L, 2L), .Label = c("A", 
"B"), class = "factor"), date1 = structure(c(1L, 2L, 4L, 1L, 
1L, 3L), .Label = c("2019-03-01", "2019-03-10", "2019-03-16", 
"2019-03-20"), class = "factor"), date2 = structure(c(2L, 4L, 
4L, 1L, 3L, 4L), .Label = c("2019-03-12", "2019-03-16", "2019-03-20", 
"2019-03-31"), class = "factor"), total_amount = c(50L, 100L, 
70L, 200L, 130L, 100L)), class = "data.frame", row.names = c(NA, -6L))

comparing and finding overlap range in R

You may also try foverlaps from data.table

library(data.table)
setkey(setDT(df1), start1, end1)
setkey(setDT(df2), start2, end2)
df1[,overlap:=foverlaps(df1, df2, which=TRUE)[, !is.na(yid),]+0]
df1
#   start1 end1 overlap
#1:      1    6       1
#2:      6    8       0
#3:      9   12       1
#4:     13   15       1
#5:     15   19       1
#6:     19   20       0

How to remove overlap in numeric ranges (AWK)

Ok, since OP confirmed that the B 501 540 is typo, I post my answer :)

awk -v OFS="\t" '/^A/{s[NR]=$2;e[NR]=$3;l=NR}
/^B/{ 
        for(i=1;i<=l;i++){
                if(s[i]==$2){
                        s[i]=$3+1
                        break
                }else if(e[i]==$3){
                        e[i]=$2-1
                        break
                }
        }
        s[NR] = $2; e[NR]=$3
}
END{for(i=1;i<=NR;i++)print ((i<=l)?"A":"B"),s[i],e[i]}
        ' file

test with your file (the typo was fixed):

kent$  awk -v OFS="\t" '/^A/{s[NR]=$2;e[NR]=$3;l=NR}
/^B/{ 
        for(i=1;i<=l;i++){
                if(s[i]==$2){
                        s[i]=$3+1
                        break
                }else if(e[i]==$3){
                        e[i]=$2-1
                        break
                }
        }
        s[NR] = $2; e[NR]=$3
}
END{for(i=1;i<=NR;i++)print ((i<=l)?"A":"B"),s[i],e[i]}
        ' file
    A       33      100
    A       101     160
    A       200     300
    A       541     1100
    A       1200    1249
    A       1301    1318
    A       1810    1919
    B       0       32
    B       500     540
    B       1250    1300
    B       1319    1340
    B       1920    2000

EDIT for 6 columns:

dirty and quick, pls check the below example:

file:

kent$  cat file
A   0       100 1 2 3
A   101     160 4 5 6
A   200     300 7 8 9
A   500     1100 10 11 12
A   1200    1300 13 14 15
A   1301    1340 16 17 18
A   1810    2000 19 20 21
B   0       32  22 23 24
B   500     540 22 23 24
B   1250    1300 22 23 24
B   1319    1340 22 23 24
B   1920    2000 22 23 24

awk :

kent$  awk -v OFS="\t" '{s[NR]=$2;e[NR]=$3}
/^A/{l=NR}
/^B/{ 
        for(i=1;i<=l;i++){
                if(s[i]==$2){
                        s[i]=$3+1
                        break
                }else if(e[i]==$3){
                        e[i]=$2-1
                        break
                }
        }
}
{r[NR]=$4OFS$5OFS$6}
END{for(i=1;i<=NR;i++)print ((i<=l)?"A":"B"),s[i],e[i],r[i]} ' file
A       33      100     1       2       3
A       101     160     4       5       6
A       200     300     7       8       9
A       541     1100    10      11      12
A       1200    1249    13      14      15
A       1301    1318    16      17      18
A       1810    1919    19      20      21
B       0       32      22      23      24
B       500     540     22      23      24
B       1250    1300    22      23      24
B       1319    1340    22      23      24
B       1920    2000    22      23      24

awk to find overlaps

Using Gnu Awk version 4, you could try:

gawk -f a.awk file file

where a.awk is:

NR==FNR {
    if (FNR>1) {
        a[$1][++i]=$2
        b[$1][i]=$3
    }
    next
}
FNR==1 {
    fmt="%-7s%-10s%-10s%-10s%-10s\n"
    printf fmt,"Group","Start","End","NewStart","NewEnd" 
}
FNR>1{
    $4=$2; $5=$3
    n=checkInside($1,$2,$3)
    if (n>0) {
        ff=0; x=$2; y=$3
        for (i=1; i<=n; i++) {
            ar=a[$1][R[i]]; br=b[$1][R[i]];
            getIntersect($2,$3,ar,br)
            getLargest($2,$3,ar,br)
            ovl=((i2-i1)/($3-$2))*100;
            ovr=((i2-i1)/(br-ar))*100;
            if (ovl>50 && ovr>50) {
                if (r1<x) x=r1
                if (r2>y) y=r2
                ff=1
            }
        }
        if (ff) {
            $4=x; $5=y
        }
    }
    printf fmt,$1,$2,$3,$4,$5
}

function getLargest(x1,y1,x2,y2) {
    r1=(x1<=x2)?x1:x2
    r2=(y1>=y2)?y1:y2
}

function getIntersect(x1,y1,x2,y2) {
    if (x1>=x2 && x1<=y2) {
        i1=x1;
    } else {
        i1=x2;
    }
    i2=(y1<=y2)?y1:y2
}

function checkInside(g,x,y,i,j,x1,y1) {
    R["x"]=0
    for (i in a[g]) {
        x1=a[g][i]; y1=b[g][i];
        if ((x>=x1 && x<=y1) || (y>=x1 && y<=y1)) {
            if (!(x==x1 && y==y1))
                R[++j]=i
        }
    }
    return j
}

Output:

Group  Start     End       NewStart  NewEnd    
chr1   117132092 118875009 117027758 119458215 
chr1   117027758 119458215 117027758 119458215 
chr1   103756473 104864582 103354114 104864582 
chr1   105093795 106219211 105093795 106219211 
chr1   103354114 104747251 102741437 105235140 
chr1   102741437 105235140 102741437 105235140 
chr1   100090254 101094139 100090254 101614730 
chr1   100426977 101614730 100090254 101614730 
chr2   86644663  87767193  86644663  87767193  
chr2   82473711  83636545  82473711  83636545  
chr2   83896702  85079032  83876122  85091910  
chr2   83876122  85091910  83876122  85091910  
chr2   82943211  84350917  82943211  84350917  
chr3   89410051  90485635  89405753  90485635  
chr3   89405753  90485635  89405753  90485635  
chr3   86491492  87593215  86491492  87593215  
chr3   82507157  83738004  82507157  83738004  
chr3   85059618  86362254  85059618  86362254

Is there any way to do clustering or sorting of file according to two numeric range value columns?

Using a dedicated package - GenomicRanges. Notice: output ranges and widths are not as expected, as we are collapsing connected/overlapping regions into one, but the result is 4 rows as expected:

library(GenomicRanges)

#convert to Granges object
g <- makeGRangesFromDataFrame(df, seqnames.field = "name")

#reduce overlapping regions
gReduce <- reduce(g)

#if needed convert back to dataframe.
as.data.frame(gReduce)
#     seqnames    start      end width strand
#   1     chr1  8480001  8480500   500      *
#   2     chr1 10006251 10006750   500      *
#   3     chr1 13910501 13910750   250      *
#   4     chr1 14841751 14842000   250      *

Processing The Input File Based on Range Overlap