Merge consecutive and overlapping date ranges
The idea: group by customer (`groupby("cust", as_index=False)`), then look for rows whose dates overlap (or directly follow) the previous row's range. A `cumsum()` over the negated booleans assigns a group id to each run of overlapping dates; finally, take the simple min/max of the dates within each group.
df.groupby("cust", as_index=False).apply(
    lambda d: d.sort_values(["start_date", "end_date"])
    .groupby(
        ["cust", (~(d["start_date"] <= (d["end_date"].shift() + pd.Timedelta(days=1)))).cumsum()],
        as_index=False,
    )
    .agg({"start_date": "min", "end_date": "max"})
).reset_index(drop=True)
| | cust | start_date | end_date |
|---|---|---|---|
| 0 | CUST123 | 2021-01-01 00:00:00 | 2021-01-31 00:00:00 |
| 1 | CUST123 | 2021-02-02 00:00:00 | 2021-02-28 00:00:00 |
| 2 | CUST456 | 2021-01-05 00:00:00 | 2021-01-31 00:00:00 |
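As a self-contained check of the approach above, here is a sketch on assumed sample data (the input rows below are made up to reproduce the output shown; `merge_ranges` is an illustrative helper name, not part of the original answer):

```python
import pandas as pd

df = pd.DataFrame({
    "cust": ["CUST123", "CUST123", "CUST123", "CUST456"],
    "start_date": pd.to_datetime(["2021-01-01", "2021-01-15", "2021-02-02", "2021-01-05"]),
    "end_date": pd.to_datetime(["2021-01-20", "2021-01-31", "2021-02-28", "2021-01-05"]).where(
        pd.Series([True, True, True, False]),
        pd.to_datetime("2021-01-31"),
    ),
})

def merge_ranges(d):
    # Sort, then start a new group whenever the current range neither
    # overlaps nor directly follows (within one day) the previous end.
    d = d.sort_values(["start_date", "end_date"])
    new_group = ~(d["start_date"] <= d["end_date"].shift() + pd.Timedelta(days=1))
    return (d.groupby(new_group.cumsum())
             .agg({"start_date": "min", "end_date": "max"})
             .reset_index(drop=True))

result = (df.groupby("cust")
            .apply(merge_ranges)
            .reset_index(level=1, drop=True)
            .reset_index())
```

The three resulting rows match the table above: the two overlapping CUST123 ranges collapse into January, the February range stays separate, and CUST456 keeps its single range.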
Java merge overlapping date intervals
private Set<AppointmentAvailabilityBlock> mergeUnavailabilitiesBlocks(
        Set<AppointmentAvailabilityBlock> appointmentComponentUnavailabilitiesBlock) {
    // Transform Set to List
    List<AppointmentAvailabilityBlock> intervals =
            new LinkedList<>(appointmentComponentUnavailabilitiesBlock);
    // Sort by blockStartTime
    intervals.sort(Comparator.comparing(AppointmentAvailabilityBlock::getBlockStartTime));
    // Merge
    LinkedList<AppointmentAvailabilityBlock> merged = new LinkedList<>();
    for (AppointmentAvailabilityBlock interval : intervals) {
        if (merged.isEmpty() || merged.getLast().getBlockEndTime().isBefore(interval.getBlockStartTime())) {
            // No overlap with the previous interval, append it.
            merged.add(interval);
        } else {
            // Overlap: extend the previous interval to the later end time.
            OffsetDateTime maxOffsetDateTime = merged.getLast().getBlockEndTime().isAfter(interval.getBlockEndTime())
                    ? merged.getLast().getBlockEndTime()
                    : interval.getBlockEndTime();
            merged.getLast().setBlockEndTime(maxOffsetDateTime);
        }
    }
    return new HashSet<>(merged);
}
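The loop above is the standard sorted-sweep merge. Stripped of the domain types, the same idea can be sketched in Python with plain `(start, end)` tuples (`merge_intervals` is an illustrative name):

```python
def merge_intervals(intervals):
    """Merge overlapping (start, end) intervals; values need only be comparable."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # Overlap with the previous interval: extend it to the later end.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            # No overlap: start a new merged interval.
            merged.append((start, end))
    return merged

merge_intervals([(1, 5), (2, 3), (7, 9)])  # -> [(1, 5), (7, 9)]
```

Note that, like the Java version, this treats touching intervals (`start == previous end`) as overlapping.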
Merge Overlapping Intervals in PySpark
You can use a window function to compare the current row with previous rows and build a column that determines whether the current row starts a new interval, then sum over that column to build an interval id. Finally, group by this interval id to get the final dataframe.
If your input dataframe is called input_df, the code is as follows:
from pyspark.sql import Window
from pyspark.sql import functions as F

all_previous_rows_window = Window \
    .orderBy('start') \
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)

result = input_df \
    .withColumn('max_previous_end', F.max('end').over(all_previous_rows_window)) \
    .withColumn('interval_change', F.when(
        F.col('start') > F.lag('max_previous_end').over(Window.orderBy('start')),
        F.lit(1)
    ).otherwise(F.lit(0))) \
    .withColumn('interval_id', F.sum('interval_change').over(all_previous_rows_window)) \
    .drop('interval_change', 'max_previous_end') \
    .groupBy('interval_id') \
    .agg(
        F.collect_list('id').alias('ids'),
        F.min('start').alias('start'),
        F.max('end').alias('end')
    ).drop('interval_id')
So you can merge your intervals without any user-defined function. However, because these windows have no partitions, each window computation is executed on a single executor.
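To make the window columns concrete, here is the same running-maximum logic in plain Python (a sketch, not Spark code; `interval_ids` is an illustrative name):

```python
def interval_ids(rows):
    """rows: list of (start, end) pairs, pre-sorted by start.
    Returns one interval id per row, mirroring the Spark columns:
    max_previous_end, interval_change, and the running sum interval_id."""
    ids = []
    max_prev_end = None   # running max of 'end' over strictly earlier rows
    current_id = 0
    for start, end in rows:
        if max_prev_end is not None and start > max_prev_end:
            current_id += 1   # interval_change = 1: a gap starts a new interval
        ids.append(current_id)
        max_prev_end = end if max_prev_end is None else max(max_prev_end, end)
    return ids

interval_ids([(1, 5), (2, 3), (7, 9)])  # -> [0, 0, 1]
```

Grouping rows by these ids and taking min(start)/max(end) per group reproduces the Spark aggregation.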
How to flatten / merge overlapping time periods
Here's a possible solution. The basic idea is to compare each interval's start date with the running maximum end date "until now", computed with the cummax function, and create an index that separates the data into groups.
data %>%
  arrange(ID, start) %>% # as suggested by @Jonno in case the data is unsorted
  group_by(ID) %>%
  mutate(indx = c(0, cumsum(as.numeric(lead(start)) >
                            cummax(as.numeric(end)))[-n()])) %>%
  group_by(ID, indx) %>%
  summarise(start = first(start), end = max(end)) # max(end): the last row of a
                                                  # group may end before an
                                                  # interval that contains it
# Source: local data frame [3 x 4]
# Groups: ID
#
# ID indx start end
# 1 A 0 2013-01-01 2013-01-06
# 2 A 1 2013-01-07 2013-01-11
# 3 A 2 2013-01-12 2013-01-15
Merge overlapping dates in SQL Server
SQL DEMO
declare @t table (Name varchar(100), Datetime_Start datetime, Datetime_End datetime);

insert into @t values
('A', '2017-01-02 00:00', '2017-03-28 00:10'),
('A', '2017-05-14 23:50', '2017-05-29 23:50'),
('B', '2017-05-18 00:00', '2017-05-18 04:00'),
('B', '2017-05-18 02:00', '2017-05-18 03:00'),
('C', '2017-01-02 00:00', '2017-01-17 15:50'),
('C', '2017-01-14 03:50', '2017-01-28 15:50');

with Datetime_Starts as
(
    select distinct name, Datetime_Start
    from @t as t1
    where not exists
        (select * from @t as t2
         where t2.name = t1.name
           and t2.Datetime_Start < t1.Datetime_Start
           and t2.Datetime_End >= t1.Datetime_Start)
),
Datetime_Ends as
(
    select distinct name, Datetime_End
    from @t as t1
    where not exists
        (select * from @t as t2
         where t2.name = t1.name
           and t2.Datetime_End > t1.Datetime_End
           and t2.Datetime_Start <= t1.Datetime_End)
)
select name, Datetime_Start,
       (select min(Datetime_End)
        from Datetime_Ends as e
        where e.name = s.name
          and Datetime_End >= Datetime_Start) as Datetime_End
from Datetime_Starts as s;
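The query uses only standard SQL (CTEs and correlated NOT EXISTS), so the same gaps-and-islands logic can be exercised outside SQL Server, e.g. through Python's sqlite3. This sketch replaces the table variable with an ordinary table; the fixed-width timestamp strings compare correctly as text:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript("""
create table t (Name text, Datetime_Start text, Datetime_End text);
insert into t values
 ('A', '2017-01-02 00:00', '2017-03-28 00:10'),
 ('A', '2017-05-14 23:50', '2017-05-29 23:50'),
 ('B', '2017-05-18 00:00', '2017-05-18 04:00'),
 ('B', '2017-05-18 02:00', '2017-05-18 03:00'),
 ('C', '2017-01-02 00:00', '2017-01-17 15:50'),
 ('C', '2017-01-14 03:50', '2017-01-28 15:50');
""")

rows = conn.execute("""
with starts as (
  -- starts not covered by an earlier, still-open interval of the same name
  select distinct Name, Datetime_Start from t as t1
  where not exists (
    select 1 from t as t2
    where t2.Name = t1.Name
      and t2.Datetime_Start < t1.Datetime_Start
      and t2.Datetime_End >= t1.Datetime_Start)
),
ends as (
  -- ends not covered by a later-closing, overlapping interval of the same name
  select distinct Name, Datetime_End from t as t1
  where not exists (
    select 1 from t as t2
    where t2.Name = t1.Name
      and t2.Datetime_End > t1.Datetime_End
      and t2.Datetime_Start <= t1.Datetime_End)
)
select s.Name, s.Datetime_Start,
       (select min(Datetime_End) from ends as e
        where e.Name = s.Name
          and e.Datetime_End >= s.Datetime_Start) as Datetime_End
from starts as s
order by s.Name, s.Datetime_Start
""").fetchall()
```

On this data the query yields four merged rows: A keeps its two disjoint ranges, B's contained range collapses into 00:00-04:00, and C's two overlapping ranges merge into 2017-01-02 through 2017-01-28 15:50.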