Merge Overlapping Date Intervals

Merge consecutive and overlapping date ranges


  • within customer groupby("cust", as_index=False) look for overlapping dates in rows
  • cumsum() sums booleans to generate a group for overlapping dates
  • finally, a simple aggregation taking the min start and max end within each group

def _merge_customer_ranges(d):
    """Collapse overlapping or day-adjacent date ranges within one customer's rows.

    A new merged run starts whenever a row's start_date is more than one day
    after the previous (sorted) row's end_date; cumsum() over that boolean
    flag labels each run, and min/max aggregation collapses each run.
    """
    # BUG FIX: sort FIRST, then compute the run-break flag on the sorted frame.
    # The original computed shift() on the unsorted group, so the cumsum
    # group labels were wrong whenever the input rows were out of order.
    d = d.sort_values(["start_date", "end_date"])
    new_run = ~(d["start_date"] <= d["end_date"].shift() + pd.Timedelta(days=1))
    return d.groupby(["cust", new_run.cumsum()], as_index=False).agg(
        {"start_date": "min", "end_date": "max"}
    )


df.groupby("cust", as_index=False).apply(_merge_customer_ranges).reset_index(drop=True)






























      cust           start_date             end_date
0  CUST123  2021-01-01 00:00:00  2021-01-31 00:00:00
1  CUST123  2021-02-02 00:00:00  2021-02-28 00:00:00
2  CUST456  2021-01-05 00:00:00  2021-01-31 00:00:00

Java merge overlapping date intervals


/**
 * Merges overlapping (or exactly touching) unavailability blocks into a
 * minimal set of disjoint blocks.
 *
 * NOTE(review): the merge mutates blocks taken from the input set via
 * setBlockEndTime — callers holding references to the originals will see
 * the extended end times.
 */
private Set<AppointmentAvailabilityBlock> mergeUnavailabilitiesBlocks(
        Set<AppointmentAvailabilityBlock> appointmentComponentUnavailabilitiesBlock) {

    // Single-pass merging only works on blocks ordered by start time,
    // so copy the set into a sortable list first.
    List<AppointmentAvailabilityBlock> sorted =
            new LinkedList<>(appointmentComponentUnavailabilitiesBlock);
    sorted.sort(Comparator.comparing(AppointmentAvailabilityBlock::getBlockStartTime));

    LinkedList<AppointmentAvailabilityBlock> merged = new LinkedList<>();
    for (AppointmentAvailabilityBlock current : sorted) {
        boolean disjoint = merged.isEmpty()
                || merged.getLast().getBlockEndTime().isBefore(current.getBlockStartTime());
        if (disjoint) {
            // Strictly after the previous run ended: start a new merged block.
            merged.add(current);
        } else {
            // Overlapping or touching: extend the previous block to whichever
            // end time is later.
            AppointmentAvailabilityBlock last = merged.getLast();
            OffsetDateTime laterEnd =
                    last.getBlockEndTime().isAfter(current.getBlockEndTime())
                            ? last.getBlockEndTime()
                            : current.getBlockEndTime();
            last.setBlockEndTime(laterEnd);
        }
    }
    return new HashSet<AppointmentAvailabilityBlock>(merged);
}

Merge Overlapping Intervals in PySpark

You can use a window function to compare previous rows with the current row, to build a column that determines whether the current row is the start of a new interval, then sum over this column to build an interval id. Then you group by this interval id to get your final dataframe.

If you call input_df your input dataframe, the code will be as follows:

from pyspark.sql import Window
from pyspark.sql import functions as F

# Running window: every row from the start of the (ordered) frame up to and
# including the current row.
running_window = (
    Window
    .orderBy('start')
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Track the furthest 'end' seen so far, in start order.
with_running_max = input_df.withColumn(
    'max_previous_end', F.max('end').over(running_window)
)

# A row opens a new interval when its start lies strictly beyond every end
# seen before it (lag of the running max); flag such rows with 1, else 0.
flagged = with_running_max.withColumn(
    'interval_change',
    F.when(
        F.col('start') > F.lag('max_previous_end').over(Window.orderBy('start')),
        F.lit(1)
    ).otherwise(F.lit(0))
)

# Running sum of the flags yields a stable id per merged interval; group on
# it and collapse each interval to its member ids, earliest start, latest end.
result = (
    flagged
    .withColumn('interval_id', F.sum('interval_change').over(running_window))
    .drop('interval_change', 'max_previous_end')
    .groupBy('interval_id')
    .agg(
        F.collect_list('id').alias('ids'),
        F.min('start').alias('start'),
        F.max('end').alias('end')
    )
    .drop('interval_id')
)

So you can merge your intervals without any user-defined function. However, every time we use a window, the code is executed on only one executor, as our windows don't have partitions.

How to flatten / merge overlapping time periods

Here's a possible solution. The basic idea here is to compare lagged start date with the maximum end date "until now" using the cummax function and create an index that will separate the data into groups

# Collapse overlapping/contiguous periods per ID. indx increments each time a
# row's start falls beyond the running maximum end seen so far (cummax), i.e.
# each time a new non-overlapping run begins.
data %>%
arrange(ID, start) %>% # as suggested by @Jonno in case the data is unsorted
group_by(ID) %>%
# Compare the NEXT row's start (lead) with the running max end; prepend 0 and
# drop the last element ([-n()]) so the cumulative flag aligns with each row.
mutate(indx = c(0, cumsum(as.numeric(lead(start)) >
cummax(as.numeric(end)))[-n()])) %>%
group_by(ID, indx) %>%
# One row per merged run: earliest start, last end.
# NOTE(review): last(end) assumes end is non-decreasing within a run after
# sorting by start; the cummax above suggests that may not hold, in which
# case max(end) would be the safer aggregate — TODO confirm with the data.
summarise(start = first(start), end = last(end))

# Source: local data frame [3 x 4]
# Groups: ID
#
# ID indx start end
# 1 A 0 2013-01-01 2013-01-06
# 2 A 1 2013-01-07 2013-01-11
# 3 A 2 2013-01-12 2013-01-15

Merge overlapping dates in SQL Server

SQL DEMO

-- Sample data: per-name datetime ranges; B and C contain overlapping rows.
declare @t table (Name varchar(100),  Datetime_Start  datetime,  Datetime_End datetime);
insert into @t values
('A' , '2017-01-02 00:00' , '2017-03-28 00:10'),
('A' , '2017-05-14 23:50' , '2017-05-29 23:50'),
('B' , '2017-05-18 00:00' , '2017-05-18 04:00'),
('B' , '2017-05-18 02:00' , '2017-05-18 03:00'),
('C' , '2017-01-02 00:00' , '2017-01-17 15:50'),
('C' , '2017-01-14 03:50' , '2017-01-28 15:50');

-- Candidate starts of merged intervals: a start qualifies when no other row
-- of the same name starts earlier AND still covers it (ends at/after it).
with Datetime_Starts as
(
select distinct name, Datetime_Start
from @t as t1
where not exists
(select * from @t as t2
where t2.name = t1.name
and t2.Datetime_Start < t1.Datetime_Start
and t2.Datetime_End >= t1.Datetime_Start)
),
-- Candidate ends: the mirror image — an end qualifies when no other row of
-- the same name ends later while starting at/before it.
Datetime_Ends as
(
select distinct name, Datetime_End
from @t as t1
where not exists
(select * from @t as t2
where t2.name = t1.name
and t2.Datetime_End > t1.Datetime_End
and t2.Datetime_Start <= t1.Datetime_End)
)

-- Pair each merged-interval start with the earliest candidate end that does
-- not precede it, yielding one row per merged interval.
select name, Datetime_Start,
(select min(Datetime_End)
from Datetime_Ends as e
where e.name = s.name
and Datetime_End >= Datetime_Start) as Datetime_End
from Datetime_Starts as s;


Related Topics



Leave a reply



Submit