Merge Two Dataframes If Timestamp of X Is Within Time Interval of Y

Merge two dataframes if timestamp of x is within time interval of y

You could use sqldf:

library(sqldf)

# Left join: every row of DF1 is kept, and the DF2 rows whose
# [onebef, oneaft] window contains the DF1 timestamp (both bounds inclusive)
# are attached to it.
df <- sqldf(
'select d1.*,d2.*
from DF1 d1
left join DF2 d2
on d1.Timestamp >= d2.onebef
AND d1.Timestamp <= d2.oneaft
'
)
df

Merging two dataframes with different timestamp (different time interval)

Let's try pd.cut:

# Assign every df2 timestamp to the half-open bin [t_i, t_{i+1}) whose left
# edge is a df1 timestamp; each bin is labelled with that left edge.
# NOTE(review): assumes df1['time'] is sorted ascending and unique (pd.cut
# needs increasing bin edges) and that no df2 timestamp reaches the 2050-01-01
# sentinel that closes the last bin -- confirm against the real data.
lower_bounds = pd.cut(
    df2['time'],
    bins=list(df1['time']) + [pd.to_datetime('2050-01-01')],
    right=False,
    include_lowest=True,
    labels=df1['time'],
)

# Mean rating per bin, re-ordered to match df1 row by row.
# `observed=False` is spelled out because grouping by a categorical without it
# relies on a deprecated default and emits a FutureWarning on recent pandas.
# Bins that received no rating end up as NaN.
df1['ratings'] = (
    df2.groupby(lower_bounds, observed=False)['rating']
    .mean()
    .reindex(df1['time'])
    .values
)

Or you can use merge_asof:

# Backward as-of match: each df2 row is paired with the latest df1 row whose
# 'time' is not after it, then the ratings are averaged per matched df1 row.
# The result is keyed by the 'index' column produced by reset_index(), so the
# assignment aligns it back onto df1's index.
# NOTE(review): merge_asof expects both frames sorted by 'time', and the
# 'index' column name presumes df1's index is unnamed -- confirm.
df1['ratings'] = (
    pd.merge_asof(
        left=df2,
        right=df1.reset_index(),
        on='time',
        direction='backward',
    )
    .groupby('index')['rating']
    .mean()
)

Merge two dataframes with different time ranges by repeating values within time range in R

You can use a rolling join

library(data.table)

# Convert both inputs to data.tables by reference.
setDT(df1)
setDT(df2)

# Rolling join on `time`: for every row of df1, take the df2 row with the same
# time or, failing that, the most recent earlier one (last observation carried
# forward, with no limit on how far it is carried).
df2[df1, on = "time", roll = Inf]

# time Temp.ar Ur.ar Vel.Vento x y z
# 1: 2019-12-11 06:08:03 14.79 78.5 1.147 -52 -39 -35
# 2: 2019-12-11 06:08:08 14.79 78.5 1.147 -47 -57 -36
# 3: 2019-12-11 06:08:13 14.79 78.5 1.147 -39 2 -40
# 4: 2019-12-11 06:10:20 14.74 78.9 1.045 -45 -23 -29
# 5: 2019-12-11 06:10:29 14.74 78.9 1.045 -51 -11 -31
# 6: 2019-12-11 06:20:34 14.90 78.9 1.009 -69 -28 -19

Data used

# Sample input for the rolling join: sensor readings (x, y, z) with their
# time of day.
df1 <- fread(text = '
x y z time
-52 -39 -35 06:08:03
-47 -57 -36 06:08:08
-39 2 -40 06:08:13
-45 -23 -29 06:10:20
-51 -11 -31 06:10:29
-69 -28 -19 06:20:34
')

# Sample input for the rolling join: weather measurements taken every ten
# minutes.
df2 <- fread(text = '
time Temp.ar Ur.ar Vel.Vento
06:00:00 14.79 78.5 1.147
06:10:00 14.74 78.9 1.045
06:20:00 14.9 78.9 1.009
06:30:00 15.14 78.6 1.076
06:40:00 15.32 77.8 1.332
06:50:00 15.6 76.5 1.216
')

Merge two dataframes based on similar time interval in R

You could use foverlaps from the data.table package

library(data.table)

# Turn both inputs into data.tables in place.
setDT(df1)
setDT(df2)

# Convert the interval bounds of both tables to POSIXct so they can be
# compared as date-times.
df1[, `:=`(START = as.POSIXct(START), END = as.POSIXct(END))]
df2[, `:=`(start_time = as.POSIXct(start_time), endtime = as.POSIXct(endtime))]

# The second table must be keyed, with the interval start/end as the last two
# key columns.
setkeyv(df2, c("id", "start_time", "endtime"))

# Overlap join within each id, then keep one row per df1 interval, taking the
# first matching `code` as STREAM_3 (NA when no df2 interval overlaps).
foverlaps(df1, df2, by.x = c("id", "START", "END"))[
  , .(STREAM_3 = first(code)),
  by = .(id, STREAM_1, STREAM_2, START, END)
]

id STREAM_1 STREAM_2 START END STREAM_3
<int> <char> <char> <POSc> <POSc> <char>
1: 401 NIVH-OFF IN 2022-08-16 02:00:00 2022-08-16 03:35:00 <NA>
2: 401 NIVH-OFF IN 2022-08-16 03:35:00 2022-08-16 07:22:45 <NA>
3: 401 NIVH-ON IN 2022-08-16 07:22:45 2022-08-16 07:31:45 <NA>
4: 401 NIVH-DOCK IN 2022-08-16 07:31:45 2022-08-16 07:43:30 <NA>
5: 401 NIVH-ON IN 2022-08-16 07:43:30 2022-08-16 07:45:00 <NA>
6: 401 INVH-ON IN 2022-08-16 07:45:00 2022-08-16 07:49:00 12be4
7: 401 INVH-ON OUT 2022-08-16 07:49:00 2022-08-16 08:50:00 12be4
8: 401 INVH-ON IN 2022-08-16 08:50:00 2022-08-16 08:56:00 <NA>
9: 401 NIVH-ON IN 2022-08-16 08:56:00 2022-08-16 08:58:00 <NA>
10: 401 INVH-ON IN 2022-08-16 08:58:00 2022-08-16 09:00:00 <NA>
...

Merge dataframes based on interval condition

You can use sqldf for complex joins:


# library() stops with an error when the package is missing; require() would
# only return FALSE and let the sqldf() call below fail with a less helpful
# message.
library(sqldf)

# Left join: every df1 row is kept, and the df2 rows with the same id whose
# date lies strictly between df1.start and df1.end are attached to it.
sqldf("SELECT df1.*,df2.date,df2.id as id2
FROM df1
LEFT JOIN df2
ON df1.id = df2.id AND
df1.start < df2.date AND
df1.end > df2.date")

Join two dataframes with several columns in R based on closest timestamp

Enter the world of data.table's rolling joins

sample data

# To start from existing data.frames instead, convert them in place with
#   setDT(df1); setDT(df2)

library(data.table)

df1 <- fread(text = "Timestamp Var1 Var2
01-01-20T10:47 7 8
01-01-20T11:50 6 4")

df2 <- fread(text = "Timestamp Var851 Var852
01-01-20T10:55 4 1
01-01-20T12:08 3 4")

# A rolling join needs the join column to be a POSIXct/Date value rather than
# text, so parse the day-month-year timestamps first.
df1[, Timestamp := as.POSIXct(Timestamp, format = "%d-%m-%yT%H:%M")]
df2[, Timestamp := as.POSIXct(Timestamp, format = "%d-%m-%yT%H:%M")]

code

# For each row of df1, attach the df2 row whose Timestamp is closest to it.
df2[df1, on = "Timestamp", roll = "nearest"]

# Timestamp Var851 Var852 Var1 Var2
# 1: 2020-01-01 10:47:00 4 1 7 8
# 2: 2020-01-01 11:50:00 3 4 6 4

Map two dataframes, count events where timestamps in second dataframe are within the date-time ranges of the first dataframe

Here is one way to do it (I've left intermediate print-outs to help better understand what's happening at each step):

# Setup: parse the timestamp columns. "%S" is the seconds directive; the
# lowercase "%s" is not a valid strptime directive and makes the parse fail.
df["start_ts"] = pd.to_datetime(df["start_ts"], format="%Y-%m-%d %H:%M:%S")
df["end_ts"] = pd.to_datetime(df["end_ts"], format="%Y-%m-%d %H:%M:%S")
df2["timestamp"] = pd.to_datetime(df2["timestamp"], format="%Y-%m-%d %H:%M:%S")

# Find matching intervals: for every person, tag each of their events in df2
# with the index label of the first df row (same person) whose
# [start_ts, end_ts] range contains the event timestamp (bounds inclusive).
# NOTE: `.index[0]` raises IndexError when an event falls outside all of that
# person's ranges.
for idx in df["person_id"].unique():
    df2.loc[df2["person_id"] == idx, "interval"] = df2.loc[
        df2["person_id"] == idx, "timestamp"
    ].map(
        lambda x: df.loc[
            (df["person_id"] == idx) & (df["start_ts"] <= x) & (x <= df["end_ts"]),
            ["start_ts", "end_ts"],
        ].index[0]
    )

print(df2.head())
# Output
person_id timestamp event interval
0 A 2022-05-05 01:00:00 1 0.0
1 A 2022-05-05 01:10:00 2 0.0
2 A 2022-05-05 01:30:00 3 0.0
3 A 2022-05-05 06:00:00 1 2.0
4 A 2022-05-05 07:00:00 2 2.0
# Count number of events per (person, interval, event type) and spread the
# event types into columns.
df2 = (
    df2.assign(interval=lambda x: x["interval"].astype(int))
    .assign(count=1)
    .groupby(["person_id", "interval", "event"])
    # The string "sum" selects pandas' own groupby sum; passing the builtin
    # `sum` is deprecated and warns on recent pandas.
    .agg({"count": "sum"})
    .reset_index(drop=False)
    .pivot(index=["person_id", "interval"], columns="event")
    .reset_index(drop=False)
)
# Drop the outer "count" level: the two key columns are left with empty names
# and the remaining columns are named by their event value.
df2.columns = df2.columns.droplevel()
# Name the count columns after whichever event values are present instead of
# assuming there are exactly three of them.
df2.columns = ["person_id", "interval"] + [
    f"count_event_{event}" for event in df2.columns[2:]
]

print(df2)
# Output
person_id interval 1 2 3
0 A 0 1.0 1.0 1.0
1 A 2 1.0 2.0 NaN
2 A 4 2.0 NaN 1.0
3 B 1 NaN 1.0 2.0
4 C 3 2.0 NaN 1.0
5 C 5 1.0 1.0 1.0
# Final dataframe: expose df's index as an "interval" column so it can be
# matched against the interval labels stored in df2.
df = df.reset_index(drop=False)
df = df.rename(columns={"index": "interval"})
# Default (inner) merge: only intervals that have counted events are kept.
df = df.merge(df2, on=["person_id", "interval"])
# Missing counts become 0; columns that cannot be cast to int are left as-is.
df = df.fillna(0).astype(int, errors="ignore")

print(df)
# Output as expected
interval person_id location_id start_ts end_ts count_event_1 count_event_2 count_event_3
0 0 A 1 2022-05-05 00:00:00 2022-05-05 02:00:00 1 1 1
1 1 B 5 2022-05-05 00:00:00 2022-05-05 03:00:00 0 1 2
2 2 A 2 2022-05-05 05:00:00 2022-05-05 10:00:00 1 2 0
3 3 C 7 2022-05-05 00:00:00 2022-05-05 04:00:00 2 0 1
4 4 A 3 2022-05-05 13:00:00 2022-05-05 16:00:00 2 0 1
5 5 C 8 2022-05-05 11:00:00 2022-05-05 12:00:00 1 1 1

Combine two datasets with Interval time condition in R (I would like to avoid combinations and just have unique matches)

Maybe we need to do a crossing and then filter, after converting the columns to a date-time class

library(dplyr)
library(tidyr)
library(lubridate)

# Single definition of the 12-hour timestamp layout, used both to parse the
# inputs and to format the result.
timestamp_format <- "%m/%d/%Y %I:%M:%S %p"

# Build every (endtime, sent) combination, keep the pairs that are at most
# 20 seconds apart, and print them in the original text layout without
# duplicates.
crossing(
  endtime = as.POSIXct(df1$endtime, format = timestamp_format),
  sent = as.POSIXct(df2$sent, format = timestamp_format)
) %>%
  filter(
    (endtime - seconds(20)) <= sent,
    (endtime + seconds(20)) >= sent
  ) %>%
  # across(everything(), ...) replaces the superseded mutate_all().
  mutate(across(everything(), ~ format(.x, format = timestamp_format))) %>%
  distinct()
# A tibble: 1 x 2
# endtime sent
# <chr> <chr>
#1 01/07/2020 01:35:08 AM 01/07/2020 01:35:20 AM


Related Topics



Leave a reply



Submit