Merge two dataframes if timestamp of x is within time interval of y
You could use sqldf:
library(sqldf)
# Interval join via SQL: keep every DF1 row and attach the DF2 row whose
# [onebef, oneaft] window contains DF1's Timestamp (d2 columns are NULL
# when no window matches, because of LEFT JOIN).
# Style fix: spaces around the assignment operator.
df <- sqldf('select d1.*,d2.*
from DF1 d1
left join DF2 d2
on d1.Timestamp >= d2.onebef
AND d1.Timestamp <= d2.oneaft
')
df
Merging two dataframes with different timestamp (different time interval)
Let's try pd.cut:
# Bin df2's timestamps into the half-open intervals whose left edges are
# df1's timestamps; a far-future sentinel closes the final interval.
bin_edges = list(df1['time']) + [pd.to_datetime('2050-01-01')]
lower_bounds = pd.cut(
    df2['time'],
    bins=bin_edges,
    right=False,
    include_lowest=True,
    labels=df1['time'],
)
# Average the ratings within each interval, then align the means back to
# df1's rows by their interval label.
interval_means = df2.groupby(lower_bounds)['rating'].mean()
df1['ratings'] = interval_means.reindex(df1['time']).values
Or you can use merge_asof:
# Backward as-of merge tags each df2 row with the most recent df1 row at
# or before its time (carried as df1's original index), then the ratings
# are averaged per df1 row.
merged = pd.merge_asof(df2, df1.reset_index(), on='time', direction='backward')
df1['ratings'] = merged.groupby('index')['rating'].mean()
Merge two dataframes with different time ranges by repeating values within time range in R
You can use a rolling join
library(data.table)
# Convert the data.frames to data.tables in place (by reference).
setDT(df1)
setDT(df2)
# Rolling join on time: each df1 row picks up the df2 reading taken at or
# before its timestamp (roll = TRUE carries the last observation forward).
df2[df1, on = .(time), roll = TRUE]
#                   time Temp.ar Ur.ar Vel.Vento   x   y   z
# 1: 2019-12-11 06:08:03   14.79  78.5     1.147 -52 -39 -35
# 2: 2019-12-11 06:08:08   14.79  78.5     1.147 -47 -57 -36
# 3: 2019-12-11 06:08:13   14.79  78.5     1.147 -39   2 -40
# 4: 2019-12-11 06:10:20   14.74  78.9     1.045 -45 -23 -29
# 5: 2019-12-11 06:10:29   14.74  78.9     1.045 -51 -11 -31
# 6: 2019-12-11 06:20:34   14.90  78.9     1.009 -69 -28 -19
Data used
# Sensor readings: whitespace-separated sample data read into a data.table.
df1 <- fread('
x y z time
-52 -39 -35 06:08:03
-47 -57 -36 06:08:08
-39 2 -40 06:08:13
-45 -23 -29 06:10:20
-51 -11 -31 06:10:29
-69 -28 -19 06:20:34
')
# Weather observations on a coarser (10-minute) grid, to be rolled onto df1.
df2 <- fread('
time Temp.ar Ur.ar Vel.Vento
06:00:00 14.79 78.5 1.147
06:10:00 14.74 78.9 1.045
06:20:00 14.9 78.9 1.009
06:30:00 15.14 78.6 1.076
06:40:00 15.32 77.8 1.332
06:50:00 15.6 76.5 1.216
')
Merge two dataframes based on similar time interval in R
You could use foverlaps from the data.table package:
library(data.table)
# convert to data.table
setDT(df1)
setDT(df2)
# Character dates to numeric dates (POSIXct) so intervals can be compared
df1[,START:=as.POSIXct(START)]
df1[,END:=as.POSIXct(END)]
df2[,start_time:=as.POSIXct(start_time)]
df2[,endtime:=as.POSIXct(endtime)]
# key needed for second data.table: foverlaps requires the y table keyed
# on (group, interval-start, interval-end)
setkey(df2,id,start_time,endtime)
# Overlap join: match each df1 [START, END] to overlapping df2
# [start_time, endtime] ranges within the same id, rename df2's code to
# STREAM_3, then collapse multiple overlaps with first() so each df1
# interval yields exactly one row.
foverlaps(df1,df2,by.x = c("id","START","END"))[
,.(id,STREAM_1,STREAM_2,STREAM_3=code,START,END)][
,.(STREAM_3=first(STREAM_3)),by=.(id,STREAM_1,STREAM_2,START,END)]
id STREAM_1 STREAM_2 START END STREAM_3
<int> <char> <char> <POSc> <POSc> <char>
1: 401 NIVH-OFF IN 2022-08-16 02:00:00 2022-08-16 03:35:00 <NA>
2: 401 NIVH-OFF IN 2022-08-16 03:35:00 2022-08-16 07:22:45 <NA>
3: 401 NIVH-ON IN 2022-08-16 07:22:45 2022-08-16 07:31:45 <NA>
4: 401 NIVH-DOCK IN 2022-08-16 07:31:45 2022-08-16 07:43:30 <NA>
5: 401 NIVH-ON IN 2022-08-16 07:43:30 2022-08-16 07:45:00 <NA>
6: 401 INVH-ON IN 2022-08-16 07:45:00 2022-08-16 07:49:00 12be4
7: 401 INVH-ON OUT 2022-08-16 07:49:00 2022-08-16 08:50:00 12be4
8: 401 INVH-ON IN 2022-08-16 08:50:00 2022-08-16 08:56:00 <NA>
9: 401 NIVH-ON IN 2022-08-16 08:56:00 2022-08-16 08:58:00 <NA>
10: 401 INVH-ON IN 2022-08-16 08:58:00 2022-08-16 09:00:00 <NA>
...
Merge dataframes based on interval condition
You can use sqldf for complex joins:
# library() errors immediately if sqldf is missing, whereas require()
# merely returns FALSE — library() is the correct loader here.
library(sqldf)
# Non-equi LEFT JOIN: every df1 row, plus each df2 row with the same id
# whose date falls strictly inside df1's (start, end) interval.
sqldf("SELECT df1.*,df2.date,df2.id as id2
FROM df1
LEFT JOIN df2
ON df1.id = df2.id AND
df1.start < df2.date AND
df1.end > df2.date")
Join two dataframes with several columns in R based on closest timestamp
Enter the world of data.table's rolling joins.
Sample data:
# Or use
#   setDT(df1); setDT(df2)
# to convert existing data.frame objects df1 and df2 to data.table in place.
library(data.table)

# Whitespace-separated sample data; fread() returns data.tables directly.
df1 <- data.table::fread("Timestamp Var1 Var2
01-01-20T10:47 7 8
01-01-20T11:50 6 4")
df2 <- data.table::fread("Timestamp Var851 Var852
01-01-20T10:55 4 1
01-01-20T12:08 3 4")

# Timestamps/dates have to be of POSIXct or Date class to roll-join on them.
# %y is the two-digit year, so "01-01-20T10:47" parses as 2020-01-01 10:47.
df1[, Timestamp := as.POSIXct(Timestamp, format = "%d-%m-%yT%H:%M")]
df2[, Timestamp := as.POSIXct(Timestamp, format = "%d-%m-%yT%H:%M")]
Code:
# Rolling join: for each df1 row, take the df2 row whose Timestamp is
# closest in either direction (roll = "nearest"), joining on Timestamp.
df2[df1, roll = "nearest", on = .(Timestamp)]
# Timestamp Var851 Var852 Var1 Var2
# 1: 2020-01-01 10:47:00 4 1 7 8
# 2: 2020-01-01 11:50:00 3 4 6 4
Map two dataframes, count events where timestamps in second dataframe are within the date-time ranges of the first dataframe
Here is one way to do it (I've left intermediate print-outs to help better understand what's happening at each step):
# Setup: parse the string columns to datetime64.
# BUG FIX: the original format ended in "%H:%M:%s" — lowercase %s is not
# a valid strftime directive for seconds and makes pd.to_datetime raise;
# seconds are %S (upper case).
df["start_ts"] = pd.to_datetime(df["start_ts"], format="%Y-%m-%d %H:%M:%S")
df["end_ts"] = pd.to_datetime(df["end_ts"], format="%Y-%m-%d %H:%M:%S")
df2["timestamp"] = pd.to_datetime(df2["timestamp"], format="%Y-%m-%d %H:%M:%S")
# Find matching intervals
# For each person, map every event timestamp in df2 to the index label of
# the first df row whose [start_ts, end_ts] range contains it.
# NOTE(review): .index[0] raises IndexError if a timestamp falls inside no
# interval — this assumes every event is covered; confirm against the data.
for idx in df["person_id"].unique():
    df2.loc[df2["person_id"] == idx, "interval"] = df2.loc[
        df2["person_id"] == idx, "timestamp"
    ].map(
        lambda x: df.loc[
            (df["person_id"] == idx) & (df["start_ts"] <= x) & (x <= df["end_ts"]),
            ["start_ts", "end_ts"],
        ].index[0]
    )
print(df2.head())
# Output
person_id timestamp event interval
0 A 2022-05-05 01:00:00 1 0.0
1 A 2022-05-05 01:10:00 2 0.0
2 A 2022-05-05 01:30:00 3 0.0
3 A 2022-05-05 06:00:00 1 2.0
4 A 2022-05-05 07:00:00 2 2.0
# Count number of events
# Per (person, interval, event): sum a helper count column, then pivot the
# event values out to columns (one count column per event type).
df2 = (
    df2.assign(interval=lambda x: x["interval"].astype(int))
    .assign(count=1)
    .groupby(["person_id", "interval", "event"])
    # String alias "sum" instead of the builtin callable `sum`: identical
    # result, but avoids pandas' builtin-callable FutureWarning.
    .agg({"count": "sum"})
    .reset_index(drop=False)
    .pivot(index=["person_id", "interval"], columns="event")
    .reset_index(drop=False)
)
# Flatten the MultiIndex columns produced by pivot(), then rename them.
# NOTE(review): the hard-coded list assumes exactly events 1, 2 and 3 are
# present in the data — verify for other inputs.
df2.columns = df2.columns.droplevel()
df2.columns = [
    "person_id",
    "interval",
    "count_event_1",
    "count_event_2",
    "count_event_3",
]
print(df2)
# Output
person_id interval 1 2 3
0 A 0 1.0 1.0 1.0
1 A 2 1.0 2.0 NaN
2 A 4 2.0 NaN 1.0
3 B 1 NaN 1.0 2.0
4 C 3 2.0 NaN 1.0
5 C 5 1.0 1.0 1.0
# Final dataframe
# Expose df's row index as the "interval" key, join in the per-interval
# counts, and replace the NaNs from the pivot with zeros.
df = df.reset_index(drop=False).rename(columns={"index": "interval"})
df = (
    pd.merge(left=df, right=df2, on=["person_id", "interval"])
    .fillna(0)
    # NOTE(review): astype(..., errors="ignore") is deprecated in pandas 2.x;
    # here it silently leaves non-convertible (datetime) columns unchanged.
    .astype(int, errors="ignore")
)
print(df)
# Output as expected
interval person_id location_id start_ts end_ts count_event_1 count_event_2 count_event_3
0 0 A 1 2022-05-05 00:00:00 2022-05-05 02:00:00 1 1 1
1 1 B 5 2022-05-05 00:00:00 2022-05-05 03:00:00 0 1 2
2 2 A 2 2022-05-05 05:00:00 2022-05-05 10:00:00 1 2 0
3 3 C 7 2022-05-05 00:00:00 2022-05-05 04:00:00 2 0 1
4 4 A 3 2022-05-05 13:00:00 2022-05-05 16:00:00 2 0 1
5 5 C 8 2022-05-05 11:00:00 2022-05-05 12:00:00 1 1 1
Combine two datasets with Interval time condition in R (I would like to avoid combinations and just have unique matches)
Maybe we need to do a crossing and then filter after converting to a date-time class:
library(dplyr)
library(tidyr)
library(lubridate)

# Cartesian product of the two timestamp columns (parsed to POSIXct so
# time arithmetic works), keeping only pairs where `sent` falls within
# 20 seconds either side of `endtime`, then formatting back to character.
crossing(
  endtime = as.POSIXct(df1$endtime, format = "%m/%d/%Y %I:%M:%S %p"),
  sent = as.POSIXct(df2$sent, format = "%m/%d/%Y %I:%M:%S %p")
) %>%
  filter(
    (endtime - seconds(20)) <= sent,
    (endtime + seconds(20)) >= sent
  ) %>%
  # across() replaces the superseded mutate_all(); same result.
  mutate(across(everything(), ~ format(.x, format = "%m/%d/%Y %I:%M:%S %p"))) %>%
  distinct()
# A tibble: 1 x 2
#   endtime                sent
#   <chr>                  <chr>
# 1 01/07/2020 01:35:08 AM 01/07/2020 01:35:20 AM
Related Topics
How to Remove Rows with All Zeros Without Using Rowsums in R
What Are Some Good Books, Web Resources, and Projects for Learning R
Rearrange Dataframe to a Table, the Opposite of "Melt"
Read Lines by Number from a Large File
What Does ..Level.. Mean in Ggplot::Stat_Density2D
How to Add Chapter Bibliographies Using Bookdown
Force No Default Selection in Selectinput()
Adding Vertical Line in Plot Ggplot
Loops in R - Need to Use Index, Anyway to Avoid 'For'
Rstudio Empty on Startup - No Windows, No Menus, No Rendering
R: Expand and Fill Data Frame by Date in Series
Return Row Number(S) for a Particular Value in a Column in a Dataframe
What Is a Fast Way to Set Debugging Code at a Given Line in a Function
Understanding Color Scales in Ggplot2
Handling Missing/Incomplete Data in R--Is There Function to Mask But Not Remove Nas