Merge two dataframes if timestamp of x is within time interval of y
You could use sqldf:
library(sqldf)
# Interval join via SQL: keep every DF1 row and attach the DF2 row whose
# [onebef, oneaft] window contains DF1's Timestamp (d2 columns are NULL
# when no window matches, because of LEFT JOIN).
# Style fix: spaces around the assignment operator.
df <- sqldf('select d1.*,d2.*
from DF1 d1
left join DF2 d2
on d1.Timestamp >= d2.onebef
AND d1.Timestamp <= d2.oneaft
')
df
Merging two dataframes with different timestamp (different time interval)
Let's try pd.cut:
# Bin df2's timestamps into the half-open intervals whose left edges are
# df1's timestamps; a far-future sentinel closes the final interval.
bin_edges = list(df1['time']) + [pd.to_datetime('2050-01-01')]
lower_bounds = pd.cut(
    df2['time'],
    bins=bin_edges,
    right=False,
    include_lowest=True,
    labels=df1['time'],
)
# Average the ratings within each interval, then align the means back to
# df1's rows by their interval label.
interval_means = df2.groupby(lower_bounds)['rating'].mean()
df1['ratings'] = interval_means.reindex(df1['time']).values
Or you can use merge_asof:
# Backward as-of merge tags each df2 row with the most recent df1 row at
# or before its time (carried as df1's original index), then the ratings
# are averaged per df1 row.
merged = pd.merge_asof(df2, df1.reset_index(), on='time', direction='backward')
df1['ratings'] = merged.groupby('index')['rating'].mean()
Merge two dataframes with different time ranges by repeating values within time range in R
You can use a rolling join
library(data.table)
# Convert the data.frames to data.tables in place (by reference).
setDT(df1)
setDT(df2)
# Rolling join on time: each df1 row picks up the df2 reading taken at or
# before its timestamp (roll = TRUE carries the last observation forward).
df2[df1, on = .(time), roll = TRUE]
#                   time Temp.ar Ur.ar Vel.Vento   x   y   z
# 1: 2019-12-11 06:08:03   14.79  78.5     1.147 -52 -39 -35
# 2: 2019-12-11 06:08:08   14.79  78.5     1.147 -47 -57 -36
# 3: 2019-12-11 06:08:13   14.79  78.5     1.147 -39   2 -40
# 4: 2019-12-11 06:10:20   14.74  78.9     1.045 -45 -23 -29
# 5: 2019-12-11 06:10:29   14.74  78.9     1.045 -51 -11 -31
# 6: 2019-12-11 06:20:34   14.90  78.9     1.009 -69 -28 -19
Data used
# Sensor readings: whitespace-separated sample data read into a data.table.
df1 <- fread('
x y z time
-52 -39 -35 06:08:03
-47 -57 -36 06:08:08
-39 2 -40 06:08:13
-45 -23 -29 06:10:20
-51 -11 -31 06:10:29
-69 -28 -19 06:20:34
')
# Weather observations on a coarser (10-minute) grid, to be rolled onto df1.
df2 <- fread('
time Temp.ar Ur.ar Vel.Vento
06:00:00 14.79 78.5 1.147
06:10:00 14.74 78.9 1.045
06:20:00 14.9 78.9 1.009
06:30:00 15.14 78.6 1.076
06:40:00 15.32 77.8 1.332
06:50:00 15.6 76.5 1.216
')
Merge two dataframes based on similar time interval in R
You could use foverlaps from the data.table package:
library(data.table)
# convert to data.table
setDT(df1)
setDT(df2)
# Character dates to numeric dates (POSIXct) so intervals can be compared
df1[,START:=as.POSIXct(START)]
df1[,END:=as.POSIXct(END)]
df2[,start_time:=as.POSIXct(start_time)]
df2[,endtime:=as.POSIXct(endtime)]
# key needed for second data.table: foverlaps requires the y table keyed
# on (group, interval-start, interval-end)
setkey(df2,id,start_time,endtime)
# Overlap join: match each df1 [START, END] to overlapping df2
# [start_time, endtime] ranges within the same id, rename df2's code to
# STREAM_3, then collapse multiple overlaps with first() so each df1
# interval yields exactly one row.
foverlaps(df1,df2,by.x = c("id","START","END"))[
,.(id,STREAM_1,STREAM_2,STREAM_3=code,START,END)][
,.(STREAM_3=first(STREAM_3)),by=.(id,STREAM_1,STREAM_2,START,END)]
id STREAM_1 STREAM_2 START END STREAM_3
<int> <char> <char> <POSc> <POSc> <char>
1: 401 NIVH-OFF IN 2022-08-16 02:00:00 2022-08-16 03:35:00 <NA>
2: 401 NIVH-OFF IN 2022-08-16 03:35:00 2022-08-16 07:22:45 <NA>
3: 401 NIVH-ON IN 2022-08-16 07:22:45 2022-08-16 07:31:45 <NA>
4: 401 NIVH-DOCK IN 2022-08-16 07:31:45 2022-08-16 07:43:30 <NA>
5: 401 NIVH-ON IN 2022-08-16 07:43:30 2022-08-16 07:45:00 <NA>
6: 401 INVH-ON IN 2022-08-16 07:45:00 2022-08-16 07:49:00 12be4
7: 401 INVH-ON OUT 2022-08-16 07:49:00 2022-08-16 08:50:00 12be4
8: 401 INVH-ON IN 2022-08-16 08:50:00 2022-08-16 08:56:00 <NA>
9: 401 NIVH-ON IN 2022-08-16 08:56:00 2022-08-16 08:58:00 <NA>
10: 401 INVH-ON IN 2022-08-16 08:58:00 2022-08-16 09:00:00 <NA>
...
Merge dataframes based on interval condition
You can use sqldf for complex joins:
# library() errors immediately if sqldf is missing, whereas require()
# merely returns FALSE — library() is the correct loader here.
library(sqldf)
# Non-equi LEFT JOIN: every df1 row, plus each df2 row with the same id
# whose date falls strictly inside df1's (start, end) interval.
sqldf("SELECT df1.*,df2.date,df2.id as id2
FROM df1
LEFT JOIN df2
ON df1.id = df2.id AND
df1.start < df2.date AND
df1.end > df2.date")
Join two dataframes with several columns in R based on closest timestamp
Enter the world of data.table's rolling joins.
Sample data:
# Or use
#   setDT(df1); setDT(df2)
# to convert existing data.frame objects df1 and df2 to data.table in place.
library(data.table)

# Whitespace-separated sample data; fread() returns data.tables directly.
df1 <- data.table::fread("Timestamp Var1 Var2
01-01-20T10:47 7 8
01-01-20T11:50 6 4")
df2 <- data.table::fread("Timestamp Var851 Var852
01-01-20T10:55 4 1
01-01-20T12:08 3 4")

# Timestamps/dates have to be of POSIXct or Date class to roll-join on them.
# %y is the two-digit year, so "01-01-20T10:47" parses as 2020-01-01 10:47.
df1[, Timestamp := as.POSIXct(Timestamp, format = "%d-%m-%yT%H:%M")]
df2[, Timestamp := as.POSIXct(Timestamp, format = "%d-%m-%yT%H:%M")]
Code:
# Rolling join: for each df1 row, take the df2 row whose Timestamp is
# closest in either direction (roll = "nearest"), joining on Timestamp.
df2[df1, roll = "nearest", on = .(Timestamp)]
# Timestamp Var851 Var852 Var1 Var2
# 1: 2020-01-01 10:47:00 4 1 7 8
# 2: 2020-01-01 11:50:00 3 4 6 4
Map two dataframes, count events where timestamps in second dataframe are within the date-time ranges of the first dataframe
Here is one way to do it (I've left intermediate print-outs to help better understand what's happening at each step):
# Setup: parse the string columns to datetime64.
# BUG FIX: the original format ended in "%H:%M:%s" — lowercase %s is not
# a valid strftime directive for seconds and makes pd.to_datetime raise;
# seconds are %S (upper case).
df["start_ts"] = pd.to_datetime(df["start_ts"], format="%Y-%m-%d %H:%M:%S")
df["end_ts"] = pd.to_datetime(df["end_ts"], format="%Y-%m-%d %H:%M:%S")
df2["timestamp"] = pd.to_datetime(df2["timestamp"], format="%Y-%m-%d %H:%M:%S")
# Find matching intervals
# For each person, map every event timestamp in df2 to the index label of
# the first df row whose [start_ts, end_ts] range contains it.
# NOTE(review): .index[0] raises IndexError if a timestamp falls inside no
# interval — this assumes every event is covered; confirm against the data.
for idx in df["person_id"].unique():
    df2.loc[df2["person_id"] == idx, "interval"] = df2.loc[
        df2["person_id"] == idx, "timestamp"
    ].map(
        lambda x: df.loc[
            (df["person_id"] == idx) & (df["start_ts"] <= x) & (x <= df["end_ts"]),
            ["start_ts", "end_ts"],
        ].index[0]
    )
print(df2.head())
# Output
person_id timestamp event interval
0 A 2022-05-05 01:00:00 1 0.0
1 A 2022-05-05 01:10:00 2 0.0
2 A 2022-05-05 01:30:00 3 0.0
3 A 2022-05-05 06:00:00 1 2.0
4 A 2022-05-05 07:00:00 2 2.0
# Count number of events
# Per (person, interval, event): sum a helper count column, then pivot the
# event values out to columns (one count column per event type).
df2 = (
    df2.assign(interval=lambda x: x["interval"].astype(int))
    .assign(count=1)
    .groupby(["person_id", "interval", "event"])
    # String alias "sum" instead of the builtin callable `sum`: identical
    # result, but avoids pandas' builtin-callable FutureWarning.
    .agg({"count": "sum"})
    .reset_index(drop=False)
    .pivot(index=["person_id", "interval"], columns="event")
    .reset_index(drop=False)
)
# Flatten the MultiIndex columns produced by pivot(), then rename them.
# NOTE(review): the hard-coded list assumes exactly events 1, 2 and 3 are
# present in the data — verify for other inputs.
df2.columns = df2.columns.droplevel()
df2.columns = [
    "person_id",
    "interval",
    "count_event_1",
    "count_event_2",
    "count_event_3",
]
print(df2)
# Output
person_id interval 1 2 3
0 A 0 1.0 1.0 1.0
1 A 2 1.0 2.0 NaN
2 A 4 2.0 NaN 1.0
3 B 1 NaN 1.0 2.0
4 C 3 2.0 NaN 1.0
5 C 5 1.0 1.0 1.0
# Final dataframe
# Expose df's row index as the "interval" key, join in the per-interval
# counts, and replace the NaNs from the pivot with zeros.
df = df.reset_index(drop=False).rename(columns={"index": "interval"})
df = (
    pd.merge(left=df, right=df2, on=["person_id", "interval"])
    .fillna(0)
    # NOTE(review): astype(..., errors="ignore") is deprecated in pandas 2.x;
    # here it silently leaves non-convertible (datetime) columns unchanged.
    .astype(int, errors="ignore")
)
print(df)
# Output as expected
interval person_id location_id start_ts end_ts count_event_1 count_event_2 count_event_3
0 0 A 1 2022-05-05 00:00:00 2022-05-05 02:00:00 1 1 1
1 1 B 5 2022-05-05 00:00:00 2022-05-05 03:00:00 0 1 2
2 2 A 2 2022-05-05 05:00:00 2022-05-05 10:00:00 1 2 0
3 3 C 7 2022-05-05 00:00:00 2022-05-05 04:00:00 2 0 1
4 4 A 3 2022-05-05 13:00:00 2022-05-05 16:00:00 2 0 1
5 5 C 8 2022-05-05 11:00:00 2022-05-05 12:00:00 1 1 1
Combine two datasets with Interval time condition in R (I would like to avoid combinations and just have unique matches)
Maybe we need to do a crossing and then filter after converting to a date-time class:
library(dplyr)
library(tidyr)
library(lubridate)

# Cartesian product of the two timestamp columns (parsed to POSIXct so
# time arithmetic works), keeping only pairs where `sent` falls within
# 20 seconds either side of `endtime`, then formatting back to character.
crossing(
  endtime = as.POSIXct(df1$endtime, format = "%m/%d/%Y %I:%M:%S %p"),
  sent = as.POSIXct(df2$sent, format = "%m/%d/%Y %I:%M:%S %p")
) %>%
  filter(
    (endtime - seconds(20)) <= sent,
    (endtime + seconds(20)) >= sent
  ) %>%
  # across() replaces the superseded mutate_all(); same result.
  mutate(across(everything(), ~ format(.x, format = "%m/%d/%Y %I:%M:%S %p"))) %>%
  distinct()
# A tibble: 1 x 2
#   endtime                sent
#   <chr>                  <chr>
# 1 01/07/2020 01:35:08 AM 01/07/2020 01:35:20 AM
Related Topics
How to Remove Rows with All Zeros Without Using Rowsums in R
What Are Some Good Books, Web Resources, and Projects for Learning R
Rearrange Dataframe to a Table, the Opposite of "Melt"
Read Lines by Number from a Large File
What Does ..Level.. Mean in Ggplot::Stat_Density2D
How to Add Chapter Bibliographies Using Bookdown
Force No Default Selection in Selectinput()
Adding Vertical Line in Plot Ggplot
Loops in R - Need to Use Index, Anyway to Avoid 'For'
Rstudio Empty on Startup - No Windows, No Menus, No Rendering
R: Expand and Fill Data Frame by Date in Series
Return Row Number(S) for a Particular Value in a Column in a Dataframe
What Is a Fast Way to Set Debugging Code at a Given Line in a Function
Understanding Color Scales in Ggplot2
Handling Missing/Incomplete Data in R--Is There Function to Mask But Not Remove Nas