Join two data frames in R based on closest timestamp
Using rolling joins feature of data.table
with roll = "nearest":
# Rolling join: for each row of table1, look up val2 from the table2 row
# whose "date" is nearest (needs data.table v1.9.6+ for on=/roll="nearest").
library(data.table)  # library() errors if the package is missing; require() only warns
setDT(table1)        # convert both data.frames to data.tables in place
setDT(table2)
table1[, val2 := table2[table1, val2, on = "date", roll = "nearest"]]
Here, val2
column is created by performing a join on the column date
with roll = "nearest"
option. For each row of table1$date
, the closest matching row from table2$date
is computed, and val2
for the corresponding row is extracted.
Left join two dataframes based on nearest timestamp in R
Here is a roundabout way using dplyr
# dplyr approach: for each df2 row, find the closest df1 date sharing the
# same ID, then left-join that flag information back onto df1.
library(dplyr)
df1 %>%
# join the reduced df2 (closest matches only) back onto df1;
# df1 rows with no close df2 match get NA in the joined columns
left_join(df2 %>%
# pair every df2 row with every df1 row that shares an ID
left_join(df1) %>%
mutate(date_diff = abs(date2 - date1)) %>%
# per (ID, date2), keep only the df1 date(s) at minimal distance
group_by(ID, date2) %>%
filter(date_diff == min(date_diff)) %>%
ungroup() %>%
# drop helper columns so the outer join keys on ID and date1 only
select(-date2, -date_diff) ) %>%
# unmatched df1 rows end up with NA Flag; recode them to "No"
mutate(Flag = case_when(is.na(Flag) ~ "No",
TRUE ~ Flag))
Joining, by = "ID"
Joining, by = c("ID", "date1")
# A tibble: 7 x 3
ID date1 Flag
<dbl> <dttm> <chr>
1 1 2020-07-11 19:14:23 Yes
2 1 2020-07-21 13:11:10 Yes
3 1 2020-07-21 18:07:25 No
4 1 2020-07-28 18:18:11 No
5 2 2020-07-13 16:47:26 Yes
6 2 2020-07-18 17:11:37 Yes
7 3 2020-07-23 10:39:19 No
Join two dataframes with several columns in R based on closest timestamp
Enter the world of data.table
's rolling joins
Sample data:
library(data.table)

# Build two small example tables directly from inline text.
# (To convert existing data.frames instead: setDT(df1); setDT(df2).)
df1 <- fread("Timestamp Var1 Var2
01-01-20T10:47 7 8
01-01-20T11:50 6 4")
df2 <- fread("Timestamp Var851 Var852
01-01-20T10:55 4 1
01-01-20T12:08 3 4")

# Rolling joins require POSIXct- or Date-class columns,
# so parse the character timestamps before joining.
ts_format <- "%d-%m-%yT%H:%M"
df1[, Timestamp := as.POSIXct(Timestamp, format = ts_format)]
df2[, Timestamp := as.POSIXct(Timestamp, format = ts_format)]
Code:
df2[df1, roll = "nearest", on = .(Timestamp)]
# Timestamp Var851 Var852 Var1 Var2
# 1: 2020-01-01 10:47:00 4 1 7 8
# 2: 2020-01-01 11:50:00 3 4 6 4
Find closest timestamps between two dataframes and merge different columns when time difference is 60s
This can be directly expressed in sql:
library(sqldf)
# Left join df1 onto df2 wherever the timestamps differ by less than 60
# (POSIXct subtracts as epoch seconds inside SQLite, so the unit is seconds).
sqldf("select a.*, b.Data df1_Data
from df2 a
left join df1 b on abs(a.Timestamp - b.Timestamp) < 60")
giving:
Timestamp Data df1_Data
1 2019-12-31 19:00:10 10 1
2 2019-12-31 19:02:30 11 2
3 2019-12-31 19:12:45 12 7
4 2019-12-31 19:20:15 13 NA
Matching two dataframes based on a common ID and the closest timestamp (less than 5 min) in R
Using the data frames shown reproducibly in the Note at the end, perform a left join using the indicated condition, grouping by dat1 rows and taking the minimum seconds difference over the matched rows. Remove the seconds difference column (4) at the end. Note that there are 60 * 5 seconds in 5 minutes.
library(sqldf)
# Left join dat2 rows with the same ID whose timestamp lies within
# 5 minutes (60 * 5 seconds); grouping by dat1's implicit rowid with
# min(abs(...)) keeps only the closest match per dat1 row.
# "[-4]" drops the helper "seconds" column from the result.
out <- sqldf("select a.ID,
a.timestamp,
b.timestamp [timestamp.y],
min(abs(a.timestamp - b.timestamp)) seconds,
b.x1,
b.x2
from dat1 a
left join dat2 b on a.ID = b.ID and
abs(a.timestamp - b.timestamp) < 60 * 5
group by a.rowid")[-4]
# SQLite returns the joined timestamp as epoch seconds; restore POSIXct class
out$timestamp.y <- as.POSIXct(out$timestamp.y, origin = "1970-01-01")
# check
all.equal(out, target)
## [1] TRUE
Note
Assume the inputs and target below shown reproducibly. Note that the timestamp columns have POSIXct class.
# dat1: three rows to be matched (one POSIXct timestamp per ID)
dat1 <-
structure(list(ID = 1:3, timestamp = structure(c(1603708643,
1603712596, 1603719412), class = c("POSIXct", "POSIXt"), tzone = "")),
row.names = c(NA, -3L), class = "data.frame")
# dat2: candidate rows; several timestamps per ID
dat2 <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L),
timestamp = structure(c(1603713323, 1603708763, 1603715319,
1603724172, 1603751992, 1603719472, 1603721777,
1603799685, 1603801544), class = c("POSIXct", "POSIXt"), tzone = ""),
x1 = c("a", "b", "c", "d", "e", "a", "b", "c", "d"), x2 = c("c",
"b", "e", "a", "e", "a", "f", "d", "a")), row.names = c(NA,
-9L), class = "data.frame")
# target: expected result of the nearest-within-5-minutes left join
# (ID 2 has no dat2 timestamp within 5 minutes, hence the NA row)
target <-
structure(list(ID = 1:3, timestamp = structure(c(1603708643,
1603712596, 1603719412), class = c("POSIXct", "POSIXt")),
timestamp.y = structure(c(1603708763,
NA, 1603719472), class = c("POSIXct", "POSIXt")),
x1 = c("b", NA, "a"), x2 = c("b", NA, "a")), row.names = c(NA,
-3L), class = "data.frame")
Join two data frames in R based on closest timestamp within groups
We can join the data frames by id
and then calculate the time difference and keep the observation with the minimal time difference for each individual:
library(tidyverse)
# Attach every df1 observation to df2 by id, then within each id keep
# the observation with the smallest absolute time difference.
df2 %>%
left_join(df1, by = "id") %>%
# after the join: time.x comes from df2, time.y from df1
mutate(time_dif = abs(time.x - time.y)) %>%
group_by(id) %>%
# NOTE(review): exact ties on min() would keep multiple rows per id,
# and the result is returned still grouped — ungroup() if chaining further
filter(time_dif == min(time_dif))
# A tibble: 5 x 5
# Groups: id [5]
id time.x score time.y time_dif
<dbl> <dttm> <dbl> <dttm> <drtn>
1 1 2017-01-25 00:00:00 4 2017-01-11 00:00:00 14 days
2 2 2017-02-02 00:00:00 26 2017-02-01 00:00:00 1 days
3 3 2017-06-15 00:00:00 38 2017-03-23 00:00:00 84 days
4 4 2017-06-20 00:00:00 45.2 2017-03-31 00:00:00 81 days
5 5 2017-01-31 00:00:00 61.4 2017-04-12 00:00:00 71 days
Data
# df1: one timestamp per id (the reference times, UTC)
df1 <- structure(list(id = c(1, 2, 3, 4, 5), time = structure(c(1484092800,
1485907200, 1490227200, 1490918400, 1491955200), class = c("POSIXct",
"POSIXt"), tzone = "UTC")), row.names = c(NA, -5L), class = c("tbl_df",
"tbl", "data.frame"))
# df2: five candidate timestamps (with scores) per id
df2 <- structure(list(id = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3,
3, 3, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5), time = structure(c(1466553600,
1465948800, 1453420800, 1485302400, 1433030400, 1421712000, 1453852800,
1485302400, 1485993600, 1517529600, 1400544000, 1434067200, 1466985600,
1497484800, 1390003200, 1516060800, 1464825600, 1497916800, 1527638400,
1454025600, 1390608000, 1421712000, 1466467200, 1453852800, 1485820800
), class = c("POSIXct", "POSIXt"), tzone = "UTC"), score = c(3,
2, 5, 4, 5, 24.2, 24.8, 25.4, 26, 26.6, 36.2, 36.8, 37.4, 38,
38.6, 44, 44.6, 45.2, 45.8, 46.4, 59, 59.6, 60.2, 60.8, 61.4)), row.names = c(NA,
-25L), class = c("tbl_df", "tbl", "data.frame"))
Merge two dataframes by nearest date in R
I suggest two approaches. The first uses a distance matrix and perform a left_join of df1 to df2. Namely the distance matrix is given by:
# Pairwise time differences, df1 rows x df2 columns, as absolute values
dateDist <- outer(pull(df1, date), pull(df2, date), "-") %>%
abs()
Next, for each row of df1
, the row of df2
with closest distance is given by:
closest.df1 <- apply(dateDist, 1, which.min)
Finally, the merge is performed manually:
# Bind each df1 row to its closest df2 row; prefix the column names so
# the two sources stay distinguishable after cbind().
# (The original passed a redundant empty-string argument to paste0();
# paste0("df1.", .x) is equivalent.)
cbind(rename_with(df1, ~ paste0("df1.", .x)),
      rename_with(df2[closest.df1, ], ~ paste0("df2.", .x)))
##>+ df1.date df1.value df2.date df2.value
##>1 2021-11-23 20:56:06 500 2021-11-23 20:55:47 Ship Emma
##>1.1 2021-11-23 20:56:07 900 2021-11-23 20:55:47 Ship Emma
##>1.2 2021-11-23 20:56:08 1000 2021-11-23 20:55:47 Ship Emma
##>1.3 2021-11-23 20:56:09 200 2021-11-23 20:55:47 Ship Emma
##>1.4 2021-11-23 20:56:10 300 2021-11-23 20:55:47 Ship Emma
##>1.5 2021-11-23 20:56:11 10 2021-11-23 20:55:47 Ship Emma
##>5 2021-11-23 22:13:56 1000 2021-11-23 22:16:01 Ship Amy
##>5.1 2021-11-23 22:13:57 450 2021-11-23 22:16:01 Ship Amy
##>5.2 2021-11-23 22:13:58 950 2021-11-23 22:16:01 Ship Amy
##>5.3 2021-11-23 22:13:59 600 2021-11-23 22:16:01 Ship Amy
##>12 2021-11-24 03:23:21 100 2021-11-24 03:23:37 Ship Sally
##>12.1 2021-11-24 03:23:22 750 2021-11-24 03:23:37 Ship Sally
##>12.2 2021-11-24 03:23:23 150 2021-11-24 03:23:37 Ship Sally
##>12.3 2021-11-24 03:23:24 200 2021-11-24 03:23:37 Ship Sally
##>12.4 2021-11-24 03:23:25 300 2021-11-24 03:23:37 Ship Sally
##>12.5 2021-11-24 03:24:34 400 2021-11-24 03:23:37 Ship Sally
##>12.6 2021-11-24 03:24:35 900 2021-11-24 03:23:37 Ship Sally
##>12.7 2021-11-24 03:24:36 1020 2021-11-24 03:23:37 Ship Sally
##>12.8 2021-11-24 03:24:37 800 2021-11-24 03:23:37 Ship Sally
The second approach involves first calculating the cartesian product of all the rows of df1
and df2
and then selecting only the rows with the minimum distance. The trick here is to use inner_join(..., by = character())
to get all the combinations of the two dataframes :
# Cartesian product of df1 and df2 (by = character() joins on zero
# columns), then per df1 row keep the df2 row at minimal time distance.
# The original mixed %>% and |> in one chain; unified on the native pipe.
df1 |>
  mutate(id = row_number()) |>
  inner_join(mutate(df2, id = row_number()), by = character()) |>
  mutate(dist = abs(date.x - date.y)) |>
  group_by(id.x) |>
  filter(dist == min(dist)) |>
  select(-id.x, -id.y, -dist)
##>+ # A tibble: 19 × 7
##># Groups: id.x [19]
##> date.x value.x id.x date.y value.y id.y dist
##> <dttm> <dbl> <int> <dttm> <chr> <int> <drtn>
##> 1 2021-11-23 20:56:06 500 1 2021-11-23 20:55:47 Ship Emma 1 19 s…
##> 2 2021-11-23 20:56:07 900 2 2021-11-23 20:55:47 Ship Emma 1 20 s…
##> 3 2021-11-23 20:56:08 1000 3 2021-11-23 20:55:47 Ship Emma 1 21 s…
##> 4 2021-11-23 20:56:09 200 4 2021-11-23 20:55:47 Ship Emma 1 22 s…
##> 5 2021-11-23 20:56:10 300 5 2021-11-23 20:55:47 Ship Emma 1 23 s…
##> 6 2021-11-23 20:56:11 10 6 2021-11-23 20:55:47 Ship Emma 1 24 s…
##> 7 2021-11-23 22:13:56 1000 7 2021-11-23 22:16:01 Ship Amy 5 125 s…
##> 8 2021-11-23 22:13:57 450 8 2021-11-23 22:16:01 Ship Amy 5 124 s…
##> 9 2021-11-23 22:13:58 950 9 2021-11-23 22:16:01 Ship Amy 5 123 s…
##>10 2021-11-23 22:13:59 600 10 2021-11-23 22:16:01 Ship Amy 5 122 s…
##>11 2021-11-24 03:23:21 100 11 2021-11-24 03:23:37 Ship Sally 12 16 s…
##>12 2021-11-24 03:23:22 750 12 2021-11-24 03:23:37 Ship Sally 12 15 s…
##>13 2021-11-24 03:23:23 150 13 2021-11-24 03:23:37 Ship Sally 12 14 s…
##>14 2021-11-24 03:23:24 200 14 2021-11-24 03:23:37 Ship Sally 12 13 s…
##>15 2021-11-24 03:23:25 300 15 2021-11-24 03:23:37 Ship Sally 12 12 s…
##>16 2021-11-24 03:24:34 400 16 2021-11-24 03:23:37 Ship Sally 12 57 s…
##>17 2021-11-24 03:24:35 900 17 2021-11-24 03:23:37 Ship Sally 12 58 s…
##>18 2021-11-24 03:24:36 1020 18 2021-11-24 03:23:37 Ship Sally 12 59 s…
##>19 2021-11-24 03:24:37 800 19 2021-11-24 03:23:37 Ship Sally 12 60 s…
How to join two dataframes by nearest time-date?
data.table
should work for this (can you explain the error you're coming up against?), although it does tend to convert POSIXlt to POSIXct on its own (perhaps do that conversion on your datetime column manually to keep data.table
happy). Also make sure you're setting the key column before using roll
.
(I've created my own example tables here to make my life that little bit easier. If you want to use dput on yours, I'm happy to update this example with your data):
# Example "new" table: three dated observations
new <- data.table(
  date = as.POSIXct(c("2016-03-02 12:20:00",
                      "2016-03-07 12:20:00",
                      "2016-04-02 12:20:00")),
  data.new = c("t", "u", "v")
)
head(new, 2)
date data.new
1: 2016-03-02 12:20:00 t
2: 2016-03-07 12:20:00 u
# Example "old" table: four dated observations (one more row than "new",
# to show how the rolling join handles unequal lengths)
old <- data.table(
  date = as.POSIXct(c("2016-03-02 12:20:00",
                      "2016-03-07 12:20:00",
                      "2016-04-02 12:20:00",
                      "2015-03-02 12:20:00")),
  data.old = c("a", "b", "c", "d")
)
head(old, 2)
date data.old
1: 2016-03-02 12:20:00 a
2: 2016-03-07 12:20:00 b
# Key both tables on date: a keyed rolling join requires sorted keys
setkey( new, date )
setkey( old, date )
# new[old, ...]: one result row per row of old, each matched to the
# new row with the nearest date
combined <- new[ old, roll = "nearest" ]
combined
date data.new data.old
1: 2015-03-02 12:20:00 t d
2: 2016-03-02 12:20:00 t a
3: 2016-03-07 12:20:00 u b
4: 2016-04-02 12:20:00 v c
I've intentionally made the two tables different row lengths, in order to show how the rolling join deals with multiple matches. You can switch the way it joins with:
# Reversing the join direction (old[new]) yields one row per row of new
combined <- old[ new, roll = "nearest" ]
combined
date data.old data.new
1: 2016-03-02 12:20:00 a t
2: 2016-03-07 12:20:00 b u
3: 2016-04-02 12:20:00 c v
Related Topics
Display Exact Value of a Variable in R
Lme4::Lmer Reports "Fixed-Effect Model Matrix Is Rank Deficient", Do I Need a Fix and How To
If Else Condition in Ggplot to Add an Extra Layer
Dplyr - Using Mutate() Like Rowmeans()
Assign Value to Group Based on Condition in Column
How to Change Type of Target Column When Doing := by Group in a Data.Table in R
Complete Column with Group_By and Complete
Add a New Column to a Dataframe Using Matching Values of Another Dataframe
Ggplot2 Does Not Appear to Work When Inside a Function R
Plot a Function with Ggplot, Equivalent of Curve()
Controlling Order of Facet_Grid/Facet_Wrap in Ggplot2
How to Delete Rows from a Data.Frame, Based on an External List, Using R
How to Make Variable Bar Widths in Ggplot2 Not Overlap or Gap
How to Do Range Grouping on a Column Using Dplyr
Replace Duplicated Elements with Na, Instead of Removing Them