Join data.table on exact date or if not the case on the nearest less than date
Here you go:
library(data.table)
Create the data:
Dt1 <- read.table(text="
date x
1/26/2010, 10
1/25/2010, 9
1/24/2010, 9
1/22/2010, 7
1/19/2010, 11", header=TRUE, stringsAsFactors=FALSE)
Dt2 <- read.table(text="
date
1/26/2010
1/23/2010
1/20/2010", header=TRUE, stringsAsFactors=FALSE)
Convert to data.table
, convert strings to dates, and set the data.table
key:
Dt1 <- data.table(Dt1)
Dt2 <- data.table(Dt2)
Dt1[, date:=as.Date(date, format=("%m/%d/%Y"))]
Dt2[, date:=as.Date(date, format=("%m/%d/%Y"))]
setkey(Dt1, date)
setkey(Dt2, date)
Join the tables, using roll=TRUE
:
Dt1[Dt2, roll=TRUE]
date x
[1,] 2010-01-20 11
[2,] 2010-01-23 7
[3,] 2010-01-26 10
Join two data.tables on date, with closest date in table 1 strictly less than date in second table
Rolling join with -1L
offset.
Update 2016-04-02: With this commit in current devel, v1.9.7, this can be done without creating a temporary column. From NEWS:
x's columns can be referred to in
j
using the prefixx.
at all times. This is particularly useful when it is necessary to x's column that is also a join column. This is a patch addressing #1615.
Dt2[, jndate := date - 1L]
Dt1[Dt2,
.(date = i.date, orgdate = x.date, x),
on = c("date" = "jndate"),
roll = Inf]
# date orgdate x
#1: 2010-01-26 2010-01-25 9
#2: 2010-01-23 2010-01-22 7
#3: 2010-01-20 2010-01-19 11
Original answer, useful if you are on 1.9.6 or older.
library(data.table)
# data
Dt1 = fread("date x
1/26/2010, 10
1/25/2010, 9
1/24/2010, 9
1/22/2010, 7
1/19/2010, 11")[, date := as.IDate(date, format=("%m/%d/%Y"))][]
Dt2 = fread("date
1/26/2010
1/23/2010
1/20/2010")[, date := as.IDate(date, format=("%m/%d/%Y"))][]
# solution
offset.roll.join = function(Dt1, Dt2){
Dt2[, jndate := date - 1L] # produce join column with offset
Dt1[, orgdate := date] # should not be needed after data.table#1615
on.exit({Dt2[, jndate := NULL]; Dt1[, orgdate := NULL]}) # cleanup on exit
Dt1[Dt2, .(date = i.date, orgdate, x), on = c("date" = "jndate"), roll = Inf] # do rolling join
}
offset.roll.join(Dt1, Dt2)
# date orgdate x
#1: 2010-01-26 2010-01-25 9
#2: 2010-01-23 2010-01-22 7
#3: 2010-01-20 2010-01-19 11
SQL Join on Nearest less than date
I believe this subquery will do it (not tested).
select *,
(select top 1 Discount
from table2
where table2.Date <= t.Date
order by table2.Date desc) as Discount
from Table1 t
Perhaps not the most performant however.
Edit:
Test code:
create table #table1 ([date] datetime, val int)
create table #table2 ([date] datetime, discount int)
insert into #table1 ([date], val) values ('1/26/2010', 10)
insert into #table1 ([date], val) values ('1/25/2010', 9)
insert into #table1 ([date], val) values ('1/24/2010', 8)
insert into #table1 ([date], val) values ('1/24/2010', 9)
insert into #table1 ([date], val) values ('1/23/2010', 7)
insert into #table1 ([date], val) values ('1/22/2010', 10)
insert into #table1 ([date], val) values ('1/21/2010', 11)
insert into #table2 ([date], discount) values ('1/26/2010', 2)
insert into #table2 ([date], discount) values ('1/23/2010', 1)
insert into #table2 ([date], discount) values ('1/20/2010', 0)
select *,
(select top 1 discount
from #table2
where #table2.[date] <= t.[date]
order by #table2.[date] desc) as discount
from #table1 t
drop table #table1
drop table #table2
Results:
2010-01-26 00:00:00.000 10 2
2010-01-25 00:00:00.000 9 1
2010-01-24 00:00:00.000 8 1
2010-01-24 00:00:00.000 9 1
2010-01-23 00:00:00.000 7 1
2010-01-22 00:00:00.000 10 0
2010-01-21 00:00:00.000 11 0
Join R data.tables where key values are not exactly equal--combine rows with closest times
Another option may be roll='nearest'
(new in v1.8.8 on CRAN).
> setkey(DT1,x,time)
> DT1
x time v
1: a 10 1
2: a 30 2
3: a 60 3
4: b 10 4
5: b 30 5
6: b 60 6
7: c 10 7
8: c 30 8
9: c 60 9
> DT2
x time
1: a 17
2: b 54
3: c 3
> DT1[DT2,roll="nearest"]
x time v
1: a 17 1
2: b 54 6
3: c 3 7
Note that 17 appears to be closer to 10 than 30, hence the result in the first row.
If you need to roll to the next observation (next observation carried backwards) :
> DT1[DT2,roll=-Inf]
x time v
1: a 17 2
2: b 54 6
3: c 3 7
Join two tables on the same date or closest date (before or after)
Now that we're considering both past and future [Lead_Date] values I've tweaked the test data cover a special case
Table: Test_PI
CustomerID Product_Interest_Date Product_Interest
---------- --------------------- ----------------
1 2014-09-07 Interest1
1 2014-09-08 Interest2
1 2014-09-15 Interest3
1 2014-09-28 Interest4
Table: Test_Leads
CustomerID Lead_Date Lead_Source
---------- ---------- -----------
1 2014-09-07 Source1
1 2014-09-14 Source2
2 2014-09-15 Source3
1 2014-09-16 Source4
We'll start by creating a saved Access query named [Date_Gaps]
SELECT
pi.CustomerID,
pi.Product_Interest_Date,
l.Lead_Date,
Abs(DateDiff("d", pi.Product_Interest_Date, l.Lead_Date)) AS Date_Gap
FROM
Test_PI pi
INNER JOIN
Test_Leads l
ON pi.CustomerID = l.CustomerID
returning
CustomerID Product_Interest_Date Lead_Date Date_Gap
---------- --------------------- ---------- --------
1 2014-09-07 2014-09-07 0
1 2014-09-08 2014-09-07 1
1 2014-09-15 2014-09-07 8
1 2014-09-28 2014-09-07 21
1 2014-09-07 2014-09-14 7
1 2014-09-08 2014-09-14 6
1 2014-09-15 2014-09-14 1
1 2014-09-28 2014-09-14 14
1 2014-09-07 2014-09-16 9
1 2014-09-08 2014-09-16 8
1 2014-09-15 2014-09-16 1
1 2014-09-28 2014-09-16 12
Now the query
SELECT
CustomerID,
Product_Interest_Date,
Min(Date_Gap) AS MinOfDate_Gap
FROM Date_Gaps
GROUP BY
CustomerID,
Product_Interest_Date
returns
CustomerID Product_Interest_Date MinOfDate_Gap
---------- --------------------- -------------
1 2014-09-07 0
1 2014-09-08 1
1 2014-09-15 1
1 2014-09-28 12
so if we simply join back into the [Date_Gaps] query to get the [Lead_Date]
SELECT
mingap.CustomerID,
mingap.Product_Interest_Date,
Date_Gaps.Lead_Date
FROM
Date_Gaps
INNER JOIN
(
SELECT
CustomerID,
Product_Interest_Date,
Min(Date_Gap) AS MinOfDate_Gap
FROM Date_Gaps
GROUP BY
CustomerID,
Product_Interest_Date
) mingap
ON Date_Gaps.CustomerID = mingap.CustomerID
AND Date_Gaps.Product_Interest_Date = mingap.Product_Interest_Date
AND Date_Gaps.Date_Gap = mingap.MinOfDate_Gap
we get
CustomerID Product_Interest_Date Lead_Date
---------- --------------------- ----------
1 2014-09-07 2014-09-07
1 2014-09-08 2014-09-07
1 2014-09-15 2014-09-14
1 2014-09-15 2014-09-16
1 2014-09-28 2014-09-16
Notice that we get two hits for 09-15 because they both have a gap of 1 day (before and after). So, we need to break that tie by wrapping the above query in an aggregation query using Min(Lead_Date)
(or Max(Lead_Date)
, your choice)
SELECT
CustomerID,
Product_Interest_Date,
Min(Lead_Date) AS MinOfLead_Date
FROM
(
SELECT
mingap.CustomerID,
mingap.Product_Interest_Date,
Date_Gaps.Lead_Date
FROM
Date_Gaps
INNER JOIN
(
SELECT
CustomerID,
Product_Interest_Date,
Min(Date_Gap) AS MinOfDate_Gap
FROM Date_Gaps
GROUP BY
CustomerID,
Product_Interest_Date
) mingap
ON Date_Gaps.CustomerID = mingap.CustomerID
AND Date_Gaps.Product_Interest_Date = mingap.Product_Interest_Date
AND Date_Gaps.Date_Gap = mingap.MinOfDate_Gap
)
GROUP BY
CustomerID,
Product_Interest_Date
to give us
CustomerID Product_Interest_Date MinOfLead_Date
---------- --------------------- --------------
1 2014-09-07 2014-09-07
1 2014-09-08 2014-09-07
1 2014-09-15 2014-09-14
1 2014-09-28 2014-09-16
So now we're ready to JOIN up the original tables
SELECT
Test_PI.CustomerID,
Test_PI.Product_Interest_Date,
Test_PI.Product_Interest,
Test_Leads.Lead_Date,
Test_Leads.Lead_Source
FROM
(
Test_PI
INNER JOIN
(
SELECT
CustomerID,
Product_Interest_Date,
Min(Lead_Date) AS MinOfLead_Date
FROM
(
SELECT
mingap.CustomerID,
mingap.Product_Interest_Date,
Date_Gaps.Lead_Date
FROM
Date_Gaps
INNER JOIN
(
SELECT
CustomerID,
Product_Interest_Date,
Min(Date_Gap) AS MinOfDate_Gap
FROM Date_Gaps
GROUP BY
CustomerID,
Product_Interest_Date
) mingap
ON Date_Gaps.CustomerID = mingap.CustomerID
AND Date_Gaps.Product_Interest_Date = mingap.Product_Interest_Date
AND Date_Gaps.Date_Gap = mingap.MinOfDate_Gap
)
GROUP BY
CustomerID,
Product_Interest_Date
) closest
ON Test_PI.CustomerID = closest.CustomerID
AND Test_PI.Product_Interest_Date = closest.Product_Interest_Date
)
INNER JOIN
Test_Leads
ON Test_Leads.CustomerID = closest.CustomerID
AND Test_Leads.Lead_Date = closest.MinOfLead_Date
returning
CustomerID Product_Interest_Date Product_Interest Lead_Date Lead_Source
---------- --------------------- ---------------- ---------- -----------
1 2014-09-07 Interest1 2014-09-07 Source1
1 2014-09-08 Interest2 2014-09-07 Source1
1 2014-09-15 Interest3 2014-09-14 Source2
1 2014-09-28 Interest4 2014-09-16 Source4
Join tables on nearest date in the past, in MySQL
SELECT a.id, a.sales, a.date, (SELECT TOP 1 Goal
FROM TableB b WHERE b.date < a.date
ORDER BY b.date DESC) As Goal
FROM TableA a
Going off the nearest date in the past.
Related Topics
Disregarding Simple Warnings/Errors in Trycatch()
R List Files with Multiple Conditions
How to Change the Resolution of a Raster Layer in R
Align Violin Plots with Dodged Box Plots
Collapsing/Hiding Figures in R Markdown
R Dplyr Filter Not Masking Base Filter
Figure Out What Version of R a Function Was Introduced In
Implementation of Parallel Coordinates
Adding Elements to a List in for Loop in R
Aesthetics Must Either Be Length One, or the Same Length as the Dataproblems
How to Draw Gridlines Using Abline() That Are Behind the Data
Change Stringsasfactors Settings for Data.Frame
Dependency 'Slam' Is Not Available When Installing Tm Package
Convert Matrix to Three Column Data.Frame
Random Forest with Classes That Are Very Unbalanced