Join Data.Table on Exact Date or If Not the Case on the Nearest Less Than Date

Join data.table on exact date or if not the case on the nearest less than date

Here you go:

library(data.table)

Create the data:

Dt1 <- read.table(text="
date x
1/26/2010, 10
1/25/2010, 9
1/24/2010, 9
1/22/2010, 7
1/19/2010, 11", header=TRUE, stringsAsFactors=FALSE)

Dt2 <- read.table(text="
date
1/26/2010
1/23/2010
1/20/2010", header=TRUE, stringsAsFactors=FALSE)

Convert to data.table, convert strings to dates, and set the data.table key:

Dt1 <- data.table(Dt1)
Dt2 <- data.table(Dt2)

Dt1[, date:=as.Date(date, format=("%m/%d/%Y"))]
Dt2[, date:=as.Date(date, format=("%m/%d/%Y"))]

setkey(Dt1, date)
setkey(Dt2, date)

Join the tables, using roll=TRUE:

Dt1[Dt2, roll=TRUE]

date x
[1,] 2010-01-20 11
[2,] 2010-01-23 7
[3,] 2010-01-26 10

Join two data.tables on date, with closest date in table 1 strictly less than date in second table

Rolling join with -1L offset.

Update 2016-04-02: With this commit in current devel, v1.9.7, this can be done without creating a temporary column. From NEWS:

x's columns can be referred to in j using the prefix x. at all times. This is particularly useful when it is necessary to x's column that is also a join column. This is a patch addressing #1615.

Dt2[, jndate := date - 1L]
Dt1[Dt2,
.(date = i.date, orgdate = x.date, x),
on = c("date" = "jndate"),
roll = Inf]
# date orgdate x
#1: 2010-01-26 2010-01-25 9
#2: 2010-01-23 2010-01-22 7
#3: 2010-01-20 2010-01-19 11

Original answer, useful if you are on 1.9.6 or older.

library(data.table)

# data
Dt1 = fread("date x
1/26/2010, 10
1/25/2010, 9
1/24/2010, 9
1/22/2010, 7
1/19/2010, 11")[, date := as.IDate(date, format=("%m/%d/%Y"))][]
Dt2 = fread("date
1/26/2010
1/23/2010
1/20/2010")[, date := as.IDate(date, format=("%m/%d/%Y"))][]

# solution
offset.roll.join = function(Dt1, Dt2){
Dt2[, jndate := date - 1L] # produce join column with offset
Dt1[, orgdate := date] # should not be needed after data.table#1615
on.exit({Dt2[, jndate := NULL]; Dt1[, orgdate := NULL]}) # cleanup on exit
Dt1[Dt2, .(date = i.date, orgdate, x), on = c("date" = "jndate"), roll = Inf] # do rolling join
}
offset.roll.join(Dt1, Dt2)
# date orgdate x
#1: 2010-01-26 2010-01-25 9
#2: 2010-01-23 2010-01-22 7
#3: 2010-01-20 2010-01-19 11

SQL Join on Nearest less than date

I believe this subquery will do it (not tested).

select *, 
(select top 1 Discount
from table2
where table2.Date <= t.Date
order by table2.Date desc) as Discount
from Table1 t

Perhaps not the most performant however.

Edit:

Test code:

create table #table1 ([date] datetime, val int)
create table #table2 ([date] datetime, discount int)

insert into #table1 ([date], val) values ('1/26/2010', 10)
insert into #table1 ([date], val) values ('1/25/2010', 9)
insert into #table1 ([date], val) values ('1/24/2010', 8)
insert into #table1 ([date], val) values ('1/24/2010', 9)
insert into #table1 ([date], val) values ('1/23/2010', 7)
insert into #table1 ([date], val) values ('1/22/2010', 10)
insert into #table1 ([date], val) values ('1/21/2010', 11)

insert into #table2 ([date], discount) values ('1/26/2010', 2)
insert into #table2 ([date], discount) values ('1/23/2010', 1)
insert into #table2 ([date], discount) values ('1/20/2010', 0)

select *,
(select top 1 discount
from #table2
where #table2.[date] <= t.[date]
order by #table2.[date] desc) as discount
from #table1 t

drop table #table1
drop table #table2

Results:


2010-01-26 00:00:00.000 10 2
2010-01-25 00:00:00.000 9 1
2010-01-24 00:00:00.000 8 1
2010-01-24 00:00:00.000 9 1
2010-01-23 00:00:00.000 7 1
2010-01-22 00:00:00.000 10 0
2010-01-21 00:00:00.000 11 0

Join R data.tables where key values are not exactly equal--combine rows with closest times

Another option may be roll='nearest' (new in v1.8.8 on CRAN).

> setkey(DT1,x,time)
> DT1
x time v
1: a 10 1
2: a 30 2
3: a 60 3
4: b 10 4
5: b 30 5
6: b 60 6
7: c 10 7
8: c 30 8
9: c 60 9
> DT2
x time
1: a 17
2: b 54
3: c 3
> DT1[DT2,roll="nearest"]
x time v
1: a 17 1
2: b 54 6
3: c 3 7

Note that 17 appears to be closer to 10 than 30, hence the result in the first row.

If you need to roll to the next observation (next observation carried backwards) :

> DT1[DT2,roll=-Inf]
x time v
1: a 17 2
2: b 54 6
3: c 3 7

Join two tables on the same date or closest date (before or after)

Now that we're considering both past and future [Lead_Date] values I've tweaked the test data cover a special case

Table: Test_PI

CustomerID  Product_Interest_Date  Product_Interest
---------- --------------------- ----------------
1 2014-09-07 Interest1
1 2014-09-08 Interest2
1 2014-09-15 Interest3
1 2014-09-28 Interest4

Table: Test_Leads

CustomerID  Lead_Date   Lead_Source
---------- ---------- -----------
1 2014-09-07 Source1
1 2014-09-14 Source2
2 2014-09-15 Source3
1 2014-09-16 Source4

We'll start by creating a saved Access query named [Date_Gaps]

SELECT
pi.CustomerID,
pi.Product_Interest_Date,
l.Lead_Date,
Abs(DateDiff("d", pi.Product_Interest_Date, l.Lead_Date)) AS Date_Gap
FROM
Test_PI pi
INNER JOIN
Test_Leads l
ON pi.CustomerID = l.CustomerID

returning

CustomerID  Product_Interest_Date  Lead_Date   Date_Gap
---------- --------------------- ---------- --------
1 2014-09-07 2014-09-07 0
1 2014-09-08 2014-09-07 1
1 2014-09-15 2014-09-07 8
1 2014-09-28 2014-09-07 21
1 2014-09-07 2014-09-14 7
1 2014-09-08 2014-09-14 6
1 2014-09-15 2014-09-14 1
1 2014-09-28 2014-09-14 14
1 2014-09-07 2014-09-16 9
1 2014-09-08 2014-09-16 8
1 2014-09-15 2014-09-16 1
1 2014-09-28 2014-09-16 12

Now the query

SELECT 
CustomerID,
Product_Interest_Date,
Min(Date_Gap) AS MinOfDate_Gap
FROM Date_Gaps
GROUP BY
CustomerID,
Product_Interest_Date

returns

CustomerID  Product_Interest_Date  MinOfDate_Gap
---------- --------------------- -------------
1 2014-09-07 0
1 2014-09-08 1
1 2014-09-15 1
1 2014-09-28 12

so if we simply join back into the [Date_Gaps] query to get the [Lead_Date]

SELECT
mingap.CustomerID,
mingap.Product_Interest_Date,
Date_Gaps.Lead_Date
FROM
Date_Gaps
INNER JOIN
(
SELECT
CustomerID,
Product_Interest_Date,
Min(Date_Gap) AS MinOfDate_Gap
FROM Date_Gaps
GROUP BY
CustomerID,
Product_Interest_Date
) mingap
ON Date_Gaps.CustomerID = mingap.CustomerID
AND Date_Gaps.Product_Interest_Date = mingap.Product_Interest_Date
AND Date_Gaps.Date_Gap = mingap.MinOfDate_Gap

we get

CustomerID  Product_Interest_Date  Lead_Date 
---------- --------------------- ----------
1 2014-09-07 2014-09-07
1 2014-09-08 2014-09-07
1 2014-09-15 2014-09-14
1 2014-09-15 2014-09-16
1 2014-09-28 2014-09-16

Notice that we get two hits for 09-15 because they both have a gap of 1 day (before and after). So, we need to break that tie by wrapping the above query in an aggregation query using Min(Lead_Date) (or Max(Lead_Date), your choice)

SELECT
CustomerID,
Product_Interest_Date,
Min(Lead_Date) AS MinOfLead_Date
FROM
(
SELECT
mingap.CustomerID,
mingap.Product_Interest_Date,
Date_Gaps.Lead_Date
FROM
Date_Gaps
INNER JOIN
(
SELECT
CustomerID,
Product_Interest_Date,
Min(Date_Gap) AS MinOfDate_Gap
FROM Date_Gaps
GROUP BY
CustomerID,
Product_Interest_Date
) mingap
ON Date_Gaps.CustomerID = mingap.CustomerID
AND Date_Gaps.Product_Interest_Date = mingap.Product_Interest_Date
AND Date_Gaps.Date_Gap = mingap.MinOfDate_Gap
)
GROUP BY
CustomerID,
Product_Interest_Date

to give us

CustomerID  Product_Interest_Date  MinOfLead_Date
---------- --------------------- --------------
1 2014-09-07 2014-09-07
1 2014-09-08 2014-09-07
1 2014-09-15 2014-09-14
1 2014-09-28 2014-09-16

So now we're ready to JOIN up the original tables

SELECT 
Test_PI.CustomerID,
Test_PI.Product_Interest_Date,
Test_PI.Product_Interest,
Test_Leads.Lead_Date,
Test_Leads.Lead_Source
FROM
(
Test_PI
INNER JOIN
(
SELECT
CustomerID,
Product_Interest_Date,
Min(Lead_Date) AS MinOfLead_Date
FROM
(
SELECT
mingap.CustomerID,
mingap.Product_Interest_Date,
Date_Gaps.Lead_Date
FROM
Date_Gaps
INNER JOIN
(
SELECT
CustomerID,
Product_Interest_Date,
Min(Date_Gap) AS MinOfDate_Gap
FROM Date_Gaps
GROUP BY
CustomerID,
Product_Interest_Date
) mingap
ON Date_Gaps.CustomerID = mingap.CustomerID
AND Date_Gaps.Product_Interest_Date = mingap.Product_Interest_Date
AND Date_Gaps.Date_Gap = mingap.MinOfDate_Gap
)
GROUP BY
CustomerID,
Product_Interest_Date
) closest
ON Test_PI.CustomerID = closest.CustomerID
AND Test_PI.Product_Interest_Date = closest.Product_Interest_Date
)
INNER JOIN
Test_Leads
ON Test_Leads.CustomerID = closest.CustomerID
AND Test_Leads.Lead_Date = closest.MinOfLead_Date

returning

CustomerID  Product_Interest_Date  Product_Interest  Lead_Date   Lead_Source
---------- --------------------- ---------------- ---------- -----------
1 2014-09-07 Interest1 2014-09-07 Source1
1 2014-09-08 Interest2 2014-09-07 Source1
1 2014-09-15 Interest3 2014-09-14 Source2
1 2014-09-28 Interest4 2014-09-16 Source4

Join tables on nearest date in the past, in MySQL

SELECT a.id, a.sales, a.date, (SELECT TOP 1 Goal 
FROM TableB b WHERE b.date < a.date
ORDER BY b.date DESC) As Goal
FROM TableA a

Going off the nearest date in the past.



Related Topics



Leave a reply



Submit