Merge Dataframes on Matching A, B and *Closest* C

Using merge() a couple of times and aggregate() once, here is how to do it.

set.seed(1)
df <- cbind(expand.grid(x = 1:3, y = 1:5), time = round(runif(15) * 30))
to.merge <- data.frame(x = c(2, 2, 2, 3, 2), y = c(1, 1, 1, 5, 4), time = c(17, 12, 11.6, 22.5, 2), val = letters[1:5], stringsAsFactors = F)

#Find rows that match by x and y
res <- merge(to.merge, df, by = c("x", "y"), all.x = TRUE)
res$dif <- abs(res$time.x - res$time.y)
res
##   x y time.x val time.y dif
## 1 2 1   17.0   a     11 6.0
## 2 2 1   12.0   b     11 1.0
## 3 2 1   11.6   c     11 0.6
## 4 2 4    2.0   e      6 4.0
## 5 3 5   22.5   d     23 0.5

#Find rows that need to be merged
res1 <- merge(aggregate(dif ~ x + y, data = res, FUN = min), res)
res1
##   x y dif time.x val time.y
## 1 2 1 0.6   11.6   c     11
## 2 2 4 4.0    2.0   e      6
## 3 3 5 0.5   22.5   d     23

#Finally merge the result back into df
final <- merge(df, res1[res1$dif <= 1, c("x", "y", "val")], all.x = TRUE)
final
##     x y time  val
## 1   1 1    8 <NA>
## 2   1 2   27 <NA>
## 3   1 3   28 <NA>
## 4   1 4    2 <NA>
## 5   1 5   21 <NA>
## 6   2 1   11    c
## 7   2 2    6 <NA>
## 8   2 3   20 <NA>
## 9   2 4    6 <NA>
## 10  2 5   12 <NA>
## 11  3 1   17 <NA>
## 12  3 2   27 <NA>
## 13  3 3   19 <NA>
## 14  3 4    5 <NA>
## 15  3 5   23    d
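The same merge-on-keys-then-keep-nearest idea can be sketched in pandas (a rough translation of the steps above, shown on a trimmed-down version of the data; the variable names are my own):

```python
import pandas as pd

# Trimmed-down versions of df and to.merge from the R example above
df = pd.DataFrame({"x": [2, 2, 3], "y": [1, 4, 5], "time": [11, 6, 23]})
to_merge = pd.DataFrame({"x": [2, 2, 2, 3, 2], "y": [1, 1, 1, 5, 4],
                         "time": [17, 12, 11.6, 22.5, 2],
                         "val": list("abcde")})

# 1. Find rows that match on x and y, and compute the time difference
res = to_merge.merge(df, on=["x", "y"], suffixes=("_x", "_y"))
res["dif"] = (res["time_x"] - res["time_y"]).abs()

# 2. Keep only the closest match per (x, y) group
best = res.loc[res.groupby(["x", "y"])["dif"].idxmin()]

# 3. Merge back, keeping matches within a tolerance of 1
final = df.merge(best.loc[best["dif"] <= 1, ["x", "y", "val"]],
                 on=["x", "y"], how="left")
print(final)
```

As in the R version, group (2, 4) gets no value because its closest time difference exceeds the tolerance.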

Merging data frames based on multiple nearest matches in R

Without knowing exactly how you want the result formatted, you can do this with the data.table rolling join (roll = "nearest") that you mentioned.

In this case I've melted both sets of data into long format so that the matching can be done in a single join.

library(data.table)
setDT(df1)
setDT(df2)

df1[
  match(
    melt(df1, id.vars = "julian")[
      melt(df2, measure.vars = names(df2)),
      on = c("variable", "value"), roll = "nearest"]$julian,
    julian),
]
#    julian        a        b         c         d
# 1:      9 12.02948 13.54714  7.659482  6.784113
# 2:     20 28.74620 20.24871 18.523935 17.801711
# 3:     10 13.00511 14.57352  8.296155  6.942622
# 4:     24 30.26931 24.20554 20.253149 22.017714

If you want separate tables for each join instead you could do something like:

lapply(names(df2), \(var)  df1[df2, on=var, roll="nearest", .SD, .SDcols=names(df1)] )
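A loose pandas analogue of one of those per-variable joins (using made-up numbers in the spirit of the output above): sort both frames on the shared column and let merge_asof pick the nearest value.

```python
import pandas as pd

# Made-up data roughly matching the shape of the example above
df1 = pd.DataFrame({"julian": [9, 10, 20, 24],
                    "a": [12.0, 13.0, 28.7, 30.3]})
df2 = pd.DataFrame({"a": [12.9, 28.5]})

# For each value in df2["a"], pick the df1 row whose "a" is nearest
out = pd.merge_asof(df2.sort_values("a"), df1.sort_values("a"),
                    on="a", direction="nearest")
print(out)
```

Running one such merge per shared column mirrors the lapply() loop above.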

Merging on closest value Pandas

  • merge_asof() does work here and matches your expected output
  • I have not assumed the inputs are sorted, so I used sort_values()
  • for transparency, the right table's Meter_indication is included in the output as mi
import io
import pandas as pd

df1 = pd.read_csv(io.StringIO("""Meter_indication Fuel1
1180784 275
1181278 280
1181791 300
1182285 280
1182801 300
1183295 280
1183717 250"""), sep=r"\s+")

df2 = pd.read_csv(io.StringIO("""Meter_indication Fuel2
1180785 278
1181282 282
1181800 310
1182401 282
1182824 320
1183310 215
1183727 250"""), sep=r"\s+")

pd.merge_asof(
df1.sort_values("Meter_indication"),
df2.sort_values("Meter_indication").assign(mi=lambda d: d["Meter_indication"]),
on="Meter_indication",
direction="nearest",
)

  Meter_indication  Fuel1  Fuel2       mi
0          1180784    275    278  1180785
1          1181278    280    282  1181282
2          1181791    300    310  1181800
3          1182285    280    282  1182401
4          1182801    300    320  1182824
5          1183295    280    215  1183310
6          1183717    250    250  1183727
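If some readings should be left unmatched when nothing is close enough, merge_asof also accepts a tolerance (a sketch with toy numbers, not the meter data above):

```python
import pandas as pd

left = pd.DataFrame({"k": [1, 5, 10]})
right = pd.DataFrame({"k": [2, 100], "v": ["a", "b"]})

# Nearest match, but only accepted if the keys differ by at most 3
out = pd.merge_asof(left, right, on="k", direction="nearest", tolerance=3)
print(out)
```

Here k = 10 gets NaN because its nearest right-hand key (2) is more than 3 away.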

Matching two dataframes based on a common ID and the closest timestamp (less than 5 min) in R

Using the data frames shown reproducibly in the Note at the end, perform a left join on the indicated condition, grouping by dat1 rows and taking the minimum seconds difference over the matched rows. Remove the seconds-difference column (column 4) at the end. Note that 5 minutes is 60 * 5 seconds.

library(sqldf)

out <- sqldf("select a.ID,
                     a.timestamp,
                     b.timestamp [timestamp.y],
                     min(abs(a.timestamp - b.timestamp)) seconds,
                     b.x1,
                     b.x2
              from dat1 a
              left join dat2 b on a.ID = b.ID and
                abs(a.timestamp - b.timestamp) < 60 * 5
              group by a.rowid")[-4]
out$timestamp.y <- as.POSIXct(out$timestamp.y, origin = "1970-01-01")

# check
all.equal(out, target)
## [1] TRUE

Note

Assume the inputs and target below shown reproducibly. Note that the timestamp columns have POSIXct class.

dat1 <-
structure(list(ID = 1:3, timestamp = structure(c(1603708643,
1603712596, 1603719412), class = c("POSIXct", "POSIXt"), tzone = "")),
row.names = c(NA, -3L), class = "data.frame")

dat2 <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 3L, 3L, 3L, 3L),
timestamp = structure(c(1603713323, 1603708763, 1603715319,
1603724172, 1603751992, 1603719472, 1603721777,
1603799685, 1603801544), class = c("POSIXct", "POSIXt"), tzone = ""),
x1 = c("a", "b", "c", "d", "e", "a", "b", "c", "d"), x2 = c("c",
"b", "e", "a", "e", "a", "f", "d", "a")), row.names = c(NA,
-9L), class = "data.frame")

target <-
structure(list(ID = 1:3, timestamp = structure(c(1603708643,
1603712596, 1603719412), class = c("POSIXct", "POSIXt")),
timestamp.y = structure(c(1603708763,
NA, 1603719472), class = c("POSIXct", "POSIXt")),
x1 = c("b", NA, "a"), x2 = c("b", NA, "a")), row.names = c(NA,
-3L), class = "data.frame")

Merge dataframes on nearest datetime / timestamp

You can use reindex with method='nearest' and then merge:

A['date'] = pd.to_datetime(A.date)
B['date'] = pd.to_datetime(B.date)
A.sort_values('date', inplace=True)
B.sort_values('date', inplace=True)

B1 = B.set_index('date').reindex(A.set_index('date').index, method='nearest').reset_index()
print (B1)

print (pd.merge(A,B1, on='date'))
  ID_x       date ID_y  value
0    B 1991-01-01    B      7
1    B 1999-08-02    C      1
2    A 2014-06-22    B      8
3    A 2014-07-02    A      5
4    C 2015-01-01    A      3

You can also add the suffixes parameter:

print (pd.merge(A,B1, on='date', suffixes=('_', '')))
  ID_       date ID  value
0   B 1991-01-01  B      7
1   B 1999-08-02  C      1
2   A 2014-06-22  B      8
3   A 2014-07-02  A      5
4   C 2015-01-01  A      3

Matching nearest values in two dataframes of different lengths

Using a KDTree, you can find the closest match in df2 for each row of df1 in O(m log n) time, where n is the number of rows in df2 and m the number of rows in df1.

import pandas as pd
import numpy as np
from scipy.spatial import cKDTree

df1 = pd.DataFrame({'a':np.array([1.2345,2.2345,3.2345]),'b':np.array([4.123,5.123,6.123])})
df2 = pd.DataFrame({'A':np.array([1.2346,2.2343]),'B':np.array([4.1232,5.1239])})

def spatial_merge_NN(df1, df2, xyz=['A', 'B']):
    '''Add features from df2 to df1, taking the closest point.'''
    tree = cKDTree(df2[xyz].values)
    dists, indices = tree.query(df1[['a', 'b']].values, k=1)
    for c in df2.columns:
        df1[c] = df2[c].values[indices]
    return df1

df_new = spatial_merge_NN(df1, df2, ['A', 'B'])
#         a      b       A       B
# 0  1.2345  4.123  1.2346  4.1232
# 1  2.2345  5.123  2.2343  5.1239
# 2  3.2345  6.123  2.2343  5.1239

It keeps one dataframe fixed (in this case df1), builds a KD-tree over the coordinates of df2, and for each row of df1 finds the closest row of df2 and copies its columns across.
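The query also returns the distances, so a maximum-distance cutoff (a hypothetical threshold, not part of the answer above) can be used to reject matches that are too far away:

```python
import numpy as np
from scipy.spatial import cKDTree

pts2 = np.array([[1.0, 4.0], [2.0, 5.0]])   # reference points
pts1 = np.array([[1.1, 4.1], [9.0, 9.0]])   # query points

tree = cKDTree(pts2)
dists, indices = tree.query(pts1, k=1)
ok = dists <= 0.5  # hypothetical cutoff; tune for your data
print(indices, ok)
```

Rows of pts1 with ok == False have no acceptable neighbour and could be left unmerged (NaN) instead.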


