R: Fuzzy Match and Between Statements
How about this? We could do the stringdist_inner_join and filter afterwards if the dates are stored as dates. This should be plenty performant for most data, and if not you should probably use data.table instead of fuzzyjoin.
library(fuzzyjoin)
library(dplyr)
# Make sure the three date columns are Date values; the range filter below
# compares them with >= and <=.
table_1$date_1 <- as.Date(table_1$date_1)
table_2$date_2 <- as.Date(table_2$date_2)
table_2$date_3 <- as.Date(table_2$date_3)

# Fuzzy-join on id (string distance of at most 2), then keep only the pairs
# where date_1 falls inside the closed interval [date_2, date_3].
stringdist_inner_join(table_1, table_2, by = "id", max_dist = 2) %>%
  filter(date_1 >= date_2, date_1 <= date_3)
id.x date_1 id.y date_2 date_3
1 123 2010-01-31 5123 2009-01-31 2011-01-31
2 123 2010-01-31 123 A 2010-01-31 2010-01-31
3 123 2010-01-31 125 2010-01-31 2020-01-31
4 123 2010-01-31 125 2010-01-31 2020-01-31
5 123 2010-01-31 5123 2009-01-31 2011-01-31
6 123 2010-01-31 123 A 2010-01-31 2010-01-31
7 123 2010-01-31 125 2010-01-31 2020-01-31
8 123 2010-01-31 125 2010-01-31 2020-01-31
9 125 2016-01-31 125 2010-01-31 2020-01-31
10 125 2016-01-31 125 2010-01-31 2020-01-31
Combining Multiple Fuzzy Joins
If we want to do this in a loop, loop over the only part that varies between the joins, i.e. the `by` argument:
library(purrr)
library(fuzzyjoin)
library(dplyr)
# Run one fuzzy inner join per key column, stack the per-key results
# row-wise, then drop duplicate rows and sort by every column so the result
# can be compared with a join built another way.
final2 <- c("id1", "id2") %>%
  map_dfr(function(key) {
    stringdist_inner_join(table_1, table_2, by = key, max_dist = 2)
  }) %>%
  distinct() %>%
  arrange(across(everything()))
Checking that the result matches the original `final`:
> all.equal(final %>%
arrange(across(everything())), final2)
[1] TRUE
SAS: Fuzzy Joins
Push the SQL into the remote database.
/* Explicit SQL pass-through: the query inside the parentheses is sent to the
   Netezza connection and its result set is stored as the SAS table
   "sastable". */
proc sql;
/* The connection options are elided ("....") and must be supplied. */
connect to netezza .... ;
/* The join keeps pairs where a.date_1 lies between b.date_2 and b.date_3 and
   either the id1 values satisfy le_dst(...) = 1 or the id2 values are equal.
   NOTE(review): le_dst is presumably the remote database's Levenshtein
   edit-distance function - confirm. As written, "= 1" matches only id1
   values exactly one edit apart; identical id1 values (distance 0) match
   only through the id2 condition. Confirm whether "<= 1" was intended.
   NOTE(review): a.* and b.* may both return columns with the same name
   (e.g. id1, id2); check how they land in sastable. */
create table sastable as
select * from connection to netezza
(
select a.*, b.*
from table_a a
inner join table_b b
on (a.date_1 between b.date_2 and b.date_3)
and (le_dst(a.id1, b.id1) = 1 or a.id2 = b.id2)
)
;
quit;
How to fuzzy join 2 dataframes on 2 variables with differing fuzzy logic ?
You can create a Cartesian product of the two data frames using `merge(a, b, by = NULL)` and then use `subset` to keep only the rows that satisfy the required conditions.
# Cross join a and b (by = NULL gives the Cartesian product), then keep the
# pairs whose KW values differ by at most 1 and whose prices differ by at
# most 0.02.
# The small tolerance keeps pairs whose prices differ by exactly 0.02: in
# binary floating point 3.02 - 3.00 evaluates to slightly more than 0.02, so
# a bare `<= 0.02` would drop such a pair.
subset(
  merge(a, b, by = NULL),
  abs(KW.x - KW.y) <= 1 & abs(price.x - price.y) <= 0.02 + 1e-8
)
# name.x KW.x price.x KW.y price.y name.y
#1 A 201902 1.99 201903 1.98 a
#5 B 201904 3.02 201904 3.00 b
#9 C 201905 5.00 201904 5.00 c
SQL Fuzzy Join - MSSQL
Here is how this could be done using Levenshtein Distance:
Create this function (execute this first):
-- Levenshtein edit distance between @s1 and @s2: the minimum number of
-- single-character insertions, deletions and substitutions that turn one
-- string into the other.
--
-- Only two rows of the dynamic-programming matrix are kept, each packed into
-- a varbinary as consecutive 2-byte integers:
--   @cv1 = previous row, @cv0 = row currently being built.
-- A row for a 3999-character @s2 needs (3999 + 1) * 2 = 8000 bytes, which is
-- why the parameters are capped at nvarchar(3999).
--
-- Notes:
--   * Lengths come from LEN(), which does not count trailing spaces.
--   * Characters are compared with "=", so the result follows the collation
--     in effect (e.g. case-insensitive under a case-insensitive collation).
--   * NULL arguments are not handled specially.
CREATE FUNCTION ufn_levenshtein(@s1 nvarchar(3999), @s2 nvarchar(3999))
RETURNS int
AS
BEGIN
    DECLARE @s1_len int, @s2_len int
    DECLARE @i int, @j int, @s1_char nchar, @c int, @c_temp int
    DECLARE @cv0 varbinary(8000), @cv1 varbinary(8000)

    SELECT
        @s1_len = LEN(@s1),
        @s2_len = LEN(@s2),
        @cv1 = 0x0000,
        @j = 1, @i = 1, @c = 0

    -- With an empty @s1 the main loop below never executes and @c would stay
    -- 0; the correct distance is the number of insertions, i.e. LEN(@s2).
    IF @s1_len = 0
        RETURN @s2_len

    -- Row 0 of the matrix: distance from the empty prefix of @s1 to each
    -- prefix of @s2 is 0, 1, 2, ..., @s2_len.
    WHILE @j <= @s2_len
        SELECT @cv1 = @cv1 + CAST(@j AS binary(2)), @j = @j + 1

    WHILE @i <= @s1_len
    BEGIN
        -- Start row @i: column 0 holds @i (delete the first @i characters).
        SELECT
            @s1_char = SUBSTRING(@s1, @i, 1),
            @c = @i,
            @cv0 = CAST(@i AS binary(2)),
            @j = 1

        WHILE @j <= @s2_len
        BEGIN
            -- Insertion: value to the left in the current row, plus 1.
            SET @c = @c + 1

            -- Substitution / match: diagonal entry (previous row, column
            -- @j - 1, stored at bytes 2*@j-1 .. 2*@j), plus 0 or 1.
            SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j-1, 2) AS int) +
                CASE WHEN @s1_char = SUBSTRING(@s2, @j, 1) THEN 0 ELSE 1 END
            IF @c > @c_temp SET @c = @c_temp

            -- Deletion: entry directly above (previous row, column @j,
            -- stored at bytes 2*@j+1 .. 2*@j+2), plus 1.
            SET @c_temp = CAST(SUBSTRING(@cv1, @j+@j+1, 2) AS int)+1
            IF @c > @c_temp SET @c = @c_temp

            -- Append the minimum to the current row and move right.
            SELECT @cv0 = @cv0 + CAST(@c AS binary(2)), @j = @j + 1
        END

        -- The finished row becomes the previous row for the next character.
        SELECT @cv1 = @cv0, @i = @i + 1
    END

    -- @c holds the last cell computed: the bottom-right matrix entry.
    RETURN @c
END
(Function developed by Joseph Gama)
And then simply use this query to get matches
-- For every potential customer, list the existing customers whose name is
-- within an edit distance of 4 once spaces are removed from both names.
-- The LEFT JOIN keeps potential customers that have no such match (their
-- b.* columns come back NULL).
-- Aliases are referenced in the same case in which they are declared so the
-- query also runs under a case-sensitive collation.
SELECT a.Customer,
       b.ID,
       b.Customer
FROM #POTENTIALCUSTOMERS a
LEFT JOIN #ExistingCustomers b
    ON dbo.ufn_levenshtein(REPLACE(a.Customer, ' ', ''),
                           REPLACE(b.Customer, ' ', '')) < 5;
Complete Script after you create that function:
-- Demo script: builds two temp tables of customer names and fuzzy-matches
-- them with dbo.ufn_levenshtein, which must already exist.

-- Existing customers (name + id). Dropped first so the script can be re-run
-- in the same session.
IF OBJECT_ID('tempdb..#ExistingCustomers') IS NOT NULL
    DROP TABLE #ExistingCustomers;
CREATE TABLE #ExistingCustomers
(Customer VARCHAR(255),
 ID       INT
);
-- Column lists are spelled out so the inserts do not depend on column order.
INSERT INTO #ExistingCustomers (Customer, ID)
VALUES ('Ed''s Barbershop', 1002),
       ('GroceryTown',      1003),
       ('Candy Place',      1004),
       ('Handy Man',        1005);

-- Potential customers (name only), some spelled slightly differently from
-- the existing ones and some with no counterpart at all.
IF OBJECT_ID('tempdb..#POTENTIALCUSTOMERS') IS NOT NULL
    DROP TABLE #POTENTIALCUSTOMERS;
CREATE TABLE #POTENTIALCUSTOMERS(Customer VARCHAR(255));
INSERT INTO #POTENTIALCUSTOMERS (Customer)
VALUES ('Eds Barbershop'),
       ('Grocery Town'),
       ('Candy Place'),
       ('Handee Man'),
       ('Beauty Salon'),
       ('The Apple Farm'),
       ('Igloo Ice Cream'),
       ('Ride-a-Long Bikes');

-- For every potential customer, list the existing customers whose name is
-- within an edit distance of 4 once spaces are removed; potential customers
-- without a match are kept with NULLs in the b.* columns.
-- Aliases are referenced in the same case in which they are declared so the
-- query also runs under a case-sensitive collation.
SELECT a.Customer,
       b.ID,
       b.Customer
FROM #POTENTIALCUSTOMERS a
LEFT JOIN #ExistingCustomers b
    ON dbo.ufn_levenshtein(REPLACE(a.Customer, ' ', ''),
                           REPLACE(b.Customer, ' ', '')) < 5;
Here you can find a T-SQL example at http://www.kodyaz.com/articles/fuzzy-string-matching-using-levenshtein-distance-sql-server.aspx
fuzzy LEFT join with R
Voila :)
# Left fuzzy join of df1 to df2: df1$col1 is paired with df2$col4 and the
# pairing is decided by match_fun instead of exact equality.
# NOTE(review): ci_str_detect is not defined in this snippet - presumably a
# case-insensitive string-detection helper; it must be defined or loaded
# before this call will run.
fuzzy_left_join(df1, df2, match_fun = ci_str_detect, by = c(col1 = "col4"))
Related Topics
SQL Table Aliases - Good or Bad
How to Include Excluded Rows in Returning from Insert ... on Conflict
Ora-01861: Literal Does Not Match Format String
SQL Server Row_Number() on SQL Server 2000
Extract Numbers from a Text in SQL Server
SQL Same Unit Between Two Tables Needs Order Numbers in 1 Cell
How to Find a Default Constraint Using Information_Schema
Foreign Key Referencing a 2 Columns Primary Key in SQL Server
What Do You Do in SQL Server to Create or Alter
How to Get the Last Day of Month in Postgres
How to Detect If a String Contains Special Characters
How to Dynamically Use Tg_Table_Name in Postgresql 8.2
SQL in Query Produces Strange Result