R: Merge Based on Multiple Conditions (With Non-Equal Criteria)

R: merge based on multiple conditions (with non-equal criteria)

Your data, changing stringsAsFactors=F

DF1 <- data.frame("col1" = rep(c("A","B"), 18),
"col2" = rep(c("C","D","E"), 12),
"value"= (sample(1:100,36)),
"col4" = rep(NA,36),
stringsAsFactors=F)

DF2 <- data.frame("col1" = rep("A",6),
"col2" = rep(c("C","D"),3),
"data" = rep(c(1,3),3),
"min" = seq(0,59,by=10),
"max" = seq(10,69,by=10),
stringsAsFactors=F)

Using dplyr, 1) merge the two data using left_join, 2) check ifelse value is between min and max rowwise, then 3) unselect min and max columns...

library(dplyr)
left_join(DF1, DF2, by=c("col1","col2")) %>%
rowwise() %>%
mutate(data = ifelse(between(value,min,max), data, NA)) %>%
select(-min, -max)

Not sure if you were expecting to perform some kind of aggregation, but here's the output of the above code

    col1  col2 value  col4  data
1 A C 23 NA NA
2 A C 23 NA 1
3 A C 23 NA NA
4 B D 59 NA NA
5 A E 57 NA NA
6 B C 8 NA NA

R: Merge two data frames based on two joining conditions being met

You should be able to throw them in a vector like

survey<-data.frame(name=c("John","John","Jane","Jane"), question=c(1,2,1,2),answer=c("Yes","Yes","Yes", "No"),stringsAsFactors = F)

metaData<-data.frame(first=c("John","John","Jane","Jane"), quest=c(1,2,1,2), age=c("20","20","40", "40"), stringsAsFactors = F)

merge(survey,metaData, by.x=c('name','question'), by.y=c('first','quest'))

name question answer age
1 Jane 1 Yes 40
2 Jane 2 No 40
3 John 1 Yes 20
4 John 2 Yes 20

R: Merge 2 Data Frame by Multiple Condition Using Dates & ID

I've just added a little bit of extra code in @VincentBoned 's answer.

# create 1st dataframe
ID<- c("A1", "A2","A3", "A4")
Location <- c("012A","234B","012A","238C" )
startdate <- as.Date(c("2014-11-01","2014-01-01","2015-10-01", "2015-01-01"))
enddate <- as.Date(c("2014-12-31","2014-08-31","2015-12-31","2015-12-31"))

df1<- data.frame(ID,Location, startdate, enddate, stringsAsFactors = F)

# create 2nd dataframe
ID<-c("A1", "A1", "A4")
N<- c(2,1,2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt<-c("2200","1000", "500")

df2<- data.frame(ID, N, Loss_Date,Amt, stringsAsFactors = F)

library(dplyr)

full_join(df1, df2, by="ID") %>%
mutate(condition = (Loss_Date >= startdate & Loss_Date <= enddate)) %>%
mutate(N = ifelse(condition & !is.na(condition), N, 0)) %>%
mutate(Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),origin="1970-01-01")) %>%
mutate(Amt = ifelse(condition & !is.na(condition), Amt, 0)) %>%
select(-condition) %>%
group_by(ID) %>% # for each ID
mutate(Nrows = n()) %>% # count how many rows they have in the final table
ungroup() %>%
filter(!(Nrows > 1 & is.na(Loss_Date))) %>% # filter out rows with IDs that have more than 1 rows and those rows are not matched
select(-Nrows)

# ID Location startdate enddate N Loss_Date Amt
# 1 A1 012A 2014-11-01 2014-12-31 2 2014-11-15 2200
# 2 A2 234B 2014-01-01 2014-08-31 0 <NA> 0
# 3 A3 012A 2015-10-01 2015-12-31 0 <NA> 0
# 4 A4 238C 2015-01-01 2015-12-31 2 2015-11-30 500

If you understood how the above code works (step by step) you can use a more compact version that returns the same result:

full_join(df1, df2, by="ID") %>% 
mutate(condition = (Loss_Date >= startdate & Loss_Date <= enddate),
N = ifelse(condition & !is.na(condition), N, 0),
Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),origin="1970-01-01"),
Amt = ifelse(condition & !is.na(condition), Amt, 0)) %>%
group_by(ID) %>%
mutate(Nrows = n()) %>%
filter(!(Nrows > 1 & is.na(Loss_Date))) %>%
select(-c(condition, Nrows))

Matching rows of two dataframes based on multiple conditions

We can create a row index for each of the data frame and then full_join based on Name column. We then select only those rows where Region partially matches and Job and State partially match. We also select rows when both columns have NA values.

library(dplyr)
library(stringr)

df1 %>%
mutate(Column1=row_number()) %>%
full_join(df2 %>% mutate(Column2 = row_number()), by = 'Name') %>%
filter((str_detect(Region.x, fixed(Region.y)) |
(is.na(Region.x) & is.na(Region.y))) &
(str_detect(Job, fixed(State)) | (is.na(Job) & is.na(State)))) %>%
select(Column1, Column2)

# Column1 Column2
#1 1 3
#2 2 2

data

df1 <-  structure(list(Name = c("A", "B", "C"), Region = c("BostonCity", 
"WashingtonD.C.", "NewYork"), State = c("Mass", NA, "NY")), row.names = c(NA,
-3L), class = "data.frame")

df2 <- structure(list(Name = c("C", "B", "A", "D"), Region = c("Boston",
"D.C.", "Boston", "Dallas"), Job = c("Massachusetts", NA, "Massachusetts",
"Texas")), row.names = c(NA, -4L), class = "data.frame")

Merge with multiple conditions and nearest numerical match

I could not work out how to get a working solution using data.table, so I re-thought my approach and have come up with a solution.

First of all I merged the two datasets, and then removed any entries that did not have a stauts of "LAP", this gave me all of the NON Lapsed entries:

NON_LAP <- merge(x=Merged,y=LapsesMonth,by=c("POLICY_NO","LOB_BASE"),all.x=TRUE)

NON_LAP <- NON_LAP [!grepl("LAP", NON_LAP$Status, ignore.case=FALSE),]

Next I merged again, this time looking specifically for the lapsed cases. To work out which was the cloest match I used the abs function, then I ordered by the lowest difference to get the closest matches in order. Finally I removed duplicates to show the closest matches and then also kept duplicates and stripped out the "LAP" status to ensure those that were not the closest match remained in the data.

Finally I merged them all together giving me the required outcome.

FIND_LAP  <- merge(x=Merged,y=LapsesMonth,by=c("POLICY_NO","LOB_BASE"),all.y=FALSE)

FIND_LAP$Difference <- abs(FIND_LAP$GWP - FIND_LAP$ACTUAL_PRICE)

FIND_LAP <- FIND_LAP[order( FIND_LAP[,27] ),]

FOUND_LAP <- FIND_LAP [!duplicated(FIND_LAP[c("POLICY_NO","LOB_BASE")]),]

NOT_LAP <- FIND_LAP [duplicated(FIND_LAP[c("POLICY_NO","LOB_BASE")]),]

Hopefully this will help someone else who might be new to R and encounters the same issue.

Merge data frames based on custom condition - string comparison

This is pretty messy but should do what you're looking for:

First, expand rows for both E values, then group by the key column to check if any values from RHS E are in LHS E. Then filter based on the lookup table.

library(tidyverse)

df3 <- merge(x=df1, y=df2, by='A', all.x = TRUE)

check_rows <- df3 %>%
separate_rows(E.y, sep = ',') %>%
separate_rows(E.x, sep = ',') %>%
mutate(E.x = trimws(E.x),
E.y = trimws(E.y)) %>%
group_by(A) %>%
mutate(check = E.y %in% E.x,
check = ifelse(any(check == TRUE), TRUE, FALSE)) %>%
select(A, check) %>%
unique() %>%
filter(check == TRUE)

df3 <- df3 %>%
filter(A %in% check_rows$A)

Efficiently combining multiple conditions on R data.frame

One option is to use the filter() function in the dplyr package:

library(dplyr)
filter(df, col1=="success" & col2=="success" & col3==1)

You can also use commas (equivalent to &) to separate multiple arguments:

filter(df, col1=="success", col2=="success", col3==1)

merge dataframes based on multiple columns and thresholds

I first turned the city names into character vectors, since (if I understood correctly) you want to include city names that are contained within df2.

df1$city<-as.character(df1$city)
df2$city<-as.character(df2$city)

Then merge them by country:

df = merge(df1, df2, by = ("ctry"))

> df
ctry date.x city.x number col date.y city.y other_number other_col
1 Austria 2002-07-30 Vienna 100 cherry 2002-07-01 Vienna 101 beige
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow
5 Italy 1999-02-24 Rome 40 banana 1999-02-24 Rome 45 red
6 Poland 1999-03-16 Warsaw 70 apple 1999-03-14 Warsaw 780 blue
7 Russia 1999-07-16 Moscow 80 peach 1999-07-17 Moscow 85 red
8 Switzerland 2001-04-17 Bern 50 lemon 2001-04-17 Zurich 51 purple
9 Tunisia 2001-08-29 Tunis 90 cherry 2000-01-29 Tunis 90 black
10 UK 2000-08-29 London 30 pear 2000-08-29 near London 3100 blue

The library stringr will allow you to see if city.x is within city.y here (see last column):

library(stringr)
df$city_keep<-str_detect(df$city.y,df$city.x) # this returns logical vector if city.x is contained in city.y (works one way)
> df
ctry date.x city.x number col date.y city.y other_number other_col city_keep
1 Austria 2002-07-30 Vienna 100 cherry 2002-07-01 Vienna 101 beige TRUE
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange TRUE
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green TRUE
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow TRUE
5 Italy 1999-02-24 Rome 40 banana 1999-02-24 Rome 45 red TRUE
6 Poland 1999-03-16 Warsaw 70 apple 1999-03-14 Warsaw 780 blue TRUE
7 Russia 1999-07-16 Moscow 80 peach 1999-07-17 Moscow 85 red TRUE
8 Switzerland 2001-04-17 Bern 50 lemon 2001-04-17 Zurich 51 purple FALSE
9 Tunisia 2001-08-29 Tunis 90 cherry 2000-01-29 Tunis 90 black TRUE
10 UK 2000-08-29 London 30 pear 2000-08-29 near London 3100 blue TRUE

Then you can get the difference in days between dates:

df$dayDiff<-abs(as.POSIXlt(df$date.x)$yday - as.POSIXlt(df$date.y)$yday)

and the difference in numbers:

df$numDiff<-abs(df$number - df$other_number)

Here was what the resulting dataframe looks like:

> df
ctry date.x city.x number col date.y city.y other_number other_col city_keep dayDiff numDiff
1 Austria 2002-07-30 Vienna 100 cherry 2002-07-01 Vienna 101 beige TRUE 29 1
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange TRUE 1 1
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green TRUE 0 3
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow TRUE 0 3
5 Italy 1999-02-24 Rome 40 banana 1999-02-24 Rome 45 red TRUE 0 5
6 Poland 1999-03-16 Warsaw 70 apple 1999-03-14 Warsaw 780 blue TRUE 2 710
7 Russia 1999-07-16 Moscow 80 peach 1999-07-17 Moscow 85 red TRUE 1 5
8 Switzerland 2001-04-17 Bern 50 lemon 2001-04-17 Zurich 51 purple FALSE 0 1
9 Tunisia 2001-08-29 Tunis 90 cherry 2000-01-29 Tunis 90 black TRUE 212 0
10 UK 2000-08-29 London 30 pear 2000-08-29 near London 3100 blue TRUE 0 3070

But we want to drop things where city.x was not found within city.y, where the day difference is greater than 5 or the number difference is greater than 3:

df<-df[df$dayDiff<=5 & df$numDiff<=3 & df$city_keep==TRUE,]

> df
ctry date.x city.x number col date.y city.y other_number other_col city_keep dayDiff numDiff
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange TRUE 1 1
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green TRUE 0 3
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow TRUE 0 3

What is left are the three rows that you had above (which contained dots in column 1).

Now we can drop the three columns we created, and the date and city from df2:

> df<-subset(df, select=-c(city.y, date.y, city_keep, dayDiff, numDiff))
> df
ctry date.x city.x number col other_number other_col
2 Denmark 1999-06-30 Copenhagen 60 cucumber 61 orange
3 France 1999-06-12 Paris 20 banana 17 green
4 Germany 2003-08-29 Berlin 10 apple 13 yellow


Related Topics



Leave a reply



Submit