R: Merge Based on Multiple Conditions (With Non-Equal Criteria)

R: merge based on multiple conditions (with non-equal criteria)

Your data, changing stringsAsFactors=F

DF1 <- data.frame("col1" = rep(c("A","B"), 18),
              "col2" = rep(c("C","D","E"), 12),
              "value"= (sample(1:100,36)),
              "col4" = rep(NA,36),
              stringsAsFactors=F)

DF2 <- data.frame("col1" = rep("A",6),
              "col2" = rep(c("C","D"),3),
              "data" = rep(c(1,3),3),
              "min" = seq(0,59,by=10),
              "max" = seq(10,69,by=10),
              stringsAsFactors=F)

Using dplyr, 1) merge the two data using left_join, 2) check ifelse value is between min and max rowwise, then 3) unselect min and max columns...

library(dplyr)
left_join(DF1, DF2, by=c("col1","col2")) %>%
  rowwise() %>%
  mutate(data = ifelse(between(value,min,max), data, NA)) %>%
  select(-min, -max)

Not sure if you were expecting to perform some kind of aggregation, but here's the output of the above code

    col1  col2 value  col4  data
 1     A     C    23    NA    NA
 2     A     C    23    NA     1
 3     A     C    23    NA    NA
 4     B     D    59    NA    NA
 5     A     E    57    NA    NA
 6     B     C     8    NA    NA

R: Merge two data frames based on two joining conditions being met

You should be able to throw them in a vector like

survey<-data.frame(name=c("John","John","Jane","Jane"), question=c(1,2,1,2),answer=c("Yes","Yes","Yes", "No"),stringsAsFactors = F)

metaData<-data.frame(first=c("John","John","Jane","Jane"), quest=c(1,2,1,2), age=c("20","20","40", "40"), stringsAsFactors = F)

merge(survey,metaData, by.x=c('name','question'), by.y=c('first','quest'))

  name question answer age
1 Jane        1    Yes  40
2 Jane        2     No  40
3 John        1    Yes  20
4 John        2    Yes  20

R: Merge 2 Data Frame by Multiple Condition Using Dates & ID

I've just added a little bit of extra code in @VincentBoned 's answer.

# create 1st dataframe
ID<- c("A1", "A2","A3", "A4")
Location <- c("012A","234B","012A","238C" )
startdate <- as.Date(c("2014-11-01","2014-01-01","2015-10-01", "2015-01-01"))
enddate <- as.Date(c("2014-12-31","2014-08-31","2015-12-31","2015-12-31"))

df1<- data.frame(ID,Location, startdate, enddate, stringsAsFactors = F)

# create 2nd dataframe
ID<-c("A1", "A1", "A4")
N<- c(2,1,2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt<-c("2200","1000", "500")

df2<- data.frame(ID, N, Loss_Date,Amt, stringsAsFactors = F)

library(dplyr)

full_join(df1, df2, by="ID") %>% 
  mutate(condition = (Loss_Date >= startdate & Loss_Date <= enddate)) %>%
  mutate(N = ifelse(condition & !is.na(condition), N, 0)) %>%
  mutate(Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),origin="1970-01-01")) %>%
  mutate(Amt = ifelse(condition & !is.na(condition), Amt, 0)) %>%
  select(-condition) %>%
  group_by(ID) %>%                              # for each ID
  mutate(Nrows = n()) %>%                       # count how many rows they have in the final table
  ungroup() %>%
  filter(!(Nrows > 1 & is.na(Loss_Date))) %>%   # filter out rows with IDs that have more than 1 rows and those rows are not matched
  select(-Nrows)

#   ID Location  startdate    enddate N  Loss_Date  Amt 
# 1 A1     012A 2014-11-01 2014-12-31 2 2014-11-15 2200 
# 2 A2     234B 2014-01-01 2014-08-31 0       <NA>    0 
# 3 A3     012A 2015-10-01 2015-12-31 0       <NA>    0 
# 4 A4     238C 2015-01-01 2015-12-31 2 2015-11-30  500

If you understood how the above code works (step by step) you can use a more compact version that returns the same result:

full_join(df1, df2, by="ID") %>% 
  mutate(condition = (Loss_Date >= startdate & Loss_Date <= enddate),
         N = ifelse(condition & !is.na(condition), N, 0),
         Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),origin="1970-01-01"),
         Amt = ifelse(condition & !is.na(condition), Amt, 0)) %>%
  group_by(ID) %>%                             
  mutate(Nrows = n()) %>%                      
  filter(!(Nrows > 1 & is.na(Loss_Date))) %>%
  select(-c(condition, Nrows))

Matching rows of two dataframes based on multiple conditions

We can create a row index for each of the data frame and then full_join based on Name column. We then select only those rows where Region partially matches and Job and State partially match. We also select rows when both columns have NA values.

library(dplyr)
library(stringr)

df1 %>%
  mutate(Column1=row_number()) %>%
  full_join(df2 %>% mutate(Column2 = row_number()), by = 'Name') %>%
  filter((str_detect(Region.x, fixed(Region.y)) | 
                   (is.na(Region.x) & is.na(Region.y))) & 
       (str_detect(Job, fixed(State)) | (is.na(Job) & is.na(State)))) %>%
   select(Column1, Column2)

#  Column1 Column2
#1       1       3
#2       2       2

data

df1 <-  structure(list(Name = c("A", "B", "C"), Region = c("BostonCity", 
"WashingtonD.C.", "NewYork"), State = c("Mass", NA, "NY")), row.names = c(NA, 
-3L), class = "data.frame")

df2 <- structure(list(Name = c("C", "B", "A", "D"), Region = c("Boston", 
"D.C.", "Boston", "Dallas"), Job = c("Massachusetts", NA, "Massachusetts", 
"Texas")), row.names = c(NA, -4L), class = "data.frame")

Merge with multiple conditions and nearest numerical match

I could not work out how to get a working solution using data.table, so I re-thought my approach and have come up with a solution.

First of all I merged the two datasets, and then removed any entries that did not have a stauts of "LAP", this gave me all of the NON Lapsed entries:

NON_LAP <- merge(x=Merged,y=LapsesMonth,by=c("POLICY_NO","LOB_BASE"),all.x=TRUE)

NON_LAP <- NON_LAP [!grepl("LAP", NON_LAP$Status, ignore.case=FALSE),]

Next I merged again, this time looking specifically for the lapsed cases. To work out which was the cloest match I used the abs function, then I ordered by the lowest difference to get the closest matches in order. Finally I removed duplicates to show the closest matches and then also kept duplicates and stripped out the "LAP" status to ensure those that were not the closest match remained in the data.

Finally I merged them all together giving me the required outcome.

FIND_LAP  <- merge(x=Merged,y=LapsesMonth,by=c("POLICY_NO","LOB_BASE"),all.y=FALSE)

FIND_LAP$Difference <- abs(FIND_LAP$GWP - FIND_LAP$ACTUAL_PRICE)

FIND_LAP  <- FIND_LAP[order( FIND_LAP[,27] ),]

FOUND_LAP <- FIND_LAP [!duplicated(FIND_LAP[c("POLICY_NO","LOB_BASE")]),]

NOT_LAP <- FIND_LAP [duplicated(FIND_LAP[c("POLICY_NO","LOB_BASE")]),]

Hopefully this will help someone else who might be new to R and encounters the same issue.

Merge data frames based on custom condition - string comparison

This is pretty messy but should do what you're looking for:

First, expand rows for both E values, then group by the key column to check if any values from RHS E are in LHS E. Then filter based on the lookup table.

library(tidyverse)

df3 <- merge(x=df1, y=df2, by='A', all.x = TRUE)
      
check_rows <- df3 %>% 
  separate_rows(E.y, sep = ',') %>% 
  separate_rows(E.x, sep = ',') %>% 
  mutate(E.x = trimws(E.x),
         E.y = trimws(E.y)) %>% 
  group_by(A) %>% 
  mutate(check = E.y %in% E.x,
         check = ifelse(any(check == TRUE), TRUE, FALSE)) %>% 
  select(A, check) %>% 
  unique() %>% 
  filter(check == TRUE)

df3 <- df3 %>% 
  filter(A %in% check_rows$A)

Efficiently combining multiple conditions on R data.frame

One option is to use the filter() function in the dplyr package:

library(dplyr)
filter(df, col1=="success" & col2=="success" & col3==1)

You can also use commas (equivalent to &) to separate multiple arguments:

filter(df, col1=="success", col2=="success", col3==1)

merge dataframes based on multiple columns and thresholds

I first turned the city names into character vectors, since (if I understood correctly) you want to include city names that are contained within df2.

df1$city<-as.character(df1$city)
df2$city<-as.character(df2$city)

Then merge them by country:

df = merge(df1, df2, by = ("ctry"))

> df
          ctry     date.x     city.x number      col     date.y      city.y other_number other_col
1      Austria 2002-07-30     Vienna    100   cherry 2002-07-01      Vienna          101     beige
2      Denmark 1999-06-30 Copenhagen     60 cucumber 1999-06-29  Copenhagen           61    orange
3       France 1999-06-12      Paris     20   banana 1999-06-12  East-Paris           17     green
4      Germany 2003-08-29     Berlin     10    apple 2003-08-29      Berlin           13    yellow
5        Italy 1999-02-24       Rome     40   banana 1999-02-24        Rome           45       red
6       Poland 1999-03-16     Warsaw     70    apple 1999-03-14      Warsaw          780      blue
7       Russia 1999-07-16     Moscow     80    peach 1999-07-17      Moscow           85       red
8  Switzerland 2001-04-17       Bern     50    lemon 2001-04-17      Zurich           51    purple
9      Tunisia 2001-08-29      Tunis     90   cherry 2000-01-29       Tunis           90     black
10          UK 2000-08-29     London     30     pear 2000-08-29 near London         3100      blue

The library stringr will allow you to see if city.x is within city.y here (see last column):

library(stringr)
df$city_keep<-str_detect(df$city.y,df$city.x) # this returns logical vector if city.x is contained in city.y (works one way)
> df
          ctry     date.x     city.x number      col     date.y      city.y other_number other_col city_keep
1      Austria 2002-07-30     Vienna    100   cherry 2002-07-01      Vienna          101     beige      TRUE
2      Denmark 1999-06-30 Copenhagen     60 cucumber 1999-06-29  Copenhagen           61    orange      TRUE
3       France 1999-06-12      Paris     20   banana 1999-06-12  East-Paris           17     green      TRUE
4      Germany 2003-08-29     Berlin     10    apple 2003-08-29      Berlin           13    yellow      TRUE
5        Italy 1999-02-24       Rome     40   banana 1999-02-24        Rome           45       red      TRUE
6       Poland 1999-03-16     Warsaw     70    apple 1999-03-14      Warsaw          780      blue      TRUE
7       Russia 1999-07-16     Moscow     80    peach 1999-07-17      Moscow           85       red      TRUE
8  Switzerland 2001-04-17       Bern     50    lemon 2001-04-17      Zurich           51    purple     FALSE
9      Tunisia 2001-08-29      Tunis     90   cherry 2000-01-29       Tunis           90     black      TRUE
10          UK 2000-08-29     London     30     pear 2000-08-29 near London         3100      blue      TRUE

Then you can get the difference in days between dates:

df$dayDiff<-abs(as.POSIXlt(df$date.x)$yday - as.POSIXlt(df$date.y)$yday)

and the difference in numbers:

df$numDiff<-abs(df$number - df$other_number)

Here was what the resulting dataframe looks like:

> df
          ctry     date.x     city.x number      col     date.y      city.y other_number other_col city_keep dayDiff numDiff
1      Austria 2002-07-30     Vienna    100   cherry 2002-07-01      Vienna          101     beige      TRUE      29       1
2      Denmark 1999-06-30 Copenhagen     60 cucumber 1999-06-29  Copenhagen           61    orange      TRUE       1       1
3       France 1999-06-12      Paris     20   banana 1999-06-12  East-Paris           17     green      TRUE       0       3
4      Germany 2003-08-29     Berlin     10    apple 2003-08-29      Berlin           13    yellow      TRUE       0       3
5        Italy 1999-02-24       Rome     40   banana 1999-02-24        Rome           45       red      TRUE       0       5
6       Poland 1999-03-16     Warsaw     70    apple 1999-03-14      Warsaw          780      blue      TRUE       2     710
7       Russia 1999-07-16     Moscow     80    peach 1999-07-17      Moscow           85       red      TRUE       1       5
8  Switzerland 2001-04-17       Bern     50    lemon 2001-04-17      Zurich           51    purple     FALSE       0       1
9      Tunisia 2001-08-29      Tunis     90   cherry 2000-01-29       Tunis           90     black      TRUE     212       0
10          UK 2000-08-29     London     30     pear 2000-08-29 near London         3100      blue      TRUE       0    3070

But we want to drop things where city.x was not found within city.y, where the day difference is greater than 5 or the number difference is greater than 3:

df<-df[df$dayDiff<=5 & df$numDiff<=3 & df$city_keep==TRUE,]

> df
     ctry     date.x     city.x number      col     date.y     city.y other_number other_col city_keep dayDiff numDiff
2 Denmark 1999-06-30 Copenhagen     60 cucumber 1999-06-29 Copenhagen           61    orange      TRUE       1       1
3  France 1999-06-12      Paris     20   banana 1999-06-12 East-Paris           17     green      TRUE       0       3
4 Germany 2003-08-29     Berlin     10    apple 2003-08-29     Berlin           13    yellow      TRUE       0       3

What is left are the three rows that you had above (which contained dots in column 1).

Now we can drop the three columns we created, and the date and city from df2:

> df<-subset(df, select=-c(city.y, date.y, city_keep, dayDiff, numDiff))
> df
     ctry     date.x     city.x number      col other_number other_col
2 Denmark 1999-06-30 Copenhagen     60 cucumber           61    orange
3  France 1999-06-12      Paris     20   banana           17     green
4 Germany 2003-08-29     Berlin     10    apple           13    yellow

R: Merge Based on Multiple Conditions (With Non-Equal Criteria)