R: merge based on multiple conditions (with non-equal criteria)
Your data, changing stringsAsFactors=F
DF1 <- data.frame("col1" = rep(c("A","B"), 18),
"col2" = rep(c("C","D","E"), 12),
"value"= (sample(1:100,36)),
"col4" = rep(NA,36),
stringsAsFactors=F)
DF2 <- data.frame("col1" = rep("A",6),
"col2" = rep(c("C","D"),3),
"data" = rep(c(1,3),3),
"min" = seq(0,59,by=10),
"max" = seq(10,69,by=10),
stringsAsFactors=F)
Using dplyr
, 1) merge the two data using left_join
, 2) check ifelse
value
is between
min
and max
rowwise
, then 3) unselect min
and max
columns...
library(dplyr)
left_join(DF1, DF2, by=c("col1","col2")) %>%
rowwise() %>%
mutate(data = ifelse(between(value,min,max), data, NA)) %>%
select(-min, -max)
Not sure if you were expecting to perform some kind of aggregation, but here's the output of the above code
col1 col2 value col4 data
1 A C 23 NA NA
2 A C 23 NA 1
3 A C 23 NA NA
4 B D 59 NA NA
5 A E 57 NA NA
6 B C 8 NA NA
R: Merge two data frames based on two joining conditions being met
You should be able to throw them in a vector like
survey<-data.frame(name=c("John","John","Jane","Jane"), question=c(1,2,1,2),answer=c("Yes","Yes","Yes", "No"),stringsAsFactors = F)
metaData<-data.frame(first=c("John","John","Jane","Jane"), quest=c(1,2,1,2), age=c("20","20","40", "40"), stringsAsFactors = F)
merge(survey,metaData, by.x=c('name','question'), by.y=c('first','quest'))
name question answer age
1 Jane 1 Yes 40
2 Jane 2 No 40
3 John 1 Yes 20
4 John 2 Yes 20
R: Merge 2 Data Frame by Multiple Condition Using Dates & ID
I've just added a little bit of extra code in @VincentBoned 's answer.
# create 1st dataframe
ID<- c("A1", "A2","A3", "A4")
Location <- c("012A","234B","012A","238C" )
startdate <- as.Date(c("2014-11-01","2014-01-01","2015-10-01", "2015-01-01"))
enddate <- as.Date(c("2014-12-31","2014-08-31","2015-12-31","2015-12-31"))
df1<- data.frame(ID,Location, startdate, enddate, stringsAsFactors = F)
# create 2nd dataframe
ID<-c("A1", "A1", "A4")
N<- c(2,1,2)
Loss_Date <- as.Date(c("2014-11-15", "2015-12-25", "2015-11-30"))
Amt<-c("2200","1000", "500")
df2<- data.frame(ID, N, Loss_Date,Amt, stringsAsFactors = F)
library(dplyr)
full_join(df1, df2, by="ID") %>%
mutate(condition = (Loss_Date >= startdate & Loss_Date <= enddate)) %>%
mutate(N = ifelse(condition & !is.na(condition), N, 0)) %>%
mutate(Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),origin="1970-01-01")) %>%
mutate(Amt = ifelse(condition & !is.na(condition), Amt, 0)) %>%
select(-condition) %>%
group_by(ID) %>% # for each ID
mutate(Nrows = n()) %>% # count how many rows they have in the final table
ungroup() %>%
filter(!(Nrows > 1 & is.na(Loss_Date))) %>% # filter out rows with IDs that have more than 1 rows and those rows are not matched
select(-Nrows)
# ID Location startdate enddate N Loss_Date Amt
# 1 A1 012A 2014-11-01 2014-12-31 2 2014-11-15 2200
# 2 A2 234B 2014-01-01 2014-08-31 0 <NA> 0
# 3 A3 012A 2015-10-01 2015-12-31 0 <NA> 0
# 4 A4 238C 2015-01-01 2015-12-31 2 2015-11-30 500
If you understood how the above code works (step by step) you can use a more compact version that returns the same result:
full_join(df1, df2, by="ID") %>%
mutate(condition = (Loss_Date >= startdate & Loss_Date <= enddate),
N = ifelse(condition & !is.na(condition), N, 0),
Loss_Date = as.Date(ifelse(condition, Loss_Date, NA),origin="1970-01-01"),
Amt = ifelse(condition & !is.na(condition), Amt, 0)) %>%
group_by(ID) %>%
mutate(Nrows = n()) %>%
filter(!(Nrows > 1 & is.na(Loss_Date))) %>%
select(-c(condition, Nrows))
Matching rows of two dataframes based on multiple conditions
We can create a row index for each of the data frame and then full_join
based on Name
column. We then select only those rows where Region
partially matches and Job
and State
partially match. We also select rows when both columns have NA
values.
library(dplyr)
library(stringr)
df1 %>%
mutate(Column1=row_number()) %>%
full_join(df2 %>% mutate(Column2 = row_number()), by = 'Name') %>%
filter((str_detect(Region.x, fixed(Region.y)) |
(is.na(Region.x) & is.na(Region.y))) &
(str_detect(Job, fixed(State)) | (is.na(Job) & is.na(State)))) %>%
select(Column1, Column2)
# Column1 Column2
#1 1 3
#2 2 2
data
df1 <- structure(list(Name = c("A", "B", "C"), Region = c("BostonCity",
"WashingtonD.C.", "NewYork"), State = c("Mass", NA, "NY")), row.names = c(NA,
-3L), class = "data.frame")
df2 <- structure(list(Name = c("C", "B", "A", "D"), Region = c("Boston",
"D.C.", "Boston", "Dallas"), Job = c("Massachusetts", NA, "Massachusetts",
"Texas")), row.names = c(NA, -4L), class = "data.frame")
Merge with multiple conditions and nearest numerical match
I could not work out how to get a working solution using data.table, so I re-thought my approach and have come up with a solution.
First of all I merged the two datasets, and then removed any entries that did not have a stauts of "LAP", this gave me all of the NON Lapsed entries:
NON_LAP <- merge(x=Merged,y=LapsesMonth,by=c("POLICY_NO","LOB_BASE"),all.x=TRUE)
NON_LAP <- NON_LAP [!grepl("LAP", NON_LAP$Status, ignore.case=FALSE),]
Next I merged again, this time looking specifically for the lapsed cases. To work out which was the cloest match I used the abs function, then I ordered by the lowest difference to get the closest matches in order. Finally I removed duplicates to show the closest matches and then also kept duplicates and stripped out the "LAP" status to ensure those that were not the closest match remained in the data.
Finally I merged them all together giving me the required outcome.
FIND_LAP <- merge(x=Merged,y=LapsesMonth,by=c("POLICY_NO","LOB_BASE"),all.y=FALSE)
FIND_LAP$Difference <- abs(FIND_LAP$GWP - FIND_LAP$ACTUAL_PRICE)
FIND_LAP <- FIND_LAP[order( FIND_LAP[,27] ),]
FOUND_LAP <- FIND_LAP [!duplicated(FIND_LAP[c("POLICY_NO","LOB_BASE")]),]
NOT_LAP <- FIND_LAP [duplicated(FIND_LAP[c("POLICY_NO","LOB_BASE")]),]
Hopefully this will help someone else who might be new to R and encounters the same issue.
Merge data frames based on custom condition - string comparison
This is pretty messy but should do what you're looking for:
First, expand rows for both E
values, then group by the key column to check if any values from RHS E
are in LHS E
. Then filter based on the lookup table.
library(tidyverse)
df3 <- merge(x=df1, y=df2, by='A', all.x = TRUE)
check_rows <- df3 %>%
separate_rows(E.y, sep = ',') %>%
separate_rows(E.x, sep = ',') %>%
mutate(E.x = trimws(E.x),
E.y = trimws(E.y)) %>%
group_by(A) %>%
mutate(check = E.y %in% E.x,
check = ifelse(any(check == TRUE), TRUE, FALSE)) %>%
select(A, check) %>%
unique() %>%
filter(check == TRUE)
df3 <- df3 %>%
filter(A %in% check_rows$A)
Efficiently combining multiple conditions on R data.frame
One option is to use the filter()
function in the dplyr package:
library(dplyr)
filter(df, col1=="success" & col2=="success" & col3==1)
You can also use commas (equivalent to &) to separate multiple arguments:
filter(df, col1=="success", col2=="success", col3==1)
merge dataframes based on multiple columns and thresholds
I first turned the city names into character vectors, since (if I understood correctly) you want to include city names that are contained within df2.
df1$city<-as.character(df1$city)
df2$city<-as.character(df2$city)
Then merge them by country:
df = merge(df1, df2, by = ("ctry"))
> df
ctry date.x city.x number col date.y city.y other_number other_col
1 Austria 2002-07-30 Vienna 100 cherry 2002-07-01 Vienna 101 beige
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow
5 Italy 1999-02-24 Rome 40 banana 1999-02-24 Rome 45 red
6 Poland 1999-03-16 Warsaw 70 apple 1999-03-14 Warsaw 780 blue
7 Russia 1999-07-16 Moscow 80 peach 1999-07-17 Moscow 85 red
8 Switzerland 2001-04-17 Bern 50 lemon 2001-04-17 Zurich 51 purple
9 Tunisia 2001-08-29 Tunis 90 cherry 2000-01-29 Tunis 90 black
10 UK 2000-08-29 London 30 pear 2000-08-29 near London 3100 blue
The library stringr
will allow you to see if city.x is within city.y here (see last column):
library(stringr)
df$city_keep<-str_detect(df$city.y,df$city.x) # this returns logical vector if city.x is contained in city.y (works one way)
> df
ctry date.x city.x number col date.y city.y other_number other_col city_keep
1 Austria 2002-07-30 Vienna 100 cherry 2002-07-01 Vienna 101 beige TRUE
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange TRUE
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green TRUE
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow TRUE
5 Italy 1999-02-24 Rome 40 banana 1999-02-24 Rome 45 red TRUE
6 Poland 1999-03-16 Warsaw 70 apple 1999-03-14 Warsaw 780 blue TRUE
7 Russia 1999-07-16 Moscow 80 peach 1999-07-17 Moscow 85 red TRUE
8 Switzerland 2001-04-17 Bern 50 lemon 2001-04-17 Zurich 51 purple FALSE
9 Tunisia 2001-08-29 Tunis 90 cherry 2000-01-29 Tunis 90 black TRUE
10 UK 2000-08-29 London 30 pear 2000-08-29 near London 3100 blue TRUE
Then you can get the difference in days between dates:
df$dayDiff<-abs(as.POSIXlt(df$date.x)$yday - as.POSIXlt(df$date.y)$yday)
and the difference in numbers:
df$numDiff<-abs(df$number - df$other_number)
Here was what the resulting dataframe looks like:
> df
ctry date.x city.x number col date.y city.y other_number other_col city_keep dayDiff numDiff
1 Austria 2002-07-30 Vienna 100 cherry 2002-07-01 Vienna 101 beige TRUE 29 1
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange TRUE 1 1
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green TRUE 0 3
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow TRUE 0 3
5 Italy 1999-02-24 Rome 40 banana 1999-02-24 Rome 45 red TRUE 0 5
6 Poland 1999-03-16 Warsaw 70 apple 1999-03-14 Warsaw 780 blue TRUE 2 710
7 Russia 1999-07-16 Moscow 80 peach 1999-07-17 Moscow 85 red TRUE 1 5
8 Switzerland 2001-04-17 Bern 50 lemon 2001-04-17 Zurich 51 purple FALSE 0 1
9 Tunisia 2001-08-29 Tunis 90 cherry 2000-01-29 Tunis 90 black TRUE 212 0
10 UK 2000-08-29 London 30 pear 2000-08-29 near London 3100 blue TRUE 0 3070
But we want to drop things where city.x was not found within city.y, where the day difference is greater than 5 or the number difference is greater than 3:
df<-df[df$dayDiff<=5 & df$numDiff<=3 & df$city_keep==TRUE,]
> df
ctry date.x city.x number col date.y city.y other_number other_col city_keep dayDiff numDiff
2 Denmark 1999-06-30 Copenhagen 60 cucumber 1999-06-29 Copenhagen 61 orange TRUE 1 1
3 France 1999-06-12 Paris 20 banana 1999-06-12 East-Paris 17 green TRUE 0 3
4 Germany 2003-08-29 Berlin 10 apple 2003-08-29 Berlin 13 yellow TRUE 0 3
What is left are the three rows that you had above (which contained dots in column 1).
Now we can drop the three columns we created, and the date and city from df2:
> df<-subset(df, select=-c(city.y, date.y, city_keep, dayDiff, numDiff))
> df
ctry date.x city.x number col other_number other_col
2 Denmark 1999-06-30 Copenhagen 60 cucumber 61 orange
3 France 1999-06-12 Paris 20 banana 17 green
4 Germany 2003-08-29 Berlin 10 apple 13 yellow
Related Topics
How to Control Ggplot's Plotting Area Proportions Instead of Fitting Them to Devices in R
How to Ddply() Without Sorting
Setting Midpoint for Continuous Diverging Color Scale on a Heatmap
R - Count Shiny Download Button Clicks
How to Count the Observations Falling in Each Node of a Tree
R: Adding Alpha Bags to a 2D or 3D Scatterplot
How to Change the Order of the Panels in Simple Lattice Graphs
Plot Probability Heatmap/Hexbin with Different Sized Bins
Set Upper Limit in Ggplot to Include Label Greater Than the Maximum Value
How to Select All Unique Combinations of Two Columns in an R Data Frame
Replace Numbers in Data Frame Column in R
Run Asynchronous Function in R
How to Find Previous Sunday in R
How to Install the R Package Rgl on Ubuntu 9.10, Using R Version 2.12.1
Want Only the Time Portion of a Date-Time Object in R