In R, Merge Two Data Frames, Fill Down The Blanks

In R, Merge two data frames, fill down the blanks

It sounds like you need na.locf from the package zoo (stands for last observation carried forward):

library(zoo)
tbl <- merge(small.table, big.table, by = "idx", all.y=TRUE)
tbl$color2 <- na.locf(tbl$color,na.rm = FALSE)

Merge uneven data frames in R by common column and fill in empty elements by NA

merge function works fine for me. As both dataframes have the same column name X, it can be used to merge using by.

claims = data.frame(X = c(10,24,30,35,64,104), 
                    TransactionDateTime = c('JUL-15','APR-17','SEP-15','JUL-15','APR-16','SEP-15'))
claims
#     X TransactionDateTime
# 1  10              JUL-15
# 2  24              APR-17
# 3  30              SEP-15
# 4  35              JUL-15
# 5  64              APR-16
# 6 104              SEP-15

lastaction = data.frame(X = c(10,24,30,35,40,57), lastvalue = c(6,1,4,6,6,1), 
                        Approvalmonth = c('15-OCT','17-JAN','16-MAR','15-OCT','15-SEP','17-JUN'), 
                        lastvalue = c(0,1,0,0,0,1)) 
lastaction
#    X lastvalue Approvalmonth lastvalue
# 1 10         6        15-OCT         0
# 2 24         1        17-JAN         1
# 3 30         4        16-MAR         0
# 4 35         6        15-OCT         0
# 5 40         6        15-SEP         0
# 6 57         1        17-JUN         1

merge(claims, lastaction, by = "X", all = TRUE)
#     X TransactionDateTime lastvalue Approvalmonth lastvalue.1
# 1  10              JUL-15         6        15-OCT           0
# 2  24              APR-17         1        17-JAN           1
# 3  30              SEP-15         4        16-MAR           0
# 4  35              JUL-15         6        15-OCT           0
# 5  40                <NA>         6        15-SEP           0
# 6  57                <NA>         1        17-JUN           1
# 7  64              APR-16        NA          <NA>          NA
# 8 104              SEP-15        NA          <NA>          NA

dplyr's full_join as well works

dplyr::full_join(claims, lastaction, by = 'X')
    X TransactionDateTime lastvalue Approvalmonth lastvalue.y
1  10              JUL-15         6        15-OCT           6
2  24              APR-17         1        17-JAN           1
3  30              SEP-15         4        16-MAR           4
4  35              JUL-15         6        15-OCT           6
5  64              APR-16        NA          <NA>          NA
6 104              SEP-15        NA          <NA>          NA
7  40                <NA>         6        15-SEP           6
8  57                <NA>         1        17-JUN           1

matching and filling in blanks of data frame in R

I am not sure how efficient this is, but this seems to be working. I am using 3 custom functions.

This would be easy to adapt to have it not match if there are two many NA's in a row and other cases such as that. The main trick is in f_compare() to build two dataframes representing each row combination to compare against rather than looping.

Functions:

f_compare()

Compare each row to itself. We use combn() to develop all the unique row combinations. This function will return a matrix with two columns. These two columns specify row numbers which are "duplicates" - defined as ignoring the first column and by counting NA as a match.

Edit: Extended the logic to force a minimum number of fields to actually match by value rather than from an NA wildcard. We require the number of TRUE match values in the row plus the number of NA values in the row to equal the total number of fields.

Edit2: Added a check to see if a record is "bad". If there are matched pairs (a, b) and (c, b), it checks to see if (a, c) or (c, a) is a pair as well. If not, then the offending record c is dropped from the database. Again, this could fairly easily be adapted to flag rather than drop the row.

f_compare <- function(dat, .min_match = 1, .exclude_cols = c(1)) {
  
  # grid of unique row id combinations
  dat_rows <- t(combn(seq_len(nrow(dat)), 2))
  
  # grid of all row id combinations (e.g., (1, 2) and (2, 1))
  dat_rows_all <- expand.grid(seq_len(nrow(dat)), seq_len(nrow(dat)))
  dat_rows_all <- dat_rows_all[dat_rows_all[,1] != dat_rows_all[,2], ]
  
  # function to find record matches based on a grid specification
  f_match <- function(dat, dat_rows, .min_match, .exclude_cols) {
    
    compare <- dat[dat_rows[, 1], -.exclude_cols] == dat[dat_rows[, 2], -.exclude_cols]
    
    row_true <- rowSums(compare, na.rm = TRUE)
    row_na <- rowSums(is.na(compare))
    
    which_rows <- which((row_true >= .min_match) & (row_true + row_na == ncol(compare)))
    rbind(dat_rows[which_rows,])
    
  }
  
  # matches for each grid
  match_rows <- f_match(dat, dat_rows, .min_match, .exclude_cols)
  match_rows_all <- f_match(dat, dat_rows_all, .min_match, .exclude_cols)
  
  # function to determine if it is a "bad" record
  f_bad <- function(check_index, id_comb, id_all) {
    
    if (length(id_comb[id_comb[,2] == check_index, 1]) > 1) {
      trans_rows <- t(combn(id_comb[id_comb[,2] == check_index, 1], 2))
    
      compare_trans <- id_all[rep(seq_len(nrow(id_all)), times = nrow(trans_rows)),] == trans_rows[rep(seq_len(nrow(trans_rows)), each = nrow(id_all)),]
    
      return(!any(rowSums(compare_trans) == ncol(compare_trans)))
    } else {
      return(FALSE)
    }
  }
  
  # check all rows with a potential match to see if it is "bad"
  check_ids <- unique(match_rows[,2])
  
  if (length(check_ids) > 0) {
    bad_ids <- check_ids[sapply(check_ids, f_bad, match_rows, match_rows_all)]
  } else {
    bad_ids = check_ids
  }
  
  list(id = rbind(match_rows[!(match_rows[,2] %in% bad_ids), ]), bad_id = bad_ids)

}

f_merge()

Given two vectors, x and y, fill in NA slots in each with values from the other.

f_merge <- function(id, dat) {
  x <- dat[id[1],]
  y <- dat[id[2],]
  
  y[is.na(y)] <- x[is.na(y)]
  x[is.na(x)] <- y[is.na(x)]
  
  x
}

merge_records()

Recursively work through the data set until there are no more duplicates left to merge. There is some case logic in here to account for things like R converting a single row matrix to a vector and when to exit the recursion.

Edit2: Modified the merge to drop "bad" records.

merge_records <- function(dat) {
  merge_id <- f_compare(dat)
  
  # drop bad rows
  if (length(merge_id$bad_id) > 0) {
    dat <- dat[-merge_id$bad_id,]
  }
  
  dat2 <- do.call("rbind", apply(merge_id$id, 1, f_merge, dat = dat))
  dat2 <- rbind(dat2, dat[which(!(seq_len(nrow(dat)) %in% c(merge_id$id))), ])
  
  if (nrow(dat2) > 1) {
    dat2 <- dat2[which(!(duplicated(dat2))),]
    
    if (nrow(f_compare(dat2)$id) > 0) merge_records(dat2) else return(dat2)
  } else {
    dat2
  }
  
}

End Result:

merge_records(df)

    id first      last birthyear father
1  a12 linda      john      1991    dan
21 3n8   max      well      1915   mike
5  1y9   pam degeneres      1855   <NA>
6  84z  <NA> degeneres      1950   hank

Merge two data frames to fill in missing dates

One way, with dplyr:

library(dplyr)
df3 <- df1 %>% filter(year < 1920) %>% 
               left_join(filter(df2, year == 1910) %>% select(-year))
df3 <- df1 %>% filter(year >= 1920) %>% 
               left_join(filter(df2, year == 1920) %>% select(-year)) %>% 
               bind_rows(df3) %>% 
               arrange(year, state)

It's split into two chains, one that just joins the pre-1920 data, the other which does the post-1920, joins the two, and sorts.

Update based on comments:

To split the years into 5-year increments and join on df2 values in those increments:

df1$year_factor <- cut(df1$year, seq(1900, 1950, 5), right = FALSE)
df2$year_factor <- cut(df2$year, seq(1900, 1950, 5), right = FALSE)
df3 <- df1 %>% left_join(select(df2, -year)) %>% select(-year_factor)

This is actually simpler, but it introduces (and removes) a dummy variable, and cut can be a little finicky; play with it as you like. It produces:

   year      state acre_yield          w
1  1910   colorado       15.5 0.11777361
2  1910     kansas         19 0.33202730
3  1910 new mexico         15 0.01760644
4  1910   oklahoma         16 0.49216919
5  1910      texas         22 0.04042345
6  1911   colorado         14 0.11777361
7  1911     kansas       14.5 0.33202730
8  1911 new mexico       19.5 0.01760644
9  1911   oklahoma          7 0.49216919
10 1911      texas         11 0.04042345
11 1919      texas         23         NA
12 1920   colorado       18.5 0.30557449
13 1920     kansas       26.2 0.32107132
14 1920 new mexico         20 0.05836014
15 1920   oklahoma         26 0.26414535
16 1920      texas         20 0.05084870
17 1921   colorado         12 0.30557449
18 1921     kansas       22.8 0.32107132
19 1921 new mexico       19.5 0.05836014
20 1921   oklahoma         23 0.26414535
21 1921      texas         18 0.05084870

Note the one NA value for the 1919 row; since df2 doesn't have any values between 1915 and 1919, there's nothing to insert. To go by decades, change the 5 in seq to 10, or otherwise set as you prefer.

Merge unequal dataframes and replace missing rows with 0

Take a look at the help page for merge. The all parameter lets you specify different types of merges. Here we want to set all = TRUE. This will make merge return NA for the values that don't match, which we can update to 0 with is.na():

zz <- merge(df1, df2, all = TRUE)
zz[is.na(zz)] <- 0

> zz
  x y
1 a 0
2 b 1
3 c 0
4 d 0
5 e 0

Updated many years later to address follow up question

You need to identify the variable names in the second data table that you aren't merging on - I use setdiff() for this. Check out the following:

df1 = data.frame(x=c('a', 'b', 'c', 'd', 'e', NA))
df2 = data.frame(x=c('a', 'b', 'c'),y1 = c(0,1,0), y2 = c(0,1,0))

#merge as before
df3 <- merge(df1, df2, all = TRUE)
#columns in df2 not in df1
unique_df2_names <- setdiff(names(df2), names(df1))
df3[unique_df2_names][is.na(df3[, unique_df2_names])] <- 0

^{Created on 2019-01-03 by the reprex package (v0.2.1)}

Combining data frames in R without overwriting data with blanks

If we want to overwrite, create a logical matrix and use that to assign

i1 <- df_all_data == 'outlier' & !is.na(df_all_data)
df_all_data[i1] <- df_outliers[i1]

How to merge dataframes in R when empty strings exist

Assuming that the order of these your two data frames is going to be stable, this can be done just by selecting the missing rows and filling them with the corresponding non-empty rows from the other one.

file3 <- file1
file3[file3$date == "", ] <- file2[file2$date != "", ]
file3

# 
# date   X
# 1 2021-01-01 foo
# 2 2021-01-02 bar
# 3 2021-01-03 bar
# 4 2021-01-04 foo

I've created a new object called file3 just in case you don't want to modify your original. If you don't mind overwriting the original, this can be shortened down as:

file1[file1$date == "", ] <- file2[file2$date != "", ]

Edit: @jay.sf asks how this would work with NA in the data. I've extended the OP's original example to show how I would handle NA in my approach:

date1 <- c("2021-01-01", "2021-01-02", "", "", NA)
X1 <- c("foo", "bar", "", "", NA)
date2 <- c("", "", "2021-01-03", "2021-01-04", "2021-01-05")
X2 <- c("", "", "bar", "foo", "bar")

file1 <- data.frame(date = date1, X = X1)
file2 <- data.frame(date = date2, X = X2)

file1
# > file1
# date    X
# 1 2021-01-01  foo
# 2 2021-01-02  bar
# 3                
# 4                
# 5       <NA> <NA>

file2
# > file2
# date   X
# 1               
# 2               
# 3 2021-01-03 bar
# 4 2021-01-04 foo
# 5 2021-01-05 bar

file3 <- file1
empty_rows <- file1$date == "" | is.na(file1$date)
file3[empty_rows, ] <- file2[empty_rows, ]
file3

# > file3
# date   X
# 1 2021-01-01 foo
# 2 2021-01-02 bar
# 3 2021-01-03 bar
# 4 2021-01-04 foo
# 5 2021-01-05 bar

I've consolidated it a bit by creating the object empty_rows, which is a logical indicating rows in file1 that have empty text or NA.

Merged data frame leaving some columns blank

A working version of your example:

# create data
df_finance <- data.frame(
    member_id = c(rep(1111, 3), rep(2234, 2), rep(3355,3)),
    descrip   = rep("x", 8))

df_membership <- data.frame(
    member_id = c(1111, 2234, 3355),
    alt_id    = c(9876, 7777, 522))

# with base R
merge(df_finance, df_membership, by="member_id", all.x=T)

  member_id descrip alt_id
1      1111       x   9876
2      1111       x   9876
3      1111       x   9876
4      2234       x   7777
5      2234       x   7777
6      3355       x    522
7      3355       x    522
8      3355       x    522

# tidyverse style
dplyr::left_join(df_finance, df_membership, by="member_id")

  member_id descrip alt_id
1      1111       x   9876
2      1111       x   9876
3      1111       x   9876
4      2234       x   7777
5      2234       x   7777
6      3355       x    522
7      3355       x    522
8      3355       x    522

In R, Merge Two Data Frames, Fill Down The Blanks