Can Dplyr Join on Multiple Columns or Composite Key

Can dplyr join on multiple columns or composite key?

Updating to use tibble()

You can pass a named vector of length greater than 1 to the by argument of left_join():

library(dplyr)

d1 <- tibble(
  x = letters[1:3],
  y = LETTERS[1:3],
  a = rnorm(3)
  )

d2 <- tibble(
  x2 = letters[3:1],
  y2 = LETTERS[3:1],
  b = rnorm(3)
  )

left_join(d1, d2, by = c("x" = "x2", "y" = "y2"))

How to join two dataframes with dplyr based on two columns with different names in each dataframe?

df3 <- dplyr::left_join(df1, df2, by=c("name1" = "name3", "name2" = "name4"))

Compare multiple pairs of x/y columns after left join and if different use y in R

It may be easier with coalesce (if there are not much conditions or else can use case_when). In addition, assuming that there are always .y columns for the corresponding .x column, loop across the .x columns, replace the substring .x of the column name (cur_column()) with .y, get the value, apply case_when, update the column name within .name and remove the unused columns using .keep

library(dplyr)
library(stringr)
left_join(df1, df1_update, by = "id") %>% 
   mutate(across(ends_with('.x'), 
   ~ {
      xdat <- as.character(.x)
      ydat <- as.character(get(str_replace(cur_column(), '\\.x', '.y')))
      case_when(is.na(ydat) ~ xdat, 
              xdat == ydat ~ xdat,
               xdat != ydat ~ ydat)
     }, 
      .names = "{str_remove(.col, '.x')}"), .keep = 'unused') %>%
   type.convert(as.is = TRUE)

-output

 id new_info       date   type
1  1     note 2020-01-25 type_A
2  2      nil 2020-12-03 type_B
3  3     note 2019-08-20 type_B
4  4     <NA> 2021-01-01 type_C
5  5     <NA> 2021-02-01 type_B

Joining multiple data frames based on columns values using dplyr

library(tidyverse)
bind_rows(df1, df2, df3, .id = "week") %>%
  rowid_to_column() %>%   # Added for nonunique combos of Camp/Sunday_endwk
  pivot_wider(c(Campaign_Name, Sunday_endwk, rowid), 
              names_from = week, values_from = Actual_Sales:Adj_Rsquared)

Result:

# A tibble: 5 x 14
  Campaign_Name Sunday_endwk Actual_Sales_1 Actual_Sales_2 Actual_Sales_3 Predictions_1 Predictions_2 Predictions_3 Version_1 Version_2 Version_3 Adj_Rsquared_1 Adj_Rsquared_2 Adj_Rsquared_3
  <chr>         <chr>                 <dbl>          <dbl>          <dbl>         <dbl>         <dbl>         <dbl> <chr>     <chr>     <chr>              <dbl>          <dbl>          <dbl>
1 Z019          20190106                 12           12.2              2         11.9          11.8           2.03 layer_1   layer_2   layer_3             0.85           0.88           0.82
2 Z005          20190113                  2           10.2             10          2.03         10.1          10.5  layer_1   layer_2   layer_3             0.85           0.88           0.82
3 Z019          20190113                  5            2.2             12          5.1           2.05         12.3  layer_1   layer_2   layer_3             0.85           0.88           0.82
4 Z005          20190106                 10            5.2             12         10.5           5.4          11.9  layer_1   layer_2   layer_3             0.85           0.88           0.82
5 Z019          20190120                 NA           NA                5         NA            NA             5.1  NA        NA        layer_3            NA             NA              0.82

Joining tables using variable columns - dplyr, r, join

An option is to filter with if_any and then bind the subset rows with the df_att

library(dplyr)
df_att2 <- df_alias %>%
    filter(if_any(everything(), ~ .x %in% df_att$equip)) %>% 
    arrange(na.omit(unlist(across(everything(), ~ match(df_att$equip, .x))))) %>% 
    bind_cols(df_att, .)

-checking with OP's expected (changed the object name 'df_att' to 'out' to avoid any confusion)

> all.equal(df_att2, out)
[1] TRUE

How to join tables with multiple primary keys in R (efficiently)?

Concerning your second question. You can use suffix to add a suffix to the duplicated columns so that they can be easily dropped after the join.

To answer your first question we can benchmark the different options. As you are concerned with efficiency I also compare dplyr::left_join to a base R approach using merge which gives us four options

left_join by all key columns
left_join by only one key + getting rid of duplicates
merge by all key columns
merge by only one key + getting rid of duplicates

According to my benchmark on your example data, left_join by all keys is faster than joining by one and getting rid of duplicates afterwards. However, if you are concerned with efficiency than I would recommed to have a look at merge

library(dplyr)

tab1 <- data.frame(
  id = c("a", "b", "c", "d"),
  name = c("Mike", "Anna", "John", "Edward"),
  score = c(10, 20, 30, 20)
)
tab2 <- data.frame(
  id = c("a", "b", "c", "d"),
  name = c("Mike", "Anna", "John", "Edward"),
  color = c("red", "blue", "blue", "orange")
)
dplyr::left_join(tab1, tab2)
#> Joining, by = c("id", "name")
#>   id   name score  color
#> 1  a   Mike    10    red
#> 2  b   Anna    20   blue
#> 3  c   John    30   blue
#> 4  d Edward    20 orange

f1 <- function() left_join(tab1, tab2) 
f2 <- function() { 
  left_join(tab1, tab2, by = "id", suffix = c("", "_drop")) %>% 
    select(-ends_with("_drop")) 
}
f3 <- function() merge(tab1, tab2, by = c("id", "name"), all.x = TRUE)
f4 <- function() { 
  x <- merge(tab1, tab2, by = "id", all.x = TRUE, suffix = c("", "_drop"))
  x[, names(x)[!grepl("_drop$", names(x))]] 
}

microbenchmark::microbenchmark(f1(), f2(), f3(), f4())

#> Unit: microseconds
#>  expr      min        lq      mean   median       uq       max neval cld
#>  f1() 1574.201 1755.9010 2141.5060 1971.501 2303.651  6831.900   100  b 
#>  f2() 2861.602 3040.8505 3841.6990 3264.551 3920.701 17639.802   100   c
#>  f3()  471.801  527.6515  657.6511  588.501  702.851  2607.201   100 a  
#>  f4()  395.800  442.3005  583.3340  491.801  590.751  3824.800   100 a

Dynamic variables names in dplyr function across multiple columns

We could use .names in across to rename

mean_fun_multicols <- function(data, group_cols, summary_cols) {
  data %>%
    group_by(across({{group_cols}})) %>%
     summarise(across({{ summary_cols }},
         ~ mean(., na.rm = TRUE), .names = "mean_{.col}"), .groups = "drop")
}

-testing

mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt))
# A tibble: 8 × 4
    cyl  gear mean_mpg mean_wt
  <dbl> <dbl>    <dbl>   <dbl>
1     4     3     21.5    2.46
2     4     4     26.9    2.38
3     4     5     28.2    1.83
4     6     3     19.8    3.34
5     6     4     19.8    3.09
6     6     5     19.7    2.77
7     8     3     15.0    4.10
8     8     5     15.4    3.37

NOTE: The := is mainly used when there is a single column in tidyverse

If we use the OP's function, we are assigning multiple columns to a single column and this returns a tibble instead of a normal column. We may need to unpack

library(tidyr)
> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>% str
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
grouped_df [8 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
 $ cyl            : num [1:8] 4 4 4 6 6 6 8 8
 $ gear           : num [1:8] 3 4 5 3 4 5 3 5
 $ mean_c(mpg, wt): tibble [8 × 2] (S3: tbl_df/tbl/data.frame)
  ..$ mpg: num [1:8] 21.5 26.9 28.2 19.8 19.8 ...
  ..$ wt : num [1:8] 2.46 2.38 1.83 3.34 3.09 ...
 - attr(*, "groups")= tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
  ..$ cyl  : num [1:3] 4 6 8
  ..$ .rows: list<int> [1:3] 
  .. ..$ : int [1:3] 1 2 3
  .. ..$ : int [1:3] 4 5 6
  .. ..$ : int [1:2] 7 8
  .. ..@ ptype: int(0) 
  ..- attr(*, ".drop")= logi TRUE

> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>% 
        unpack(where(is_tibble))
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
# A tibble: 8 × 4
# Groups:   cyl [3]
    cyl  gear   mpg    wt
  <dbl> <dbl> <dbl> <dbl>
1     4     3  21.5  2.46
2     4     4  26.9  2.38
3     4     5  28.2  1.83
4     6     3  19.8  3.34
5     6     4  19.8  3.09
6     6     5  19.7  2.77
7     8     3  15.0  4.10
8     8     5  15.4  3.37

Performing a left join on multiple columns, one of which is a partial string

How about this?

library(dplyr)

df1 %>%
  mutate(first_name_1st2char = substr(first_name, 1, 2)) %>%
  left_join(df2 %>% mutate(first_name_1st2char = substr(first_name, 1, 2)), 
            by = c("first_name_1st2char", "last_name", "year")) %>%
  select(-first_name_1st2char)

Output is:

  first_name.x last_name year first_name.y age
1         john      asdf 2018          joe  12
2         jack    qwerty 2017         jake  34

Sample data:

df1 <- structure(list(first_name = structure(c(2L, 1L), .Label = c("jack", 
"john"), class = "factor"), last_name = structure(1:2, .Label = c("asdf", 
"qwerty"), class = "factor"), year = c(2018, 2017)), .Names = c("first_name", 
"last_name", "year"), row.names = c(NA, -2L), class = "data.frame")

df2 <- structure(list(first_name = structure(c(3L, 2L, 1L), .Label = c("donald", 
"jake", "joe"), class = "factor"), last_name = structure(c(1L, 
3L, 2L), .Label = c("asdf", "jong", "qwerty"), class = "factor"), 
    year = c(2018, 2017, 2018), age = c(12, 34, 5)), .Names = c("first_name", 
"last_name", "year", "age"), row.names = c(NA, -3L), class = "data.frame")

Can Dplyr Join on Multiple Columns or Composite Key