Can Dplyr Join on Multiple Columns or Composite Key

Can dplyr join on multiple columns or composite key?

Updating to use tibble()

You can pass a named vector of length greater than 1 to the by argument of left_join():

library(dplyr)

d1 <- tibble(
x = letters[1:3],
y = LETTERS[1:3],
a = rnorm(3)
)

d2 <- tibble(
x2 = letters[3:1],
y2 = LETTERS[3:1],
b = rnorm(3)
)

left_join(d1, d2, by = c("x" = "x2", "y" = "y2"))

How to join two dataframes with dplyr based on two columns with different names in each dataframe?


df3 <- dplyr::left_join(df1, df2, by=c("name1" = "name3", "name2" = "name4"))

Compare multiple pairs of x/y columns after left join and if different use y in R

It may be easier with coalesce (if there are not much conditions or else can use case_when). In addition, assuming that there are always .y columns for the corresponding .x column, loop across the .x columns, replace the substring .x of the column name (cur_column()) with .y, get the value, apply case_when, update the column name within .name and remove the unused columns using .keep

library(dplyr)
library(stringr)
left_join(df1, df1_update, by = "id") %>%
mutate(across(ends_with('.x'),
~ {
xdat <- as.character(.x)
ydat <- as.character(get(str_replace(cur_column(), '\\.x', '.y')))
case_when(is.na(ydat) ~ xdat,
xdat == ydat ~ xdat,
xdat != ydat ~ ydat)
},
.names = "{str_remove(.col, '.x')}"), .keep = 'unused') %>%
type.convert(as.is = TRUE)

-output

 id new_info       date   type
1 1 note 2020-01-25 type_A
2 2 nil 2020-12-03 type_B
3 3 note 2019-08-20 type_B
4 4 <NA> 2021-01-01 type_C
5 5 <NA> 2021-02-01 type_B

Joining multiple data frames based on columns values using dplyr


library(tidyverse)
bind_rows(df1, df2, df3, .id = "week") %>%
rowid_to_column() %>% # Added for nonunique combos of Camp/Sunday_endwk
pivot_wider(c(Campaign_Name, Sunday_endwk, rowid),
names_from = week, values_from = Actual_Sales:Adj_Rsquared)

Result:

# A tibble: 5 x 14
Campaign_Name Sunday_endwk Actual_Sales_1 Actual_Sales_2 Actual_Sales_3 Predictions_1 Predictions_2 Predictions_3 Version_1 Version_2 Version_3 Adj_Rsquared_1 Adj_Rsquared_2 Adj_Rsquared_3
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 Z019 20190106 12 12.2 2 11.9 11.8 2.03 layer_1 layer_2 layer_3 0.85 0.88 0.82
2 Z005 20190113 2 10.2 10 2.03 10.1 10.5 layer_1 layer_2 layer_3 0.85 0.88 0.82
3 Z019 20190113 5 2.2 12 5.1 2.05 12.3 layer_1 layer_2 layer_3 0.85 0.88 0.82
4 Z005 20190106 10 5.2 12 10.5 5.4 11.9 layer_1 layer_2 layer_3 0.85 0.88 0.82
5 Z019 20190120 NA NA 5 NA NA 5.1 NA NA layer_3 NA NA 0.82

Joining tables using variable columns - dplyr, r, join

An option is to filter with if_any and then bind the subset rows with the df_att

library(dplyr)
df_att2 <- df_alias %>%
filter(if_any(everything(), ~ .x %in% df_att$equip)) %>%
arrange(na.omit(unlist(across(everything(), ~ match(df_att$equip, .x))))) %>%
bind_cols(df_att, .)

-checking with OP's expected (changed the object name 'df_att' to 'out' to avoid any confusion)

> all.equal(df_att2, out)
[1] TRUE

How to join tables with multiple primary keys in R (efficiently)?

Concerning your second question. You can use suffix to add a suffix to the duplicated columns so that they can be easily dropped after the join.

To answer your first question we can benchmark the different options. As you are concerned with efficiency I also compare dplyr::left_join to a base R approach using merge which gives us four options

  1. left_join by all key columns
  2. left_join by only one key + getting rid of duplicates
  3. merge by all key columns
  4. merge by only one key + getting rid of duplicates

According to my benchmark on your example data, left_join by all keys is faster than joining by one and getting rid of duplicates afterwards. However, if you are concerned with efficiency than I would recommed to have a look at merge

library(dplyr)

tab1 <- data.frame(
id = c("a", "b", "c", "d"),
name = c("Mike", "Anna", "John", "Edward"),
score = c(10, 20, 30, 20)
)
tab2 <- data.frame(
id = c("a", "b", "c", "d"),
name = c("Mike", "Anna", "John", "Edward"),
color = c("red", "blue", "blue", "orange")
)
dplyr::left_join(tab1, tab2)
#> Joining, by = c("id", "name")
#> id name score color
#> 1 a Mike 10 red
#> 2 b Anna 20 blue
#> 3 c John 30 blue
#> 4 d Edward 20 orange

f1 <- function() left_join(tab1, tab2)
f2 <- function() {
left_join(tab1, tab2, by = "id", suffix = c("", "_drop")) %>%
select(-ends_with("_drop"))
}
f3 <- function() merge(tab1, tab2, by = c("id", "name"), all.x = TRUE)
f4 <- function() {
x <- merge(tab1, tab2, by = "id", all.x = TRUE, suffix = c("", "_drop"))
x[, names(x)[!grepl("_drop$", names(x))]]
}

microbenchmark::microbenchmark(f1(), f2(), f3(), f4())

#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> f1() 1574.201 1755.9010 2141.5060 1971.501 2303.651 6831.900 100 b
#> f2() 2861.602 3040.8505 3841.6990 3264.551 3920.701 17639.802 100 c
#> f3() 471.801 527.6515 657.6511 588.501 702.851 2607.201 100 a
#> f4() 395.800 442.3005 583.3340 491.801 590.751 3824.800 100 a

Dynamic variables names in dplyr function across multiple columns

We could use .names in across to rename

mean_fun_multicols <- function(data, group_cols, summary_cols) {
data %>%
group_by(across({{group_cols}})) %>%
summarise(across({{ summary_cols }},
~ mean(., na.rm = TRUE), .names = "mean_{.col}"), .groups = "drop")
}

-testing

mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt))
# A tibble: 8 × 4
cyl gear mean_mpg mean_wt
<dbl> <dbl> <dbl> <dbl>
1 4 3 21.5 2.46
2 4 4 26.9 2.38
3 4 5 28.2 1.83
4 6 3 19.8 3.34
5 6 4 19.8 3.09
6 6 5 19.7 2.77
7 8 3 15.0 4.10
8 8 5 15.4 3.37

NOTE: The := is mainly used when there is a single column in tidyverse


If we use the OP's function, we are assigning multiple columns to a single column and this returns a tibble instead of a normal column. We may need to unpack

library(tidyr)
> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>% str
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
grouped_df [8 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
$ cyl : num [1:8] 4 4 4 6 6 6 8 8
$ gear : num [1:8] 3 4 5 3 4 5 3 5
$ mean_c(mpg, wt): tibble [8 × 2] (S3: tbl_df/tbl/data.frame)
..$ mpg: num [1:8] 21.5 26.9 28.2 19.8 19.8 ...
..$ wt : num [1:8] 2.46 2.38 1.83 3.34 3.09 ...
- attr(*, "groups")= tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
..$ cyl : num [1:3] 4 6 8
..$ .rows: list<int> [1:3]
.. ..$ : int [1:3] 1 2 3
.. ..$ : int [1:3] 4 5 6
.. ..$ : int [1:2] 7 8
.. ..@ ptype: int(0)
..- attr(*, ".drop")= logi TRUE

> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>%
unpack(where(is_tibble))
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
# A tibble: 8 × 4
# Groups: cyl [3]
cyl gear mpg wt
<dbl> <dbl> <dbl> <dbl>
1 4 3 21.5 2.46
2 4 4 26.9 2.38
3 4 5 28.2 1.83
4 6 3 19.8 3.34
5 6 4 19.8 3.09
6 6 5 19.7 2.77
7 8 3 15.0 4.10
8 8 5 15.4 3.37

Performing a left join on multiple columns, one of which is a partial string

How about this?

library(dplyr)

df1 %>%
mutate(first_name_1st2char = substr(first_name, 1, 2)) %>%
left_join(df2 %>% mutate(first_name_1st2char = substr(first_name, 1, 2)),
by = c("first_name_1st2char", "last_name", "year")) %>%
select(-first_name_1st2char)

Output is:

  first_name.x last_name year first_name.y age
1 john asdf 2018 joe 12
2 jack qwerty 2017 jake 34

Sample data:

df1 <- structure(list(first_name = structure(c(2L, 1L), .Label = c("jack", 
"john"), class = "factor"), last_name = structure(1:2, .Label = c("asdf",
"qwerty"), class = "factor"), year = c(2018, 2017)), .Names = c("first_name",
"last_name", "year"), row.names = c(NA, -2L), class = "data.frame")

df2 <- structure(list(first_name = structure(c(3L, 2L, 1L), .Label = c("donald",
"jake", "joe"), class = "factor"), last_name = structure(c(1L,
3L, 2L), .Label = c("asdf", "jong", "qwerty"), class = "factor"),
year = c(2018, 2017, 2018), age = c(12, 34, 5)), .Names = c("first_name",
"last_name", "year", "age"), row.names = c(NA, -3L), class = "data.frame")


Related Topics



Leave a reply



Submit