Can dplyr join on multiple columns or composite key?
Updating to use tibble()
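You can pass a named character vector of length greater than 1 to the by argument of left_join(), where each name is a column in the left table and each value is the matching column in the right table: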
library(dplyr)
# Two small tibbles whose key columns carry different names (x/y vs x2/y2).
# d2 lists the same keys in reverse order.
d1 <- tibble(
  x = head(letters, 3),
  y = head(LETTERS, 3),
  a = rnorm(3)
)
d2 <- tibble(
  x2 = rev(head(letters, 3)),
  y2 = rev(head(LETTERS, 3)),
  b = rnorm(3)
)

# Composite key: each name is a column of d1, each value its partner in d2.
left_join(x = d1, y = d2, by = c(x = "x2", y = "y2"))
How to join two dataframes with dplyr based on two columns with different names in each dataframe?
# Left join on two key pairs: name1 <-> name3 and name2 <-> name4.
df3 <- dplyr::left_join(
  x = df1,
  y = df2,
  by = c(name1 = "name3", name2 = "name4")
)
Compare multiple pairs of x/y columns after left join and if different use y in R
It may be easier with coalesce (if there are not many conditions; otherwise case_when can be used). In addition, assuming that there is always a .y column for each corresponding .x column, loop across the .x columns, replace the suffix .x in the column name (cur_column()) with .y, get the value of that column, apply case_when, update the column name with .names, and remove the unused columns using .keep.
library(dplyr)
library(stringr)
# Join the update table onto df1, then collapse every `.x`/`.y` column pair
# into one column that takes the `.y` value when it is present and differs.
left_join(df1, df1_update, by = "id") %>%
  mutate(
    across(
      ends_with(".x"),
      ~ {
        xdat <- as.character(.x)
        # Partner column: the same name with the trailing ".x" swapped for
        # ".y". The dot is literal and the pattern is anchored, so only the
        # join suffix is replaced, never an earlier ".x" inside the name.
        ydat <- as.character(get(str_replace(cur_column(), "[.]x$", ".y")))
        # NOTE(review): when x is NA and y is not, both comparisons evaluate
        # to NA and the result stays NA -- confirm this is intended.
        case_when(
          is.na(ydat) ~ xdat,
          xdat == ydat ~ xdat,
          xdat != ydat ~ ydat
        )
      },
      # Strip only the literal trailing ".x". An unescaped '.x' is a regex
      # meaning "any character followed by x" and would mangle a name such
      # as "tax.x" into "t.x".
      .names = "{str_remove(.col, '[.]x$')}"
    ),
    .keep = "unused"
  ) %>%
  # Values were compared as character; restore sensible column types.
  type.convert(as.is = TRUE)
-output
id new_info date type
1 1 note 2020-01-25 type_A
2 2 nil 2020-12-03 type_B
3 3 note 2019-08-20 type_B
4 4 <NA> 2021-01-01 type_C
5 5 <NA> 2021-02-01 type_B
Joining multiple data frames based on columns values using dplyr
library(tidyverse)
# Stack the three weekly tables, tagging each row with its source position
# in the `week` column, then spread the measures into one column per week.
bind_rows(df1, df2, df3, .id = "week") %>%
  rowid_to_column() %>% # Added for nonunique combos of Camp/Sunday_endwk
  pivot_wider(
    # `id_cols` must be named: in current tidyr the argument after `data`
    # is `...`, so a positional selection is no longer taken as id_cols.
    id_cols = c(Campaign_Name, Sunday_endwk, rowid),
    names_from = week,
    values_from = Actual_Sales:Adj_Rsquared
  )
Result:
# A tibble: 5 x 14
Campaign_Name Sunday_endwk Actual_Sales_1 Actual_Sales_2 Actual_Sales_3 Predictions_1 Predictions_2 Predictions_3 Version_1 Version_2 Version_3 Adj_Rsquared_1 Adj_Rsquared_2 Adj_Rsquared_3
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 Z019 20190106 12 12.2 2 11.9 11.8 2.03 layer_1 layer_2 layer_3 0.85 0.88 0.82
2 Z005 20190113 2 10.2 10 2.03 10.1 10.5 layer_1 layer_2 layer_3 0.85 0.88 0.82
3 Z019 20190113 5 2.2 12 5.1 2.05 12.3 layer_1 layer_2 layer_3 0.85 0.88 0.82
4 Z005 20190106 10 5.2 12 10.5 5.4 11.9 layer_1 layer_2 layer_3 0.85 0.88 0.82
5 Z019 20190120 NA NA 5 NA NA 5.1 NA NA layer_3 NA NA 0.82
Joining tables using variable columns - dplyr, r, join
An option is to filter
with if_any
and then bind the subset rows with the df_att
library(dplyr)
# Build df_att2: the rows of df_alias that mention any df_att$equip value,
# placed side by side with df_att.
df_att2 <- df_alias %>%
# Keep rows where at least one column holds a value found in df_att$equip.
filter(if_any(everything(), ~ .x %in% df_att$equip)) %>%
# Sort key: per-column match() positions of df_att$equip, flattened, NAs dropped.
# NOTE(review): presumably meant to line the rows up with df_att's order -- confirm.
arrange(na.omit(unlist(across(everything(), ~ match(df_att$equip, .x))))) %>%
# Column-bind with df_att first; bind_cols pairs rows by position.
bind_cols(df_att, .)
-checking with OP's expected (changed the object name 'df_att' to 'out' to avoid any confusion)
> all.equal(df_att2, out)
[1] TRUE
How to join tables with multiple primary keys in R (efficiently)?
Concerning your second question. You can use suffix
to add a suffix to the duplicated columns so that they can be easily dropped after the join.
To answer your first question we can benchmark the different options. As you are concerned with efficiency I also compare dplyr::left_join
to a base R approach using merge
which gives us four options:
1. left_join by all key columns
2. left_join by only one key + getting rid of duplicates
3. merge by all key columns
4. merge by only one key + getting rid of duplicates
According to my benchmark on your example data, left_join by all keys is faster than joining by one key and getting rid of the duplicates afterwards. However, if you are concerned with efficiency, then I would recommend having a look at merge:
library(dplyr)
# Example tables for the benchmark. Both carry the key columns `id` and
# `name`; tab1 adds `score`, tab2 adds `color`.
tab1 <- data.frame(
  id = letters[1:4],
  name = c("Mike", "Anna", "John", "Edward"),
  score = c(10, 20, 30, 20)
)
# tab2 reuses tab1's key columns so the two tables match row for row.
tab2 <- data.frame(
  tab1[c("id", "name")],
  color = c("red", "blue", "blue", "orange")
)
# No `by` is given, so dplyr joins on every column the two tables share
# (here `id` and `name`) and reports the chosen keys in a message.
dplyr::left_join(tab1, tab2)
#> Joining, by = c("id", "name")
#> id name score color
#> 1 a Mike 10 red
#> 2 b Anna 20 blue
#> 3 c John 30 blue
#> 4 d Edward 20 orange
# Option 1: dplyr join with no `by`, i.e. on every column the tables share.
f1 <- function() {
  left_join(x = tab1, y = tab2)
}
# Option 2: dplyr join on `id` only. Columns duplicated from tab2 receive
# the "_drop" suffix and are removed afterwards.
f2 <- function() {
  joined <- left_join(tab1, tab2, by = "id", suffix = c("", "_drop"))
  select(joined, -ends_with("_drop"))
}
# Option 3: base R left merge on both key columns.
f3 <- function() {
  key_cols <- c("id", "name")
  merge(x = tab1, y = tab2, by = key_cols, all.x = TRUE)
}
# Option 4: base R left merge on `id` only. Columns duplicated from tab2
# receive the "_drop" suffix and are removed afterwards.
# Returns a data.frame, even when only a single column survives.
f4 <- function() {
  # Spell out `suffixes`; the shorter `suffix` only works through partial
  # argument matching.
  merged <- merge(tab1, tab2, by = "id", all.x = TRUE, suffixes = c("", "_drop"))
  keep <- !grepl("_drop$", names(merged))
  # drop = FALSE stops a one-column result from collapsing to a bare vector.
  merged[, keep, drop = FALSE]
}
# Time the four join strategies against each other; each expression is
# evaluated repeatedly and summary statistics are printed.
microbenchmark::microbenchmark(f1(), f2(), f3(), f4())
#> Unit: microseconds
#> expr min lq mean median uq max neval cld
#> f1() 1574.201 1755.9010 2141.5060 1971.501 2303.651 6831.900 100 b
#> f2() 2861.602 3040.8505 3841.6990 3264.551 3920.701 17639.802 100 c
#> f3() 471.801 527.6515 657.6511 588.501 702.851 2607.201 100 a
#> f4() 395.800 442.3005 583.3340 491.801 590.751 3824.800 100 a
Dynamic variables names in dplyr function across multiple columns
We could use the .names argument of across() to rename the summarised columns:
# Compute group-wise means for several columns at once.
#
# data:         a data frame.
# group_cols:   unquoted tidy-select expression naming the grouping columns,
#               e.g. c(cyl, gear).
# summary_cols: unquoted tidy-select expression naming the columns to
#               average, e.g. c(mpg, wt).
#
# Returns an ungrouped summary with one row per group and one "mean_<col>"
# column per summarised column; NA values are ignored when averaging.
mean_fun_multicols <- function(data, group_cols, summary_cols) {
  grouped <- group_by(data, across({{ group_cols }}))
  summarise(
    grouped,
    across(
      {{ summary_cols }},
      function(values) mean(values, na.rm = TRUE),
      .names = "mean_{.col}"
    ),
    .groups = "drop"
  )
}
-testing
# Group mtcars by cyl and gear, then average mpg and wt within each group.
mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt))
# A tibble: 8 × 4
cyl gear mean_mpg mean_wt
<dbl> <dbl> <dbl> <dbl>
1 4 3 21.5 2.46
2 4 4 26.9 2.38
3 4 5 28.2 1.83
4 6 3 19.8 3.34
5 6 4 19.8 3.09
6 6 5 19.7 2.77
7 8 3 15.0 4.10
8 8 5 15.4 3.37
NOTE: In the tidyverse, := is mainly used when assigning to a single column. If we use the OP's function, we are assigning multiple columns to a single column name, and this returns a tibble column instead of a normal column. We may then need to unpack it:
library(tidyr)
> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>% str
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
grouped_df [8 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
$ cyl : num [1:8] 4 4 4 6 6 6 8 8
$ gear : num [1:8] 3 4 5 3 4 5 3 5
$ mean_c(mpg, wt): tibble [8 × 2] (S3: tbl_df/tbl/data.frame)
..$ mpg: num [1:8] 21.5 26.9 28.2 19.8 19.8 ...
..$ wt : num [1:8] 2.46 2.38 1.83 3.34 3.09 ...
- attr(*, "groups")= tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
..$ cyl : num [1:3] 4 6 8
..$ .rows: list<int> [1:3]
.. ..$ : int [1:3] 1 2 3
.. ..$ : int [1:3] 4 5 6
.. ..$ : int [1:2] 7 8
.. ..@ ptype: int(0)
..- attr(*, ".drop")= logi TRUE
> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>%
unpack(where(is_tibble))
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
# A tibble: 8 × 4
# Groups: cyl [3]
cyl gear mpg wt
<dbl> <dbl> <dbl> <dbl>
1 4 3 21.5 2.46
2 4 4 26.9 2.38
3 4 5 28.2 1.83
4 6 3 19.8 3.34
5 6 4 19.8 3.09
6 6 5 19.7 2.77
7 8 3 15.0 4.10
8 8 5 15.4 3.37
Performing a left join on multiple columns, one of which is a partial string
How about this?
library(dplyr)
# Left join df1 to df2 on last name, year, and the first two characters of
# the first name. The helper prefix column exists only for the join and is
# dropped from the result.
df1 %>%
  mutate(first_name_1st2char = substr(first_name, start = 1, stop = 2)) %>%
  left_join(
    mutate(df2, first_name_1st2char = substr(first_name, start = 1, stop = 2)),
    by = c("first_name_1st2char", "last_name", "year")
  ) %>%
  select(-first_name_1st2char)
Output is:
first_name.x last_name year first_name.y age
1 john asdf 2018 joe 12
2 jack qwerty 2017 jake 34
Sample data:
# Sample inputs. The name columns are factors with explicitly ordered
# levels; year and age are doubles.
df1 <- data.frame(
  first_name = factor(c("john", "jack"), levels = c("jack", "john")),
  last_name = factor(c("asdf", "qwerty"), levels = c("asdf", "qwerty")),
  year = c(2018, 2017)
)
df2 <- data.frame(
  first_name = factor(
    c("joe", "jake", "donald"),
    levels = c("donald", "jake", "joe")
  ),
  last_name = factor(
    c("asdf", "qwerty", "jong"),
    levels = c("asdf", "jong", "qwerty")
  ),
  year = c(2018, 2017, 2018),
  age = c(12, 34, 5)
)
Related Topics
How to Get a Barplot with Several Variables Side by Side Grouped by a Factor
Align Multiple Plots in Ggplot2 When Some Have Legends and Others Don'T
Join Two Data Frames in R Based on Closest Timestamp
How to Align the Bars of a Histogram with the X Axis
How to Complete Missing Factor Levels in Data Frame
Using Two Scale Colour Gradients on One Ggplot
Spreading a Two Column Data Frame with Tidyr
Purrr Map Equivalent of Nested for Loop
R: How to Split a Data Frame into Training, Validation, and Test Sets
Using R Statistics Add a Group Sum to Each Row
Ggplot2 Multiple Scales/Legends Per Aesthetic, Revisited
Rstudio Shiny List from Checking Rows in Datatables
How to Color Sliderbar (Sliderinput)
Replace Empty Values with Value from Other Column in a Dataframe