Dplyr: Mutate_At + Coalesce: Dynamic Names of Columns

dplyr: mutate_at + coalesce: dynamic names of columns

We can split the dataset into a list of data.frames after removing the substring "_extra" from the column names, then loop through the list with map, coalesce the columns of each element, and finally bind the result with the "_extra" columns of the original dataset

library(tidyverse)
# Coalesce each base column with its "_extra" partner, then re-attach the
# original "_extra" columns.
data_example %>%
# group the columns by their name with "_extra" removed -> list of data frames
split.default(str_remove(names(.), "_extra")) %>%
# splice the columns of each group into coalesce() and bind the results
map_df(~ coalesce(!!! .x)) %>%
#or use
# map_df(reduce, coalesce) %>%
# append the columns of the original data whose names end with "extra"
bind_cols(., select(data_example, ends_with("extra")))
# A tibble: 3 x 5
# aa bb cc aa_extra bb_extra
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 6 2 1
#2 2 2 7 2 2
#3 NA 2 8 NA 3

Dynamic variables names in dplyr function across multiple columns

We could use .names in across to rename

#' Group-wise means for several columns at once
#'
#' @param data A data frame.
#' @param group_cols Tidy-select expression naming the grouping columns,
#'   e.g. `c(cyl, gear)`.
#' @param summary_cols Tidy-select expression naming the columns to average,
#'   e.g. `c(mpg, wt)`.
#' @return An ungrouped data frame with the grouping columns and one
#'   `mean_<column>` column per summarised column (missing values are
#'   ignored when averaging).
mean_fun_multicols <- function(data, group_cols, summary_cols) {
  na_free_mean <- function(x) mean(x, na.rm = TRUE)
  grouped <- group_by(data, across({{ group_cols }}))
  summarise(
    grouped,
    # .names prefixes every output column with "mean_"
    across({{ summary_cols }}, na_free_mean, .names = "mean_{.col}"),
    .groups = "drop"
  )
}

-testing

mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt))
# A tibble: 8 × 4
cyl gear mean_mpg mean_wt
<dbl> <dbl> <dbl> <dbl>
1 4 3 21.5 2.46
2 4 4 26.9 2.38
3 4 5 28.2 1.83
4 6 3 19.8 3.34
5 6 4 19.8 3.09
6 6 5 19.7 2.77
7 8 3 15.0 4.10
8 8 5 15.4 3.37

NOTE: The := is mainly used when there is a single column in tidyverse


If we use the OP's function, we are assigning multiple columns to a single column and this returns a tibble instead of a normal column. We may need to unpack

library(tidyr)
> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>% str
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
grouped_df [8 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
$ cyl : num [1:8] 4 4 4 6 6 6 8 8
$ gear : num [1:8] 3 4 5 3 4 5 3 5
$ mean_c(mpg, wt): tibble [8 × 2] (S3: tbl_df/tbl/data.frame)
..$ mpg: num [1:8] 21.5 26.9 28.2 19.8 19.8 ...
..$ wt : num [1:8] 2.46 2.38 1.83 3.34 3.09 ...
- attr(*, "groups")= tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
..$ cyl : num [1:3] 4 6 8
..$ .rows: list<int> [1:3]
.. ..$ : int [1:3] 1 2 3
.. ..$ : int [1:3] 4 5 6
.. ..$ : int [1:2] 7 8
.. ..@ ptype: int(0)
..- attr(*, ".drop")= logi TRUE

> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>%
unpack(where(is_tibble))
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
# A tibble: 8 × 4
# Groups: cyl [3]
cyl gear mpg wt
<dbl> <dbl> <dbl> <dbl>
1 4 3 21.5 2.46
2 4 4 26.9 2.38
3 4 5 28.2 1.83
4 6 3 19.8 3.34
5 6 4 19.8 3.09
6 6 5 19.7 2.77
7 8 3 15.0 4.10
8 8 5 15.4 3.37

creating and accessing dynamic column names within dplyr functions

Use across with the .names argument. If names such as foo_cnt (with an underscore separator) are acceptable, just omit the .names argument, since that is the default.

library(dplyr)
library(tibble)

#' Add running statistics for one column
#'
#' Keeps `date` and `col`, drops rows where either is missing, and appends
#' four columns named `<col>.cnt`, `<col>.sum`, `<col>.max` and `<col>.mu`.
#'
#' @param data A data frame containing a `date` column.
#' @param col Unquoted name of the column to summarise.
#' @return A data frame with `date`, `col` and the four running statistics.
do.some.stuff.2 <- function(data, col) {
  running_count <- function(x) cumsum(!is.na(x))
  # running maximum of the cumulative sum, not of the raw values
  running_peak <- function(x) cummax(cumsum(x))
  running_mean <- function(x) cumsum(x) / running_count(x)

  # the list names become the `{.fn}` part of the new column names
  stats <- list(
    cnt = running_count,
    sum = cumsum,
    max = running_peak,
    mu = running_mean
  )

  kept <- select(data, date, {{ col }})
  complete <- filter(kept, !is.na(date), !is.na({{ col }}))
  mutate(complete, across({{ col }}, stats, .names = "{.col}.{.fn}"))
}
# test
do.some.stuff.2(example, foo)

giving:

# A tibble: 6 x 6
date foo foo.cnt foo.sum foo.max foo.mu
<date> <dbl> <int> <dbl> <dbl> <dbl>
1 2021-02-11 -0.000202 1 -0.000202 -0.000202 -0.000202
2 2021-02-12 0.363 2 0.363 0.363 0.181
3 2021-02-13 1.27 3 1.63 1.63 0.543
4 2021-02-14 1.50 4 3.13 3.13 0.781
5 2021-02-15 1.00 5 4.13 4.13 0.826
6 2021-02-16 -0.458 6 3.67 4.13 0.612

dplyr mutate_at but using any instead of all

mutate_at has been superseded; across() is now preferred, and it works with the any_of() and all_of() helper functions (and the other select helpers).

df |> mutate(across(any_of(col_names), as.numeric))

See the ?across help page for more options and examples.

Pass a string as variable name in dplyr::coalesce

An option would be to convert the strings to symbols (syms from rlang) and then splice them into the call (!!!)

library(dplyr)
tb %>%
mutate(combined = coalesce(!!! rlang::syms(uCols)))
# A tibble: 5 x 4
# a b c combined
# <chr> <chr> <chr> <chr>
#1 a <NA> c a
#2 <NA> b c b
#3 a <NA> c a
#4 <NA> <NA> c c
#5 a <NA> <NA> a

Or another option is do.call

tb %>%
  # all_of() marks uCols as an external character vector of column names;
  # passing a bare external vector to select() is ambiguous (it could also be
  # read as a column called `uCols`) and is deprecated in tidyselect.
  mutate(combined = select(., all_of(uCols)) %>%
    do.call(coalesce, .))

Coalesce pairs of variables within a dataframe based on a regular expression

You could use transmute, e.g.

library(dplyr)

df <- data.frame(
A_1 = c(NA, NA, 3, 4, 5),
A_2 = c(1, 2, NA, NA, NA),
B_1 = c(NA, NA, 13, 14, 15),
B_2 = c(11, 12, NA, NA, NA)
)

df %>%
transmute(A = coalesce(A_1, A_2),
B = coalesce(B_1, B_2))
#> A B
#> 1 1 11
#> 2 2 12
#> 3 3 13
#> 4 4 14
#> 5 5 15

Created on 2021-12-22 by the reprex package (v2.0.1)

Another option, if you have lots of "A_*" and "B_*" columns (source: Romain François, user: @Romain Francois):

library(dplyr)

df <- data.frame(
A_1 = c(NA, NA, 3, 4, 5),
A_2 = c(1, 2, NA, NA, NA),
B_1 = c(NA, NA, 13, 14, 15),
B_2 = c(11, 12, NA, NA, NA)
)

# Coalesce the columns selected by across(...): the selected columns are
# spliced into coalesce() as separate arguments.
# Must be called inside a dplyr verb such as transmute() or mutate().
coacross <- function(...) {
  picked <- across(...)
  coalesce(!!!picked)
}

df %>%
transmute(A = coacross(starts_with("A_")),
B = coacross(starts_with("B_")))
#> A B
#> 1 1 11
#> 2 2 12
#> 3 3 13
#> 4 4 14
#> 5 5 15

Created on 2021-12-22 by the reprex package (v2.0.1)

Edit

Based on your updated question, you don't have lots of "A_*" or "B_*" columns, but instead lots of "*_1", "*_2", and "*_3" columns. I think this is the most straightforward solution for your use-case:

library(dplyr)
library(purrr)   # provides map_dfc(), used in the pipelines below
library(stringr) # provides str_extract(), used in the pipelines below

df <- data.frame(Al_TAC4.25.275 = c(1, 1, 1, NA, NA, NA),
Al_TAC4.25.276 = c(NA, NA, 2, 2, 2, NA),
Al_TAC4.25.277 = c(NA, NA, 3, NA, NA, 3),
Au_TAC4.25.275 = c(1, 1, 1, NA, NA, NA),
Au_TAC4.25.276 = c(NA, NA, 2, 2, 2, NA),
Au_TAC4.25.277 = c(NA, NA, 3, NA, NA, NA),
Ar_TAC4.25.275 = c(1, 1, 1, NA, NA, 1),
Ar_TAC4.25.276 = c(NA, NA, 2, 2, 2, 2),
Ar_TAC4.25.277 = c(NA, NA, 3, NA, NA, 3))

df
#> Al_TAC4.25.275 Al_TAC4.25.276 Al_TAC4.25.277 Au_TAC4.25.275 Au_TAC4.25.276
#> 1 1 NA NA 1 NA
#> 2 1 NA NA 1 NA
#> 3 1 2 3 1 2
#> 4 NA 2 NA NA 2
#> 5 NA 2 NA NA 2
#> 6 NA NA 3 NA NA
#> Au_TAC4.25.277 Ar_TAC4.25.275 Ar_TAC4.25.276 Ar_TAC4.25.277
#> 1 NA 1 NA NA
#> 2 NA 1 NA NA
#> 3 3 1 2 3
#> 4 NA NA 2 NA
#> 5 NA NA 2 NA
#> 6 NA 1 2 3

names(df) %>%
split(str_extract(., '[:alpha:]+')) %>%
map_dfc(~ coalesce(!!!df[.x][c(1,2,3)]))
#> # A tibble: 6 × 3
#> Al Ar Au
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 1 1
#> 3 1 1 1
#> 4 2 2 2
#> 5 2 2 2
#> 6 3 1 NA

# change the order of the list to change the 'priority'
names(df) %>%
split(str_extract(., '[:alpha:]+')) %>%
map_dfc(~ coalesce(!!!df[.x][c(3,2,1)]))
#> # A tibble: 6 × 3
#> Al Ar Au
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 1 1
#> 3 3 3 3
#> 4 2 2 2
#> 5 2 2 2
#> 6 3 3 NA

names(df) %>%
split(str_extract(., '[:alpha:]+')) %>%
map_dfc(~ coalesce(!!!df[.x][c(2,1,3)]))
#> # A tibble: 6 × 3
#> Al Ar Au
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 1 1
#> 3 2 2 2
#> 4 2 2 2
#> 5 2 2 2
#> 6 3 2 NA

Created on 2021-12-22 by the reprex package (v2.0.1)

Keeping original variables with dplyr::mutate_at & varying length of dynamic variables

EDIT

Since the strings are stored in a variable, we cannot hardcode them.

varlist <- c('helloo', 'ooooHH')

we can modify the function to rename varlist object instead.

dataframe %>%
  # across() with .names builds "<column>_cat" for every column in varlist,
  # whether varlist holds one name or several, so no rename step is needed
  # (and the deprecated funs() helper is avoided).
  # all_of() marks varlist as an external character vector of column names.
  mutate(across(all_of(varlist), ~ ntile(.x, 2), .names = "{.col}_cat"))

Original Answer

A hackish way from the same link would be using rename_at to replace only when we find an exact match for "cat"

library(dplyr)
dataframe %>%
mutate_at(
vars('ooooHH'),
.funs = funs(cat = ntile(., 2))
) %>%
rename_at(vars(grep("^cat$", names(.))), funs(sub("cat", "ooooHH_cat", .)))

# helloo ooooHH ahaaa ooooHH_cat
# <dbl> <dbl> <dbl> <int>
#1 1.00 1.00 200 1
#2 2.00 1.00 400 1
#3 3.00 1.00 120 1
#4 4.00 2.00 300 2
#5 5.00 2.00 100 2
#6 6.00 2.00 100 2

The rename step has no effect when more than one column is selected, because in that case mutate_at has already prefixed the new columns with the original column names.

dataframe %>%
mutate_at(
vars(contains("oo")),
.funs = funs(cat = ntile(., 2))
) %>%
rename_at(vars(grep("^cat$", names(.))), funs(sub("cat", "ooh_cat", .)))

# helloo ooooHH ahaaa helloo_cat ooooHH_cat
# <dbl> <dbl> <dbl> <int> <int>
#1 1.00 1.00 200 1 1
#2 2.00 1.00 400 1 1
#3 3.00 1.00 120 1 1
#4 4.00 2.00 300 2 2
#5 5.00 2.00 100 2 2
#6 6.00 2.00 100 2 2

How to coalesce chunks of columns based on a prefix pattern

Just extending the previous answer to remove the hard-coded column names and wrap a solution in a function:

#' Coalesce groups of "<prefix><sep><key>" columns into one column per key
#'
#' Columns whose names match `sep` are grouped by the second piece of their
#' name (the piece that follows the first `sep`). Each group is collapsed with
#' `dplyr::coalesce()`, in the order the columns appear in `start_df_`, into a
#' new column named after the key.
#'
#' @param start_df_ A data frame.
#' @param sep Separator between the prefix and the key in column names. It is
#'   passed to `grep()` and `strsplit()` and is therefore interpreted as a
#'   regular expression.
#' @return A data frame holding the coalesced key columns followed by the
#'   columns whose names do not match `sep`.
transform_df <- function(start_df_, sep = "__") {
  # identify columns of interest
  all_names <- colnames(start_df_)
  spread_names <- grep(sep, all_names, value = TRUE)
  other_names <- setdiff(all_names, spread_names)

  # grab the `key` part from each matching name, failing early with a clear
  # message when a name has nothing after the separator
  name_parts <- strsplit(spread_names, sep)
  has_key <- lengths(name_parts) >= 2L
  if (!all(has_key)) {
    stop(
      "Column name(s) without a key after the separator: ",
      paste(spread_names[!has_key], collapse = ", "),
      call. = FALSE
    )
  }
  # vapply() keeps this a character vector even when no column matches
  name_keys <- vapply(name_parts, `[[`, character(1), 2L)
  spread_keys <- unique(name_keys)

  # apply the coalesce operation; columns are matched by their exact key
  # rather than by substring, so a key that happens to occur inside another
  # column name cannot pull that column in
  for (key in spread_keys) {
    key_cols <- spread_names[name_keys == key]
    start_df_ <- dplyr::mutate(
      start_df_,
      !!key := dplyr::coalesce(!!!start_df_[key_cols])
    )
  }

  # format and return
  dplyr::select(start_df_, dplyr::all_of(c(spread_keys, other_names)))
}

and then you get the output

set.seed(2022)
my_df <- generate_data(n_of_chunks = 4, n_variants_in_each_chunk = 4, nrows = 10)
my_df %>% transform_df()
# A tibble: 10 x 6
`098Wb7lVaq` oe7XBd42Mk TdFPA4qYnl qZQ8AvXzTl col_foo col_blah
<dbl> <dbl> <dbl> <dbl> <chr> <dbl>
1 0.0846 0.746 0.381 0.182 a -0.583
2 0.509 0.492 0.962 0.702 b -0.198
3 0.155 0.912 0.917 0.0141 c -0.603
4 0.998 0.603 0.121 0.943 d -1.09
5 0.300 0.853 0.317 0.740 e 0.184
6 0.0486 0.598 0.330 0.122 f 1.31
7 0.0583 0.757 0.650 0.465 g -0.168
8 0.819 0.461 0.165 0.597 h 0.344
9 0.541 0.280 0.978 0.793 i 0.376
10 0.850 0.147 0.865 0.426 j -0.195

HTH

rename multiple variables with pattern mutate_at

You can use mutate_at

library(dplyr)
data %>% mutate_at(vars(ends_with("cents")), ~./100)

# A tibble: 3 x 4
# col1_cents col1 col2_cents col2
# <dbl> <dbl> <dbl> <dbl>
#1 10 NA 30 NA
#2 NA 20 NA 25.2
#3 20 NA 20.3 NA

If you then want to combine the two columns, we can use split.default to split columns based on similarity of the names, use imap_dfc from purrr along with coalesce to combine them together.

df1 <- data %>%  mutate_at(vars(ends_with("cents")), ~./100)

purrr::imap_dfc(split.default(df1, sub("_.*", "", names(df1))),
~.x %>% mutate(!!.y := coalesce(.x[[2]], .x[[1]])) %>% select(.y))

# col1 col2
# <dbl> <dbl>
#1 10 30
#2 20 25.2
#3 20 20.3


Related Topics



Leave a reply



Submit