Dplyr: Mutate_At + Coalesce: Dynamic Names of Columns

dplyr: mutate_at + coalesce: dynamic names of columns

We can split the dataset into a list of data.frames after removing the substring "_extra" from the column names, then loop through the list with map, coalesce the columns of each element, and finally bind the result with the "_extra" columns of the original dataset

library(tidyverse)
# Coalesce each base column with its "_extra" counterpart, then re-attach the
# original "_extra" columns.
data_example %>% 
   # group the columns by their name with the "_extra" substring removed
   split.default(str_remove(names(.), "_extra")) %>%
   # splice each group's columns into coalesce() and combine the per-group
   # results into one data frame
   map_df(~ coalesce(!!! .x)) %>%
   #or use
   # map_df(reduce, coalesce) %>%
   # append the columns of the original data whose names end with "extra"
   bind_cols(., select(data_example, ends_with("extra")))
# A tibble: 3 x 5
#     aa    bb    cc aa_extra bb_extra
#  <dbl> <dbl> <dbl>    <dbl>    <dbl>
#1     1     1     6        2        1
#2     2     2     7        2        2
#3    NA     2     8       NA        3

Dynamic variables names in dplyr function across multiple columns

We could use .names in across to rename

# Group `data` by the columns selected in `group_cols` and compute the
# NA-ignoring mean of every column selected in `summary_cols`. Each result
# column is named "mean_<column>" and the returned data frame is ungrouped.
mean_fun_multicols <- function(data, group_cols, summary_cols) {
  grouped <- group_by(data, across({{ group_cols }}))
  summarise(
    grouped,
    across(
      {{ summary_cols }},
      function(x) mean(x, na.rm = TRUE),
      .names = "mean_{.col}"
    ),
    .groups = "drop"
  )
}

Testing:

mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt))
# A tibble: 8 × 4
    cyl  gear mean_mpg mean_wt
  <dbl> <dbl>    <dbl>   <dbl>
1     4     3     21.5    2.46
2     4     4     26.9    2.38
3     4     5     28.2    1.83
4     6     3     19.8    3.34
5     6     4     19.8    3.09
6     6     5     19.7    2.77
7     8     3     15.0    4.10
8     8     5     15.4    3.37

NOTE: The := is mainly used when there is a single column in tidyverse

If we use the OP's function, we are assigning multiple columns to a single column and this returns a tibble instead of a normal column. We may need to unpack

library(tidyr)
> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>% str
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
grouped_df [8 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
 $ cyl            : num [1:8] 4 4 4 6 6 6 8 8
 $ gear           : num [1:8] 3 4 5 3 4 5 3 5
 $ mean_c(mpg, wt): tibble [8 × 2] (S3: tbl_df/tbl/data.frame)
  ..$ mpg: num [1:8] 21.5 26.9 28.2 19.8 19.8 ...
  ..$ wt : num [1:8] 2.46 2.38 1.83 3.34 3.09 ...
 - attr(*, "groups")= tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
  ..$ cyl  : num [1:3] 4 6 8
  ..$ .rows: list<int> [1:3] 
  .. ..$ : int [1:3] 1 2 3
  .. ..$ : int [1:3] 4 5 6
  .. ..$ : int [1:2] 7 8
  .. ..@ ptype: int(0) 
  ..- attr(*, ".drop")= logi TRUE

> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>% 
        unpack(where(is_tibble))
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
# A tibble: 8 × 4
# Groups:   cyl [3]
    cyl  gear   mpg    wt
  <dbl> <dbl> <dbl> <dbl>
1     4     3  21.5  2.46
2     4     4  26.9  2.38
3     4     5  28.2  1.83
4     6     3  19.8  3.34
5     6     4  19.8  3.09
6     6     5  19.7  2.77
7     8     3  15.0  4.10
8     8     5  15.4  3.37

creating and accessing dynamic column names within dplyr functions

Use across with the .names argument. If names with an underscore, such as foo_cnt, are acceptable, just omit the .names argument, since "{.col}_{.fn}" is the default.

library(dplyr)
library(tibble)

# Keep `date` and `col` for the rows of `data` where neither is NA, and append
# running statistics of `col`, each named "<col>.<stat>":
#   cnt - running count of non-NA values
#   sum - cumulative sum
#   max - running maximum of the cumulative sum
#   mu  - cumulative sum divided by the running count
do.some.stuff.2 <- function(data, col) {
  running_count <- function(v) cumsum(!is.na(v))
  running_stats <- list(
    cnt = running_count,
    sum = cumsum,
    max = function(v) cummax(cumsum(v)),
    mu = function(v) cumsum(v) / running_count(v)
  )
  data %>%
    select(date, {{ col }}) %>%
    filter(!is.na(date), !is.na({{ col }})) %>%
    mutate(across({{ col }}, running_stats, .names = "{.col}.{.fn}"))
}
# test
do.some.stuff.2(example, foo)

giving:

# A tibble: 6 x 6
  date             foo foo.cnt   foo.sum   foo.max    foo.mu
  <date>         <dbl>   <int>     <dbl>     <dbl>     <dbl>
1 2021-02-11 -0.000202       1 -0.000202 -0.000202 -0.000202
2 2021-02-12  0.363          2  0.363     0.363     0.181   
3 2021-02-13  1.27           3  1.63      1.63      0.543   
4 2021-02-14  1.50           4  3.13      3.13      0.781   
5 2021-02-15  1.00           5  4.13      4.13      0.826   
6 2021-02-16 -0.458          6  3.67      4.13      0.612

dplyr mutate_at but using any instead of all

mutate_at() has been superseded; across() is now preferred, and it works with the any_of() and all_of() helper functions (and other select helpers).

df |> mutate(across(any_of(col_names), as.numeric))

See the ?across help page for more options and examples.

Pass a string as variable name in dplyr::coalesce

An option would be to convert the strings to symbols (syms from rlang) and then splice them into the call with !!!

library(dplyr)
tb %>%
   mutate(combined = coalesce(!!! rlang::syms(uCols)))
# A tibble: 5 x 4
#  a     b     c     combined
#  <chr> <chr> <chr> <chr>   
#1 a     <NA>  c     a       
#2 <NA>  b     c     b       
#3 a     <NA>  c     a       
#4 <NA>  <NA>  c     c       
#5 a     <NA>  <NA>  a

Or another option is do.call

# Coalesce the columns named in the character vector `uCols`, in that order.
# all_of() marks `uCols` as an external vector of column names, so it cannot
# be confused with a column that happens to be called "uCols".
tb %>%
   mutate(combined = do.call(coalesce, select(., all_of(uCols))))

Coalesce pairs of variables within a dataframe based on a regular expression

You could use transmute, e.g.

library(dplyr)

df <- data.frame(
  A_1 = c(NA, NA, 3, 4, 5),
  A_2 = c(1, 2, NA, NA, NA),
  B_1 = c(NA, NA, 13, 14, 15),
  B_2 = c(11, 12, NA, NA, NA)
  )

df %>%
  transmute(A = coalesce(A_1, A_2),
            B = coalesce(B_1, B_2))
#>   A  B
#> 1 1 11
#> 2 2 12
#> 3 3 13
#> 4 4 14
#> 5 5 15

^{Created on 2021-12-22 by the reprex package (v2.0.1)}

Another option, if you have lots of "A_*" and "B_*" columns (source: Romain François, user: @Romain Francois):

library(dplyr)

df <- data.frame(
  A_1 = c(NA, NA, 3, 4, 5),
  A_2 = c(1, 2, NA, NA, NA),
  B_1 = c(NA, NA, 13, 14, 15),
  B_2 = c(11, 12, NA, NA, NA)
  )

# Coalesce, left to right, the columns that across(...) selects inside the
# enclosing dplyr verb.
coacross <- function(...) {
  selected_cols <- across(...)
  coalesce(!!!selected_cols)
}

df %>%
  transmute(A = coacross(starts_with("A_")),
            B = coacross(starts_with("B_")))
#>   A  B
#> 1 1 11
#> 2 2 12
#> 3 3 13
#> 4 4 14
#> 5 5 15

^{Created on 2021-12-22 by the reprex package (v2.0.1)}

Edit

Based on your updated question, you don't have lots of "A_*" or "B_*" columns, but instead lots of "*_1", "*_2", and "*_3" columns. I think this is the most straightforward solution for your use-case:

library(dplyr)

df <- data.frame(Al_TAC4.25.275 = c(1, 1, 1, NA, NA, NA),
                 Al_TAC4.25.276 = c(NA, NA, 2, 2, 2, NA),
                 Al_TAC4.25.277 = c(NA, NA, 3, NA, NA, 3),
                 Au_TAC4.25.275 = c(1, 1, 1, NA, NA, NA),
                 Au_TAC4.25.276 = c(NA, NA, 2, 2, 2, NA),
                 Au_TAC4.25.277 = c(NA, NA, 3, NA, NA, NA),
                 Ar_TAC4.25.275 = c(1, 1, 1, NA, NA, 1),
                 Ar_TAC4.25.276 = c(NA, NA, 2, 2, 2, 2),
                 Ar_TAC4.25.277 = c(NA, NA, 3, NA, NA, 3))

df
#>   Al_TAC4.25.275 Al_TAC4.25.276 Al_TAC4.25.277 Au_TAC4.25.275 Au_TAC4.25.276
#> 1              1             NA             NA              1             NA
#> 2              1             NA             NA              1             NA
#> 3              1              2              3              1              2
#> 4             NA              2             NA             NA              2
#> 5             NA              2             NA             NA              2
#> 6             NA             NA              3             NA             NA
#>   Au_TAC4.25.277 Ar_TAC4.25.275 Ar_TAC4.25.276 Ar_TAC4.25.277
#> 1             NA              1             NA             NA
#> 2             NA              1             NA             NA
#> 3              3              1              2              3
#> 4             NA             NA              2             NA
#> 5             NA             NA              2             NA
#> 6             NA              1              2              3

# Group the column names by their leading alphabetic prefix and coalesce each
# group's columns in positional order 1, 2, 3. stringr and purrr are called by
# namespace because this example only attaches dplyr.
names(df) %>% 
  split(stringr::str_extract(., '[:alpha:]+')) %>%
  purrr::map_dfc(~ coalesce(!!!df[.x][c(1,2,3)]))
#> # A tibble: 6 × 3
#>      Al    Ar    Au
#>   <dbl> <dbl> <dbl>
#> 1     1     1     1
#> 2     1     1     1
#> 3     1     1     1
#> 4     2     2     2
#> 5     2     2     2
#> 6     3     1    NA

# change the order of the list to change the 'priority'
# Same grouping by alphabetic prefix, but each group's columns are coalesced in
# positional order 3, 2, 1. stringr and purrr are called by namespace because
# this example only attaches dplyr.
names(df) %>% 
  split(stringr::str_extract(., '[:alpha:]+')) %>%
  purrr::map_dfc(~ coalesce(!!!df[.x][c(3,2,1)]))
#> # A tibble: 6 × 3
#>      Al    Ar    Au
#>   <dbl> <dbl> <dbl>
#> 1     1     1     1
#> 2     1     1     1
#> 3     3     3     3
#> 4     2     2     2
#> 5     2     2     2
#> 6     3     3    NA

# Same grouping by alphabetic prefix, but each group's columns are coalesced in
# positional order 2, 1, 3. stringr and purrr are called by namespace because
# this example only attaches dplyr.
names(df) %>% 
  split(stringr::str_extract(., '[:alpha:]+')) %>%
  purrr::map_dfc(~ coalesce(!!!df[.x][c(2,1,3)]))
#> # A tibble: 6 × 3
#>      Al    Ar    Au
#>   <dbl> <dbl> <dbl>
#> 1     1     1     1
#> 2     1     1     1
#> 3     2     2     2
#> 4     2     2     2
#> 5     2     2     2
#> 6     3     2    NA

^{Created on 2021-12-22 by the reprex package (v2.0.1)}

Keeping original variables with dplyr::mutate_at & varying length of dynamic variables

EDIT

Since the strings are stored in a variable, we cannot hardcode them.

varlist <- c('helloo', 'ooooHH')

We can modify the code so that the replacement name is built from the varlist object instead.

# Add a two-bucket ntile() column for each variable named in `varlist`, then
# rename a column called exactly "cat" to "<variable>_cat".
# NOTE(review): mutate_at()/rename_at()/funs() are older dplyr interfaces;
# across() is the preferred replacement in current dplyr.
dataframe %>% 
  mutate_at(
  vars(varlist), 
   # the function is given the name `cat`, which is used for the new column
   .funs = funs(cat = ntile(., 2))
 ) %>%
# "^cat$" only matches a column named exactly "cat"
# NOTE(review): paste0(varlist, "_cat") has one element per entry of `varlist`,
# while sub() takes a single replacement - confirm `varlist` has length 1 here
rename_at(vars(grep("^cat$", names(.))), 
          funs(sub("cat", paste0(varlist, "_cat"), .)))

Original Answer

A hackish way from the same link would be using rename_at to replace only when we find an exact match for "cat"

library(dplyr)
# Add a two-bucket ntile() column for 'ooooHH', then rename a column called
# exactly "cat" to "ooooHH_cat".
dataframe %>% 
  mutate_at(
    vars('ooooHH'), 
    # the function is given the name `cat`, which is used for the new column
    .funs = funs(cat = ntile(., 2))
   ) %>%
# "^cat$" only matches a column named exactly "cat"
rename_at(vars(grep("^cat$", names(.))), funs(sub("cat", "ooooHH_cat", .))) 

#   helloo ooooHH ahaaa ooooHH_cat
#   <dbl>  <dbl> <dbl>      <int>
#1   1.00   1.00   200          1
#2   2.00   1.00   400          1
#3   3.00   1.00   120          1
#4   4.00   2.00   300          2
#5   5.00   2.00   100          2
#6   6.00   2.00   100          2

The renaming has no effect when more than one column is selected, because the "_cat" suffix has already been applied to each column name.

# Add a two-bucket ntile() column for every column whose name contains "oo",
# then rename a column called exactly "cat" (if there is one) to "ooh_cat".
dataframe %>%
   mutate_at(
   vars(contains("oo")),
    # the function is given the name `cat`, used to name the new columns
    .funs = funs(cat = ntile(., 2))
  ) %>%
 # "^cat$" only matches a column named exactly "cat"
 rename_at(vars(grep("^cat$", names(.))), funs(sub("cat", "ooh_cat", .))) 

#   helloo ooooHH ahaaa helloo_cat ooooHH_cat
#   <dbl>  <dbl> <dbl>      <int>      <int>
#1   1.00   1.00   200          1          1
#2   2.00   1.00   400          1          1
#3   3.00   1.00   120          1          1
#4   4.00   2.00   300          2          2
#5   5.00   2.00   100          2          2
#6   6.00   2.00   100          2          2

How to coalesce chunks of columns based on a prefix pattern

Just extending the previous answer to remove the hard-coded column names and wrap a solution in a function:

# Collapse every chunk of "<prefix><sep><key>" columns into a single column
# named <key> that holds, per row, the first non-missing value of the chunk.
#
# Args:
#   start_df_: data frame whose chunked columns contain `sep` in their names.
#   sep: literal separator between the prefix and the key part of a name.
#
# Returns: a data frame with one coalesced column per key, followed by the
#   columns whose names do not contain `sep`.
transform_df <- function(start_df_, sep = '__'){
  
  # identify columns of interest; `sep` is matched literally so separators
  # such as "." or "|" are not interpreted as regular expressions
  spread_names <- grep(sep, colnames(start_df_), value = TRUE, fixed = TRUE)
  other_names <- setdiff(colnames(start_df_), spread_names)
  
  # grab the `key` part (second piece of each name); vapply() always yields a
  # character vector, even when no column name contains `sep`
  spread_keys <- unique(
    vapply(
      strsplit(spread_names, sep, fixed = TRUE),
      `[[`,
      character(1),
      2
    )
  )

  # coalesce each chunk: every column whose name contains the key, taken in
  # the current column order
  for (key in spread_keys) {
    chunk <- dplyr::select(start_df_, dplyr::contains(key))
    start_df_ <- dplyr::mutate(start_df_, !!key := dplyr::coalesce(!!!chunk))
  }
  
  # format and return: coalesced columns first, then the remaining columns;
  # all_of() marks the names as an external character vector
  dplyr::select(start_df_, dplyr::all_of(c(spread_keys, other_names)))
}

and then you get the output

set.seed(2022)
my_df <- generate_data(n_of_chunks = 4, n_variants_in_each_chunk = 4, nrows = 10)
my_df %>% transform_df()
# A tibble: 10 x 6
   `098Wb7lVaq` oe7XBd42Mk TdFPA4qYnl qZQ8AvXzTl col_foo col_blah
          <dbl>      <dbl>      <dbl>      <dbl> <chr>      <dbl>
 1       0.0846      0.746      0.381     0.182  a         -0.583
 2       0.509       0.492      0.962     0.702  b         -0.198
 3       0.155       0.912      0.917     0.0141 c         -0.603
 4       0.998       0.603      0.121     0.943  d         -1.09 
 5       0.300       0.853      0.317     0.740  e          0.184
 6       0.0486      0.598      0.330     0.122  f          1.31 
 7       0.0583      0.757      0.650     0.465  g         -0.168
 8       0.819       0.461      0.165     0.597  h          0.344
 9       0.541       0.280      0.978     0.793  i          0.376
10       0.850       0.147      0.865     0.426  j         -0.195

HTH

rename multiple variables with pattern mutate_at

You can use mutate_at

library(dplyr)
data %>%  mutate_at(vars(ends_with("cents")), ~./100)

# A tibble: 3 x 4
#   col1_cents  col1 col2_cents  col2
#       <dbl> <dbl>      <dbl> <dbl>
#1         10    NA       30    NA  
#2         NA    20       NA    25.2
#3         20    NA       20.3  NA

If you then want to combine the two columns, we can use split.default to split columns based on similarity of the names, use imap_dfc from purrr along with coalesce to combine them together.

df1 <- data %>%  mutate_at(vars(ends_with("cents")), ~./100)

# Split df1's columns into groups sharing the prefix before the first "_",
# then build one column per group (named after the prefix) that takes the
# group's second column and falls back to its first where that is NA.
# all_of() marks `.y` as an external character vector holding a column name.
purrr::imap_dfc(split.default(df1, sub("_.*", "", names(df1))), 
 ~.x %>% mutate(!!.y := coalesce(.x[[2]], .x[[1]])) %>% select(all_of(.y)))

#  col1  col2
#  <dbl> <dbl>
#1    10  30  
#2    20  25.2
#3    20  20.3

Dplyr: Mutate_At + Coalesce: Dynamic Names of Columns