dplyr: mutate_at + coalesce: dynamic names of columns
We can `split` the dataset into a list of data frames, grouping the columns by their names with the "_extra" suffix removed. We then loop through the list with `map`, `coalesce` the columns in each group, and bind the result with the "_extra" columns from the original dataset.
library(tidyverse)
# Group the columns by base name (the name with "_extra" stripped), coalesce
# each group into a single column, then append the original "_extra" columns.
data_example %>%
  split.default(str_remove(names(.), "_extra")) %>%
  map_df(function(cols) coalesce(!!!cols)) %>%
  # alternatively:
  # map_df(reduce, coalesce) %>%
  bind_cols(select(data_example, ends_with("extra")))
# A tibble: 3 x 5
# aa bb cc aa_extra bb_extra
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 6 2 1
#2 2 2 7 2 2
#3 NA 2 8 NA 3
Dynamic variables names in dplyr function across multiple columns
We can use the `.names` argument of `across()` to rename the summarised columns:
# Compute per-group means of the selected columns.
#
# data:         a data frame (or tibble).
# group_cols:   tidy-select expression for the grouping columns,
#               e.g. c(cyl, gear).
# summary_cols: tidy-select expression for the columns to average.
#
# Returns one row per group; each mean is stored in a column named
# "mean_<column>". NA values are ignored and the result is ungrouped.
mean_fun_multicols <- function(data, group_cols, summary_cols) {
  mean_without_na <- function(values) mean(values, na.rm = TRUE)
  grouped <- group_by(data, across({{ group_cols }}))
  summarise(
    grouped,
    across({{ summary_cols }}, mean_without_na, .names = "mean_{.col}"),
    .groups = "drop"
  )
}
-testing
mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt))
# A tibble: 8 × 4
cyl gear mean_mpg mean_wt
<dbl> <dbl> <dbl> <dbl>
1 4 3 21.5 2.46
2 4 4 26.9 2.38
3 4 5 28.2 1.83
4 6 3 19.8 3.34
5 6 4 19.8 3.09
6 6 5 19.7 2.77
7 8 3 15.0 4.10
8 8 5 15.4 3.37
NOTE: In the tidyverse, `:=` is mainly used to create a single column. If we use the OP's function, we are assigning multiple columns to a single name, and this returns a tibble (a data-frame column) instead of a normal column. We may need to `unpack` it:
library(tidyr)
> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>% str
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
grouped_df [8 × 3] (S3: grouped_df/tbl_df/tbl/data.frame)
$ cyl : num [1:8] 4 4 4 6 6 6 8 8
$ gear : num [1:8] 3 4 5 3 4 5 3 5
$ mean_c(mpg, wt): tibble [8 × 2] (S3: tbl_df/tbl/data.frame)
..$ mpg: num [1:8] 21.5 26.9 28.2 19.8 19.8 ...
..$ wt : num [1:8] 2.46 2.38 1.83 3.34 3.09 ...
- attr(*, "groups")= tibble [3 × 2] (S3: tbl_df/tbl/data.frame)
..$ cyl : num [1:3] 4 6 8
..$ .rows: list<int> [1:3]
.. ..$ : int [1:3] 1 2 3
.. ..$ : int [1:3] 4 5 6
.. ..$ : int [1:2] 7 8
.. ..@ ptype: int(0)
..- attr(*, ".drop")= logi TRUE
> mean_fun_multicols(mtcars, c(cyl, gear), c(mpg, wt)) %>%
unpack(where(is_tibble))
`summarise()` has grouped output by 'cyl'. You can override using the `.groups` argument.
# A tibble: 8 × 4
# Groups: cyl [3]
cyl gear mpg wt
<dbl> <dbl> <dbl> <dbl>
1 4 3 21.5 2.46
2 4 4 26.9 2.38
3 4 5 28.2 1.83
4 6 3 19.8 3.34
5 6 4 19.8 3.09
6 6 5 19.7 2.77
7 8 3 15.0 4.10
8 8 5 15.4 3.37
creating and accessing dynamic column names within dplyr functions
Use `across` with the `.names` argument. If names such as foo_cnt (with an underscore) are acceptable, just omit the `.names` argument, since that is the default.
library(dplyr)
library(tibble)
# Add running statistics for one column of a dated data frame.
#
# data: a data frame with a `date` column.
# col:  the (unquoted) column to summarise.
#
# Keeps only `date` and `col`, drops rows where either is NA, then appends
# "<col>.cnt", "<col>.sum", "<col>.max" and "<col>.mu": the running count of
# non-NA values, the cumulative sum, the running maximum of the cumulative
# sum, and the cumulative sum divided by the running count.
do.some.stuff.2 <- function(data, col) {
  running_count <- function(x) cumsum(!is.na(x))
  # The list names become the {.fn} part of the generated column names.
  running_stats <- list(
    cnt = running_count,
    sum = cumsum,
    max = function(x) cummax(cumsum(x)),
    mu = function(x) cumsum(x) / running_count(x)
  )
  kept <- select(data, date, {{ col }})
  complete <- filter(kept, !is.na(date), !is.na({{ col }}))
  mutate(complete, across({{ col }}, running_stats, .names = "{.col}.{.fn}"))
}
# test
do.some.stuff.2(example, foo)
giving:
# A tibble: 6 x 6
date foo foo.cnt foo.sum foo.max foo.mu
<date> <dbl> <int> <dbl> <dbl> <dbl>
1 2021-02-11 -0.000202 1 -0.000202 -0.000202 -0.000202
2 2021-02-12 0.363 2 0.363 0.363 0.181
3 2021-02-13 1.27 3 1.63 1.63 0.543
4 2021-02-14 1.50 4 3.13 3.13 0.781
5 2021-02-15 1.00 5 4.13 4.13 0.826
6 2021-02-16 -0.458 6 3.67 4.13 0.612
dplyr mutate_at but using any instead of all
`mutate_at()` has been superseded; `across()` is now preferred, and it works with the `any_of()` and `all_of()` helper functions (and other select helpers).
df |> mutate(across(any_of(col_names), as.numeric))
See the ?across
help page for more options and examples.
Pass a string as variable name in dplyr::coalesce
One option is to convert the strings to symbols (`syms` from rlang) and then splice them into the call with `!!!`:
library(dplyr)
tb %>%
mutate(combined = coalesce(!!! rlang::syms(uCols)))
# A tibble: 5 x 4
# a b c combined
# <chr> <chr> <chr> <chr>
#1 a <NA> c a
#2 <NA> b c b
#3 a <NA> c a
#4 <NA> <NA> c c
#5 a <NA> <NA> a
Or another option is do.call
# Coalesce the columns named in the character vector `uCols`.
# all_of() marks `uCols` as an external vector of column names, so it cannot
# be confused with a column that happens to be called "uCols".
tb %>%
  mutate(combined = do.call(coalesce, select(., all_of(uCols))))
Coalesce pairs of variables within a dataframe based on a regular expression
You could use transmute, e.g.
library(dplyr)
df <- data.frame(
A_1 = c(NA, NA, 3, 4, 5),
A_2 = c(1, 2, NA, NA, NA),
B_1 = c(NA, NA, 13, 14, 15),
B_2 = c(11, 12, NA, NA, NA)
)
df %>%
transmute(A = coalesce(A_1, A_2),
B = coalesce(B_1, B_2))
#> A B
#> 1 1 11
#> 2 2 12
#> 3 3 13
#> 4 4 14
#> 5 5 15
Created on 2021-12-22 by the reprex package (v2.0.1)
Another option, if you have lots of "A_*" and "B_*" columns (source: Romain François, user: @Romain Francois):
library(dplyr)
df <- data.frame(
A_1 = c(NA, NA, 3, 4, 5),
A_2 = c(1, 2, NA, NA, NA),
B_1 = c(NA, NA, 13, 14, 15),
B_2 = c(11, 12, NA, NA, NA)
)
# Coalesce, left to right, the columns picked by a tidy-select specification.
# Must be called inside a dplyr verb such as mutate() or transmute(), because
# across() needs the current data frame.
coacross <- function(...) {
  picked <- across(...)
  coalesce(!!!picked)
}
df %>%
transmute(A = coacross(starts_with("A_")),
B = coacross(starts_with("B_")))
#> A B
#> 1 1 11
#> 2 2 12
#> 3 3 13
#> 4 4 14
#> 5 5 15
Created on 2021-12-22 by the reprex package (v2.0.1)
Edit
Based on your updated question, you don't have lots of "A_*" or "B_*" columns, but instead lots of "*_1", "*_2", and "*_3" columns. I think this is the most straightforward solution for your use-case:
library(dplyr)
df <- data.frame(Al_TAC4.25.275 = c(1, 1, 1, NA, NA, NA),
Al_TAC4.25.276 = c(NA, NA, 2, 2, 2, NA),
Al_TAC4.25.277 = c(NA, NA, 3, NA, NA, 3),
Au_TAC4.25.275 = c(1, 1, 1, NA, NA, NA),
Au_TAC4.25.276 = c(NA, NA, 2, 2, 2, NA),
Au_TAC4.25.277 = c(NA, NA, 3, NA, NA, NA),
Ar_TAC4.25.275 = c(1, 1, 1, NA, NA, 1),
Ar_TAC4.25.276 = c(NA, NA, 2, 2, 2, 2),
Ar_TAC4.25.277 = c(NA, NA, 3, NA, NA, 3))
df
#> Al_TAC4.25.275 Al_TAC4.25.276 Al_TAC4.25.277 Au_TAC4.25.275 Au_TAC4.25.276
#> 1 1 NA NA 1 NA
#> 2 1 NA NA 1 NA
#> 3 1 2 3 1 2
#> 4 NA 2 NA NA 2
#> 5 NA 2 NA NA 2
#> 6 NA NA 3 NA NA
#> Au_TAC4.25.277 Ar_TAC4.25.275 Ar_TAC4.25.276 Ar_TAC4.25.277
#> 1 NA 1 NA NA
#> 2 NA 1 NA NA
#> 3 3 1 2 3
#> 4 NA NA 2 NA
#> 5 NA NA 2 NA
#> 6 NA 1 2 3
names(df) %>%
split(str_extract(., '[:alpha:]+')) %>%
map_dfc(~ coalesce(!!!df[.x][c(1,2,3)]))
#> # A tibble: 6 × 3
#> Al Ar Au
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 1 1
#> 3 1 1 1
#> 4 2 2 2
#> 5 2 2 2
#> 6 3 1 NA
# change the order of the list to change the 'priority'
names(df) %>%
split(str_extract(., '[:alpha:]+')) %>%
map_dfc(~ coalesce(!!!df[.x][c(3,2,1)]))
#> # A tibble: 6 × 3
#> Al Ar Au
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 1 1
#> 3 3 3 3
#> 4 2 2 2
#> 5 2 2 2
#> 6 3 3 NA
names(df) %>%
split(str_extract(., '[:alpha:]+')) %>%
map_dfc(~ coalesce(!!!df[.x][c(2,1,3)]))
#> # A tibble: 6 × 3
#> Al Ar Au
#> <dbl> <dbl> <dbl>
#> 1 1 1 1
#> 2 1 1 1
#> 3 2 2 2
#> 4 2 2 2
#> 5 2 2 2
#> 6 3 2 NA
Created on 2021-12-22 by the reprex package (v2.0.1)
Keeping original variables with dplyr::mutate_at & varying length of dynamic variables
EDIT
Since the column names are stored in a variable, we cannot hard-code them.
varlist <- c('helloo', 'ooooHH')
We can modify the code to build the new column names from the `varlist` object instead.
# Add a two-bin ntile() column for every column named in `varlist`.
# `.names` appends the "_cat" suffix directly, so the result is
# "<column>_cat" whether `varlist` holds one name or several, and no separate
# rename step is needed.
dataframe %>%
  mutate(across(all_of(varlist), ~ ntile(.x, 2), .names = "{.col}_cat"))
Original Answer
A hackish way from the same link would be using rename_at
to replace only when we find an exact match for "cat"
library(dplyr)
dataframe %>%
mutate_at(
vars('ooooHH'),
.funs = funs(cat = ntile(., 2))
) %>%
rename_at(vars(grep("^cat$", names(.))), funs(sub("cat", "ooooHH_cat", .)))
# helloo ooooHH ahaaa ooooHH_cat
# <dbl> <dbl> <dbl> <int>
#1 1.00 1.00 200 1
#2 2.00 1.00 400 1
#3 3.00 1.00 120 1
#4 4.00 2.00 300 2
#5 5.00 2.00 100 2
#6 6.00 2.00 100 2
The rename step has no effect when more than one column is selected, because the `_cat` suffix has already been appended to each column name.
dataframe %>%
mutate_at(
vars(contains("oo")),
.funs = funs(cat = ntile(., 2))
) %>%
rename_at(vars(grep("^cat$", names(.))), funs(sub("cat", "ooh_cat", .)))
# helloo ooooHH ahaaa helloo_cat ooooHH_cat
# <dbl> <dbl> <dbl> <int> <int>
#1 1.00 1.00 200 1 1
#2 2.00 1.00 400 1 1
#3 3.00 1.00 120 1 1
#4 4.00 2.00 300 2 2
#5 5.00 2.00 100 2 2
#6 6.00 2.00 100 2 2
How to coalesce chunks of columns based on a prefix pattern
Just extending the previous answer to remove the hard-coded column names and wrap a solution in a function:
# Collapse each family of columns that share a key into a single column.
#
# start_df_: data frame whose chunked columns have `sep` in their names.
# sep:       separator inside the chunked column names; it is passed to
#            grep() and strsplit() and is therefore treated as a regular
#            expression.
#
# The key of a column is the second piece of its name after splitting on
# `sep`. For every distinct key, the columns carrying that key are coalesced
# left to right into a new column named after the key. Returns the key columns
# followed by the columns whose names do not match `sep`.
transform_df <- function(start_df_, sep = '__'){
  # identify columns of interest
  spread_names <- grep(sep, colnames(start_df_), value = TRUE)
  other_names <- setdiff(colnames(start_df_), spread_names)

  # grab the `key` part of each name; vapply() keeps this a character vector
  # even when no column matches, where sapply() would return an empty list
  name_keys <- vapply(
    strsplit(spread_names, sep),
    function(parts) parts[[2]],
    character(1)
  )
  spread_keys <- unique(name_keys)

  # coalesce exactly the columns whose key equals `key`; substring matching
  # could also pick up unrelated columns that merely contain the key
  for (key in spread_keys) {
    key_cols <- spread_names[name_keys == key]
    start_df_ <- dplyr::mutate(
      start_df_,
      !!key := dplyr::coalesce(!!!start_df_[key_cols])
    )
  }

  # format and return
  dplyr::select(start_df_, dplyr::all_of(c(spread_keys, other_names)))
}
and then you get the output
set.seed(2022)
my_df <- generate_data(n_of_chunks = 4, n_variants_in_each_chunk = 4, nrows = 10)
my_df %>% transform_df()
# A tibble: 10 x 6
`098Wb7lVaq` oe7XBd42Mk TdFPA4qYnl qZQ8AvXzTl col_foo col_blah
<dbl> <dbl> <dbl> <dbl> <chr> <dbl>
1 0.0846 0.746 0.381 0.182 a -0.583
2 0.509 0.492 0.962 0.702 b -0.198
3 0.155 0.912 0.917 0.0141 c -0.603
4 0.998 0.603 0.121 0.943 d -1.09
5 0.300 0.853 0.317 0.740 e 0.184
6 0.0486 0.598 0.330 0.122 f 1.31
7 0.0583 0.757 0.650 0.465 g -0.168
8 0.819 0.461 0.165 0.597 h 0.344
9 0.541 0.280 0.978 0.793 i 0.376
10 0.850 0.147 0.865 0.426 j -0.195
HTH
rename multiple variables with pattern mutate_at
You can use mutate_at
library(dplyr)
data %>% mutate_at(vars(ends_with("cents")), ~./100)
# A tibble: 3 x 4
# col1_cents col1 col2_cents col2
# <dbl> <dbl> <dbl> <dbl>
#1 10 NA 30 NA
#2 NA 20 NA 25.2
#3 20 NA 20.3 NA
If you then want to combine the two columns, we can use `split.default` to split the columns by the similarity of their names, then use `imap_dfc` from purrr along with `coalesce` to combine them.
df1 <- data %>% mutate_at(vars(ends_with("cents")), ~./100)
# Split the columns by base name (the text before the first "_"), then build
# one column per group, preferring the group's second column and falling back
# to its first. all_of() marks `.y` as an external column name.
purrr::imap_dfc(
  split.default(df1, sub("_.*", "", names(df1))),
  ~ .x %>%
    mutate(!!.y := coalesce(.x[[2]], .x[[1]])) %>%
    select(all_of(.y))
)
# col1 col2
# <dbl> <dbl>
#1 10 30
#2 20 25.2
#3 20 20.3
Related Topics
Legend Venn Diagram in Venneuler
Print the Sourced R File to an Appendix Using Sweave
Predict.Svm Does Not Predict New Data
Let Each Plot in Facet_Grid Have Its Own Y-Axis Value
How Does Settimelimit Work in R
From [Package] Import [Function] in R
Obtaining Percent Scales Reflective of Individual Facets with Ggplot2
Calculating Prediction Accuracy of a Tree Using Rpart's Predict Method
Outputing N Tables in Shiny, Where N Depends on the Data
Does R-Server or Shiny Server Create a New R Process/Instance for Each User
Ggplot2: Dashed Line in Legend
Arrow() in Ggplot2 No Longer Supported
How to Do Str_Extract with Base R
Build Word Co-Occurence Edge List in R
Error: X Must Be Atomic for 'Sort.List'
Dplyr: Grouping and Summarizing/Mutating Data with Rolling Time Windows