Mutate Multiple Variables to Create Multiple New Variables

Mutate multiple variables to create multiple new variables

Because you are selecting columns by name, you need mutate_at rather than mutate_if: mutate_if chooses columns by testing the values they contain.

# Overwrite every column whose name starts with "y" with (column - z).
tb %>%
  mutate_at(vars(starts_with("y")), funs(. - z))
#> # A tibble: 3 x 5
#> x y1 y2 y3 z
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 2 4 2
#> 2 2 -2 -1 0 3
#> 3 3 5 3 1 1

To create new columns instead of overwriting the existing ones, give the function a name inside funs(); that name is appended to each column name as a suffix.

# Add a suffix: naming the function ("mod") keeps the original columns and
# adds y1_mod, y2_mod and y3_mod.
tb %>%
  mutate_at(vars(starts_with("y")), funs(mod = . - z))
#> # A tibble: 3 x 8
#> x y1 y2 y3 z y1_mod y2_mod y3_mod
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 6 2 0 2 4
#> 2 2 1 2 3 3 -2 -1 0
#> 3 3 6 4 2 1 5 3 1

# Turn the "_mod" suffix into a "mod_" prefix: y1_mod becomes mod_y1, etc.
tb %>%
  mutate_at(vars(starts_with("y")), funs(mod = . - z)) %>%
  rename_at(
    vars(ends_with("_mod")),
    funs(paste("mod", gsub("_mod", "", .), sep = "_"))
  )
#> # A tibble: 3 x 8
#> x y1 y2 y3 z mod_y1 mod_y2 mod_y3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 6 2 0 2 4
#> 2 2 1 2 3 3 -2 -1 0
#> 3 3 6 4 2 1 5 3 1

Edit: As of dplyr 0.8.0, funs() is deprecated (source1 & source2); use list() with formulas instead.

# Overwrite the y* columns in place, using a list of formulas instead of
# funs().
tb %>%
  mutate_at(vars(starts_with("y")), list(~ . - z))
#> # A tibble: 3 x 5
#> x y1 y2 y3 z
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 2 4 2
#> 2 2 -2 -1 0 3
#> 3 3 5 3 1 1

# A named list element ("mod") is appended as a suffix, giving y1_mod,
# y2_mod and y3_mod alongside the original columns.
tb %>%
  mutate_at(vars(starts_with("y")), list(mod = ~ . - z))
#> # A tibble: 3 x 8
#> x y1 y2 y3 z y1_mod y2_mod y3_mod
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 6 2 0 2 4
#> 2 2 1 2 3 3 -2 -1 0
#> 3 3 6 4 2 1 5 3 1

# Create the *_mod columns, then rename them so "mod" becomes a prefix.
tb %>%
  mutate_at(vars(starts_with("y")), list(mod = ~ . - z)) %>%
  rename_at(
    vars(ends_with("_mod")),
    list(~ paste("mod", gsub("_mod", "", .), sep = "_"))
  )
#> # A tibble: 3 x 8
#> x y1 y2 y3 z mod_y1 mod_y2 mod_y3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 6 2 0 2 4
#> 2 2 1 2 3 3 -2 -1 0
#> 3 3 6 4 2 1 5 3 1

Edit 2: dplyr 1.0.0+ has across() function which simplifies this task even further

Basic usage

across() has two primary arguments:

  • The first argument, .cols, selects the columns you want to operate on.
    It uses tidy selection (like select()) so you can pick variables by
    position, name, and type.
  • The second argument, .fns, is a function or list of functions to apply to
    each column. This can also be a purrr style formula (or list of formulas)
    like ~ .x / 2. (This argument is optional, and you can omit it if you just want
    to get the underlying data; you'll see that technique used in
    vignette("rowwise").)
# The `.names` argument controls how the output columns are named. It takes
# a glue specification (see http://glue.tidyverse.org/); here "mod_{col}"
# yields mod_y1, mod_y2 and mod_y3.
tb %>%
  mutate(
    across(
      starts_with("y"),
      ~ .x - z,
      .names = "mod_{col}"
    )
  )
#> # A tibble: 3 x 8
#> x y1 y2 y3 z mod_y1 mod_y2 mod_y3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 6 2 0 2 4
#> 2 2 1 2 3 3 -2 -1 0
#> 3 3 6 4 2 1 5 3 1

# Same computation, selecting y1, y2 and y3 by prefix plus numeric range.
tb %>%
  mutate(
    across(
      num_range(prefix = "y", range = 1:3),
      ~ .x - z,
      .names = "mod_{col}"
    )
  )
#> # A tibble: 3 x 8
#> x y1 y2 y3 z mod_y1 mod_y2 mod_y3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 6 2 0 2 4
#> 2 2 1 2 3 3 -2 -1 0
#> 3 3 6 4 2 1 5 3 1

### Multiple functions
# Several across() calls can share one mutate(): the first adds the column
# maximum of x and z, the second subtracts z from y1 through y3.
tb %>%
  mutate(
    across(
      c(matches("x"), contains("z")),
      ~ max(.x, na.rm = TRUE),
      .names = "max_{col}"
    ),
    across(
      c(y1:y3),
      ~ .x - z,
      .names = "mod_{col}"
    )
  )
#> # A tibble: 3 x 10
#> x y1 y2 y3 z max_x max_z mod_y1 mod_y2 mod_y3
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2 4 6 2 3 3 0 2 4
#> 2 2 1 2 3 3 3 3 -2 -1 0
#> 3 3 6 4 2 1 3 3 5 3 1

Created on 2018-10-29 by the reprex package (v0.2.1)

creating new variables from multiple variable using mutate() and across() in dplyr 1.0.0

You can use -

library(dplyr)

# Label every su_* column as "less" / "mid" / "lots" and store the labels in
# new <column>_disc columns.
df %>%
  mutate(
    across(
      starts_with("su_"),
      ~ ifelse(
        .x < 0,
        "less",
        ifelse(.x > 0 & .x <= 0.5, "mid", "lots")
      ),
      .names = "{col}_disc"
    )
  )

# su_1 su_2 su_3 su_1_disc su_2_disc su_3_disc
#1 0.40 0.57 -0.11 mid lots less
#2 1.82 -0.55 0.44 lots less mid
#3 0.44 1.47 -0.39 mid lots less
#4 -0.82 0.00 -0.12 less lots less
#5 0.17 -0.10 -1.55 mid less less
#6 0.20 0.98 -1.02 mid lots less
#7 -0.01 1.12 -0.30 less lots less
#8 -0.70 0.31 0.35 less mid mid
#9 0.46 1.18 -0.22 mid lots less
#10 -1.09 0.03 -0.85 less mid less
#11 -0.03 1.81 1.28 less lots lots
#12 -0.11 1.64 -0.51 less lots less

You can also replace ifelse with case_when or cut.

Mutate across multiple columns to create new variable sets

This might be easier in long format, but here's an option you can pursue as wide data.

Using the latest version of dplyr, you can mutate with across() and include the .names argument to define how you want your new columns to be named.

library(tidyverse)

# Names of the value columns to summarise.
my_col <- c("var1", "var2", "var3", "var4")

# Per year: add the group mean of each value column (mean_<col>), then each
# value relative to its group mean (relmean_<col>).
df %>%
  group_by(year) %>%
  # all_of() marks my_col as an external character vector of column names;
  # passing the bare vector to a tidyselect context is deprecated.
  mutate(across(all_of(my_col), mean, .names = "mean_{col}")) %>%
  mutate(
    across(all_of(my_col), .names = "relmean_{col}") /
      across(all_of(paste0("mean_", my_col)))
  ) %>%
  # Drop the grouping so later steps do not silently operate per year.
  ungroup()

Output

   year country  var1  var2  var3  var4 mean_var1 mean_var2 mean_var3 mean_var4 relmean_var1 relmean_var2 relmean_var3 relmean_var4
<int> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1910 GER 1 4 10 6 3 5 9 7.5 0.333 0.8 1.11 0.8
2 1911 GER 2 3 11 7 1.5 3.5 10.5 8 1.33 0.857 1.05 0.875
3 1910 FRA 5 6 8 9 3 5 9 7.5 1.67 1.2 0.889 1.2
4 1911 FRA 1 4 10 9 1.5 3.5 10.5 8 0.667 1.14 0.952 1.12

mutate(across) to generate multiple new columns in tidyverse

In this case, you can use cur_data() and cur_column(): the columns we want to add together share the same suffix, so we only need to swap the number in the column name.

library(dplyr)

df <- data.frame(
  oldvar1_a = 1:3,
  oldvar2_a = 4:6,
  oldvar1_i = 7:9,
  oldvar2_i = 10:12,
  z = c(1, 10, 20)
)

# Add each oldvar1_* column to its oldvar2_* partner, then subtract z.
# `[[` extracts the partner column as a plain vector. Single-bracket `[`
# would return a one-column data frame, so each new column would become a
# nested data frame printed under the partner's name instead of the
# requested "{col}_new" name.
mutate(
  df,
  across(
    starts_with("oldvar1"),
    ~ (.x + cur_data()[[gsub("1", "2", cur_column())]]) - z,
    .names = "{col}_new"
  )
)
#>   oldvar1_a oldvar2_a oldvar1_i oldvar2_i  z oldvar1_a_new oldvar1_i_new
#> 1         1         4         7        10  1             4            16
#> 2         2         5         8        11 10            -3             9
#> 3         3         6         9        12 20           -11             1

If you want to use this with case_when, just make sure to index using [[ (you can read more here).

df <- data.frame(
  oldvar1_a = 1:3,
  oldvar2_a = 4:6,
  oldvar1_i = 7:9,
  oldvar2_i = 10:12,
  z = c(1, 2, 0)
)

# For each oldvar1_* column: keep its own value where z == 1, take the
# matching oldvar2_* value where z == 2, and use NA otherwise.
mutate(
  df,
  across(
    starts_with("oldvar1"),
    ~ case_when(
      z == 1 ~ .x,
      z == 2 ~ cur_data()[[gsub("1", "2", cur_column())]],
      TRUE ~ NA_integer_
    ),
    .names = "{col}_new"
  )
)
#> oldvar1_a oldvar2_a oldvar1_i oldvar2_i z oldvar1_a_new oldvar1_i_new
#> 1 1 4 7 10 1 1 7
#> 2 2 5 8 11 2 5 11
#> 3 3 6 9 12 0 NA NA

Mutate: assign multiple variables at once

You can make a few changes to the function:

# Build a lookup table with one row per element of `shares`.
#
# Every row holds the placeholder values "2" / "2"; the elements of `shares`
# are only counted, never inspected.
#
# @param shares Vector (or list) with one entry per share.
# @return A data.frame with character columns `identifier` and `sedol` and
#   `length(shares)` rows (zero rows for empty input).
get_identifiers <- function(shares) {
  n_shares <- length(shares)
  # Build both columns in one step instead of growing a matrix with rbind()
  # inside a loop. This also keeps the two named columns when `shares` is
  # empty, where the loop version failed while assigning the names.
  data.frame(
    identifier = rep("2", n_shares),
    sedol = rep("2", n_shares),
    stringsAsFactors = FALSE
  )
}

(There are better ways to write the above code but I understand that this is just a simplified example.)

and then save the output in a list and use unnest_wider to get them in different columns.

library(dplyr)
library(tidyr)

# Store the result of get_identifiers() in a list-column, then spread its
# fields into separate columns with unnest_wider().
res %>%
  mutate(
    data = list(get_identifiers(symbol))
  ) %>%
  unnest_wider(data)

Creating new variable with dplyr::mutate based on multiple conditions and corresponding variable names passed by string vector (or tidyselect)

There are multiple ways; one option is c_across().

library(dplyr) # >= 1.0.0
# Row by row: flag rows where any column named in `varsel` equals 1.4, and
# rows where every *Width column is greater than 1.
iris %>%
  rowwise() %>%
  mutate(
    # all_of() marks `varsel` as an external character vector of column
    # names; passing the bare vector to a tidyselect context is deprecated.
    has_petal_1.4 = any(c_across(all_of(varsel)) == 1.4),
    width_greater_1 = all(c_across(ends_with("Width")) > 1)
  ) %>%
  ungroup()
# A tibble: 150 x 7
# Sepal.Length Sepal.Width Petal.Length Petal.Width Species has_petal_1.4 width_greater_1
# <dbl> <dbl> <dbl> <dbl> <fct> <lgl> <lgl>
# 1 5.1 3.5 1.4 0.2 setosa TRUE FALSE
# 2 4.9 3 1.4 0.2 setosa TRUE FALSE
# 3 4.7 3.2 1.3 0.2 setosa FALSE FALSE
# 4 4.6 3.1 1.5 0.2 setosa FALSE FALSE
# 5 5 3.6 1.4 0.2 setosa TRUE FALSE
# 6 5.4 3.9 1.7 0.4 setosa FALSE FALSE
# 7 4.6 3.4 1.4 0.3 setosa TRUE FALSE
# 8 5 3.4 1.5 0.2 setosa FALSE FALSE
# 9 4.4 2.9 1.4 0.2 setosa TRUE FALSE
#10 4.9 3.1 1.5 0.1 setosa FALSE FALSE
# … with 140 more rows

Or a faster option with rowSums

# Vectorised alternative: count matches per row with rowSums() instead of
# evaluating row by row.
iris %>%
  mutate(
    # all_of() marks `varsel` as an external character vector of column
    # names; passing the bare vector to select() is deprecated.
    has_petal_1.4 = rowSums(select(., all_of(varsel)) == 1.4) > 0,
    # Both *Width columns must exceed 1, hence the comparison with 2.
    width_greater_1 = rowSums(select(., ends_with("Width")) > 1) == 2
  )

Can we actually pass two sets of multiple variables into mutate across in dplyr

You can do this by combining get() with cur_column().

library(dplyr)

# For columns a and b, keep the value where the matching <column>_avail flag
# is 1 and use NA where the flag is 0; results are stored in <column>_new.
df %>%
  mutate(
    across(
      .cols = c(a, b),
      .fns = ~ case_when(
        get(glue::glue("{cur_column()}_avail")) == 1 ~ .x,
        get(glue::glue("{cur_column()}_avail")) == 0 ~ NA_real_
      ),
      .names = "{.col}_new"
    )
  )

# a a_avail b b_avail a_new b_new
# <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 0 1 1 1 0 1
#2 1 1 1 0 1 NA
#3 0 1 1 0 0 NA
#4 0 0 0 1 NA 0
#5 0 0 0 0 NA NA

PS - I am not sure if this should be an answer to the post that you linked.



Related Topics



Leave a reply



Submit