Mutate Multiple Columns in a Dataframe

Mutating multiple columns in a data frame using dplyr

You are really close.

df2 <- 
df %>%
mutate(v1v3 = v1 * v3,
v2v4 = v2 * v4)

such a beautifully simple language, right?

For more great tricks please see here.

EDIT:
Thanks to @Facottons pointer to this answer: https://stackoverflow.com/a/34377242/5088194, here is a tidy approach to resolving this issue. It keeps one from having to write a line to hard code in each new column desired. While it is a bit more verbose than the Base R approach, the logic is at least more immediately transparent/readable. It is also worth noting that there must be at least half as many rows as there are columns for this approach to work.

# prep the product column names (also acting as row numbers)
df <-
df %>%
mutate(prod_grp = paste0("v", row_number(), "v", row_number() + 2))

# converting data to tidy format and pairing columns to be multiplied together.
tidy_df <-
df %>%
gather(column, value, -prod_grp) %>%
mutate(column = as.numeric(sub("v", "", column)),
pair = column - 2) %>%
mutate(pair = if_else(pair < 1, pair + 2, pair))

# summarize the products for each column
prod_df <-
tidy_df %>%
group_by(prod_grp, pair) %>%
summarize(val = prod(value)) %>%
spread(prod_grp, val) %>%
mutate(pair = paste0("v", pair, "v", pair + 2)) %>%
rename(prod_grp = pair)

# put the original frame and summary frames together
final_df <-
df %>%
left_join(prod_df) %>%
select(-prod_grp)

Mutate multiple columns using the dplyr framework

You can use pivot_longer to have just one column to modify, which is an alternative to mutate(across()).

You can use case_when to have multiple conditions, so you do not need to nest multiple if statements. The value will be the one of the first true statement.

library(tidyverse)

apcd_hud_ex <- structure(list(studyid = 1:5, SMOKEFREE_DATE = structure(c(
16283,
16283, 16071, 16071, 16648
), class = "Date"), x2014_03_15 = c(
1,
1, 1, 0, 1
), x2014_04_15 = c(1, 1, 1, 1, 1), x2014_05_15 = c(
1,
1, 1, 1, 1
), x2014_06_15 = c(1, 1, 1, 1, 1), x2014_07_15 = c(
1,
1, 1, 1, 1
), x2014_08_15 = c(1, 1, 1, 1, 1), x2014_09_15 = c(
1,
1, 1, 1, 1
), x2014_10_15 = c(1, 1, 1, 1, 1), x2014_11_15 = c(
1,
1, 1, 1, 1
), x2014_12_15 = c(1, 1, 1, 1, 1), x2015_01_15 = c(
1,
1, 1, 1, 1
)), row.names = c(NA, -5L), class = c(
"tbl_df", "tbl",
"data.frame"
))

apcd_hud_ex %>%
pivot_longer(starts_with("x")) %>%
mutate(
insDate = name %>% str_remove("^x") %>% str_replace_all("_", "-") %>% as.Date(),
value = case_when(
value == 0 ~ 0,
insDate < SMOKEFREE_DATE ~ 1,
insDate >= SMOKEFREE_DATE ~ 2
)
) %>%
select(-insDate) %>%
pivot_wider()
#> # A tibble: 5 × 13
#> studyid SMOKEFREE_DATE x2014_03_15 x2014_04_15 x2014_05_15 x2014_06_15
#> <int> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 1 2014-08-01 1 1 1 1
#> 2 2 2014-08-01 1 1 1 1
#> 3 3 2014-01-01 2 2 2 2
#> 4 4 2014-01-01 0 2 2 2
#> 5 5 2015-08-01 1 1 1 1
#> # … with 7 more variables: x2014_07_15 <dbl>, x2014_08_15 <dbl>,
#> # x2014_09_15 <dbl>, x2014_10_15 <dbl>, x2014_11_15 <dbl>, x2014_12_15 <dbl>,
#> # x2015_01_15 <dbl>

Created on 2022-05-05 by the reprex package (v2.0.0)

Mutate across multiple columns to create new variable sets

This might be easier in long format, but here's an option you can pursue as wide data.

Using the latest version of dplyr you can mutate across and include .names argument to define how your want your new columns to look.

library(tidyverse)

my_col <- c("var1", "var2", "var3", "var4")

df %>%
group_by(year) %>%
mutate(across(my_col, mean, .names = "mean_{col}")) %>%
mutate(across(my_col, .names = "relmean_{col}") / across(paste0("mean_", my_col)))

Output

   year country  var1  var2  var3  var4 mean_var1 mean_var2 mean_var3 mean_var4 relmean_var1 relmean_var2 relmean_var3 relmean_var4
<int> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1910 GER 1 4 10 6 3 5 9 7.5 0.333 0.8 1.11 0.8
2 1911 GER 2 3 11 7 1.5 3.5 10.5 8 1.33 0.857 1.05 0.875
3 1910 FRA 5 6 8 9 3 5 9 7.5 1.67 1.2 0.889 1.2
4 1911 FRA 1 4 10 9 1.5 3.5 10.5 8 0.667 1.14 0.952 1.12

Dplyr: add multiple columns with mutate/across from character vector

The !! works for a single element

for(nm in add_cols) test <- test %>% 
mutate(!! nm := NA)

-output

> test
a col_1 col_2
1 1 NA NA
2 2 NA NA
3 3 NA NA

Or another option is

test %>% 
bind_cols(setNames(rep(list(NA), length(add_cols)), add_cols))
a col_1 col_2
1 1 NA NA
2 2 NA NA
3 3 NA NA

In base R, this is easier

test[add_cols] <- NA

Which can be used in a pipe

test %>%
`[<-`(., add_cols, value = NA)
a col_1 col_2
1 1 NA NA
2 2 NA NA
3 3 NA NA

across works only if the columns are already present i.e. it is suggesting to loop across the columns present in the data and do some modification/create new columns with .names modification


We could make use add_column from tibble

library(tibble)
library(janitor)
add_column(test, !!! add_cols) %>%
clean_names %>%
mutate(across(all_of(add_cols), ~ NA))
a col_1 col_2
1 1 NA NA
2 2 NA NA
3 3 NA NA

How to mutate multiple columns as function of multiple columns systematically?

You can do something like this with mutate(across..., however, for renaming columns there must be a shortcut.

df %>% 
mutate(across(.cols = c(varA.t1, varA.t2),
.fns = ~ .x / get(glue::glue(str_replace(cur_column(), "varA", "varB"))),
.names = "V_{.col}")) %>%
rename_with(~str_replace(., "V_varA", "varC"), starts_with("V_"))

# A tibble: 2 x 7
id varA.t1 varA.t2 varB.t1 varB.t2 varC.t1 varC.t2
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 row_1 5 10 2 4 2.5 2.5
2 row_2 20 50 4 6 5 8.33

If there is a long time series you can also create a vector for .cols beforehand.

Mutate multiple columns

We may use across to loop over multiple columns

library(dplyr)
df <- df %>%
group_by(Group) %>%
mutate(across(starts_with('Value'),
~ first(na.omit(.)) - last(na.omit(.)), .names = "{.col}_diff")) %>%
ungroup

-output

df
# A tibble: 6 × 8
Group Dates Value1 Value2 Value3 Value1_diff Value2_diff Value3_diff
<dbl> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2000-01-01 NA 0 5 -1 -1 5
2 1 2000-02-01 1 0 10 -1 -1 5
3 1 2000-03-01 2 1 0 -1 -1 5
4 2 2000-04-01 4 1 NA 2 -1 0
5 2 2000-05-01 1 2 NA 2 -1 0
6 2 2000-06-01 2 2 40 2 -1 0

Add two columns simulataneously via mutate

You can do this by having your function (or a wrapper function) return a data.frame. When you call it in mutate, don’t specify a column name (or else you’ll end up with a nested data.frame column). If you want to specify names for the new columns, you can include them as function arguments as in the below.


library(dplyr)

n <- 1e2; M <- 1e3
variance <- 1

x <- rnorm(n*M, 0, variance)
s <- rep(1:M, each = n)

dat <- data.frame(s = s, x = x)

ci_studclt <- function(x, alpha = 0.05) {
n <- length(x)
S_n <- var(x)
mean(x) + qt(c(alpha/2, 1 - alpha/2), df = n-1)*sqrt(S_n / n)
}

ci_wrapper <- function(x, alpha = 0.05, names_out = c("ci_lower", "ci_upper")) {
ci <- ci_studclt(x, alpha = alpha)
out <- data.frame(ci[[1]], ci[[2]])
names(out) <- names_out
out
}

# original code was ci_studclt(x, variance)
# but ci_studclt() doesn't take a variance argument, so I omitted
dat %>%
group_by(s) %>%
mutate(ci_wrapper(x))

output:

# A tibble: 100,000 x 4
# Groups: s [1,000]
s x ci_lower ci_upper
<int> <dbl> <dbl> <dbl>
1 1 0.233 -0.223 0.139
2 1 1.03 -0.223 0.139
3 1 1.53 -0.223 0.139
4 1 0.0150 -0.223 0.139
5 1 -0.211 -0.223 0.139
6 1 -1.13 -0.223 0.139
7 1 -1.51 -0.223 0.139
8 1 0.371 -0.223 0.139
9 1 1.80 -0.223 0.139
10 1 -0.137 -0.223 0.139
# ... with 99,990 more rows

With specified column names:

dat %>% 
group_by(s) %>%
mutate(ci_wrapper(x, names_out = c("ci.lo", "ci.hi")))

output:

# A tibble: 100,000 x 4
# Groups: s [1,000]
s x ci.lo ci.hi
<int> <dbl> <dbl> <dbl>
1 1 0.233 -0.223 0.139
2 1 1.03 -0.223 0.139
3 1 1.53 -0.223 0.139
4 1 0.0150 -0.223 0.139
5 1 -0.211 -0.223 0.139
6 1 -1.13 -0.223 0.139
7 1 -1.51 -0.223 0.139
8 1 0.371 -0.223 0.139
9 1 1.80 -0.223 0.139
10 1 -0.137 -0.223 0.139
# ... with 99,990 more rows


Related Topics



Leave a reply



Submit