Change Level of Multiple Factor Variables

Change the levels of multiple factors that start_with the same pattern in R

Use forcats::fct_inseq:

# Convert every column to a factor, then order the levels of `col2` by their
# numeric value (fct_inseq) instead of alphabetically.
# `col2` is referenced by bare name so fct_inseq() sees the column that was
# just converted inside this pipeline; `df$col2` would read the original,
# unconverted column from the global `df`.
df <- df %>%
  mutate(across(everything(), as.factor)) %>%
  mutate(col2 = fct_inseq(col2))

Output:

# Show the resulting level order of col2.
levels(df$col2)
[1] "10.01" "10.02" "10.03" "12.1"  "12.2"  "12.3"  "100.1" "100.2" "100.3"

Reorder the levels of multiple categorical variables using a vector of variable names

Use lapply to change factor levels in multiple columns. Also make sure that the factor levels are exactly the same as the values in your data, otherwise the unmatched values are returned as NA. In your attempt you are using mixed upper and lower case, whereas in your data everything is lower case.

# Re-level every column named in `myvars` with one shared, explicitly ordered
# set of levels. Values that are not listed in `levels` become NA.
# List-style indexing (`dataset[myvars]`) always yields a data frame, so this
# also works when `myvars` names a single column; `dataset[, myvars]` would
# drop to a bare vector there and lapply() would iterate over its elements.
dataset[myvars] <- lapply(
  dataset[myvars],
  factor,
  levels = c("not at all", "moderately", "a little bit", "a lot")
)

Using dplyr :

library(dplyr)
# Convert every column named in `myvars` to a factor with one shared,
# explicitly ordered set of levels. Values not listed in `levels` become NA.
# all_of() makes it explicit that `myvars` is an external character vector,
# and the anonymous function replaces passing extra arguments through
# across(), which is deprecated in current dplyr.
dataset %>%
  mutate(across(
    all_of(myvars),
    function(x) {
      factor(x, levels = c("not at all", "moderately", "a little bit", "a lot"))
    }
  ))
# In older versions of dplyr use mutate_at instead:
# dataset %>%
#   mutate_at(vars(myvars), factor,
#             levels = c("not at all", "moderately", "a little bit", "a lot"))

data

# Assemble the example data frame; each column is named after the vector
# it is built from.
dataset <- data.frame(
  donuts,
  cookies,
  cupcakes,
  coffee,
  macarons
)

How to overwrite levels of multiple factor variables in R using for loop?

In response to OP's request in comments...

library(tidyverse)

# Test data.  3 questions just to demonstrate the principle.
# `levels = 1:3` is given explicitly so the three labels always line up with
# the codes 1, 2 and 3. Without it, factor() derives the levels from the
# sampled values and fails with an "invalid 'labels'" error whenever a random
# draw happens to miss one of the three codes.
d <- tibble(
  Participant = 1:10,
  Q1 = factor(
    sample(1:3, 10, TRUE),
    levels = 1:3,
    labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
  ),
  Q2 = factor(
    sample(1:3, 10, TRUE),
    levels = 1:3,
    labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
  ),
  Q3 = factor(
    sample(1:3, 10, TRUE),
    levels = 1:3,
    labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
  )
)

To recode the factors

# Recode untidy data: collapse every question column to a two-level factor,
# "completely agree" (code 3) versus everything else.
# `levels = c(FALSE, TRUE)` is fixed explicitly so both labels always apply,
# even when a question has no (or only) code-3 responses; without it factor()
# finds a single level and fails because `labels` has two entries.
d %>% mutate(
  across(
    starts_with("Q"),
    function(x) {
      factor(
        as.numeric(x) == 3,
        levels = c(FALSE, TRUE),
        labels = c("Do not completely agree (1&2)", "Completely agree (3)")
      )
    }
  )
)

# Reshape to long ("tidy") form: one row per participant and question.
# The question name goes to `Question`, the answer to `Response`.
dTidy <- pivot_longer(
  d,
  cols = starts_with("Q"),
  names_to = "Question",
  values_to = "Response"
)
dTidy

# Recode tidy data: collapse `Response` to a two-level factor,
# "completely agree" (code 3) versus everything else.
# `levels = c(FALSE, TRUE)` is fixed explicitly so both labels always apply,
# even if no response (or every response) has code 3; without it factor()
# finds a single level and fails because `labels` has two entries.
dTidy %>%
  mutate(
    Response = factor(
      as.numeric(Response) == 3,
      levels = c(FALSE, TRUE),
      labels = c("Do not completely agree (1&2)", "Completely agree (3)")
    )
  )

Not much difference so far. The benefit of tidy data becomes more apparent when we try to do something with it. As a simple example, plot a bar chart of the responses to each question. The untidy data isn't particularly amenable to this. Here's a simple summary:

# Plot untidy data
# Draws one bar chart per question column. The columns Q1, Q2 and Q3 are
# hard-coded, so any other question column in `data` is not plotted.
doPlots <- function(data) {
  q1_bars <- ggplot(data) + geom_bar(aes(x = Q1))
  q2_bars <- ggplot(data) + geom_bar(aes(x = Q2))
  q3_bars <- ggplot(data) + geom_bar(aes(x = Q3))
  print(q1_bars)
  print(q2_bars)
  print(q3_bars)
}

doPlots(d)

Anything else is quite awkward. With tidy data, it's simple:

# Plot tidy data: one panel of response counts per question.
ggplot(dTidy) +
  geom_bar(mapping = aes(x = Response)) +
  facet_grid(rows = vars(Question))

# Or: a single panel, with the bars coloured by question.
ggplot(dTidy) +
  geom_bar(mapping = aes(x = Response, fill = Question))

In addition, suppose a different data set arrives, with more questions than the original.

# Now add another Question
# `levels = 1:3` is given explicitly so the three labels always line up with
# the codes 1, 2 and 3; without it factor() fails whenever the random sample
# happens to miss one of the three codes.

# Untidy form: Q4 becomes one more column.
d <- d %>%
  mutate(
    Q4 = factor(
      sample(1:3, 10, TRUE),
      levels = 1:3,
      labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
    )
  )

# Tidy form: Q4 becomes ten more rows.
dTidy <- dTidy %>%
  bind_rows(
    tibble(
      Participant = 1:10,
      Question = "Q4",
      Response = factor(
        sample(1:3, 10, TRUE),
        levels = 1:3,
        labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
      )
    )
  )

The doPlots function needs to be rewritten: it ignores Q4.

# Re-run the untidy plotting helper on the extended data; only Q1-Q3 are
# drawn. The helper defined earlier is named doPlots (with a trailing "s").
d %>% doPlots()

But the tidy code is robust and needs no changes

# Same plotting code as before; the new question simply adds a panel.
ggplot(dTidy) +
  geom_bar(mapping = aes(x = Response)) +
  facet_grid(rows = vars(Question))

Using tidy data, in my opinion, means your code is

more compact
easier to understand
more robust
easier to maintain
more flexible

Applying same factor levels to multiple variables with differing amount of levels in R

Here is a way with set() called in a for loop.

library(data.table)

# Recode a vector to integer codes: "TRUE"/"1" become 2 and "FALSE"/"0"
# become 1. The input is compared by its character representation; any other
# value is passed through as.integer() unchanged (NA stays NA).
f <- function(x) {
  values <- as.character(x)
  # Both masks are computed before any replacement, so a freshly written "1"
  # is never mistaken for an original "1".
  is_true <- values %in% c("TRUE", "1")
  is_false <- values %in% c("FALSE", "0")
  values[is_false] <- "1"
  values[is_true] <- "2"
  as.integer(values)
}

# Recode each column of `dt` in place, one column position at a time.
for (col_idx in seq_along(dt)) {
  set(dt, j = col_idx, value = f(dt[[col_idx]]))
}

dt
#   region1 region2 region3 region4
#1:       2       2      NA      NA
#2:       1       2       1       1
#3:       1       1       2       1
#4:       2      NA      NA      NA
#5:      NA      NA       1       1

Thanks to jangorecki's comment a much simpler way is

# Recode every column in place; .SD is the set of all columns of `dt` here.
dt[, names(dt) := lapply(.SD, f)]

Creating a new factor variable from multiple factor variables, all with same levels

In dplyr you can specify the conditions in case_when :

library(dplyr)

# Derive `result` row by row from the three factor columns f1, f2, f3.
# The conditions are checked in order and the first match wins:
#   at least two values in {1, 2} -> 1
#   at least two values equal 3   -> 2
#   at least two values equal 4   -> 3
#   anything else                 -> 4
# ungroup() drops the row-wise grouping afterwards so that later verbs
# operate on whole columns again rather than one row at a time.
df %>%
  rowwise() %>%
  mutate(result = {
    vec <- c_across(f1:f3)
    case_when(
      sum(vec %in% 1:2) >= 2 ~ 1,
      sum(vec == 3) >= 2 ~ 2,
      sum(vec == 4) >= 2 ~ 3,
      TRUE ~ 4
    )
  }) %>%
  ungroup()

#      id f1    f2    f3    result
#   <int> <fct> <fct> <fct>  <dbl>
# 1     1 4     2     1          1
# 2     2 1     1     1          1
# 3     3 4     2     2          1
# 4     4 4     3     1          4
# 5     5 2     2     1          1
# 6     6 3     4     2          4
# 7     7 4     2     4          3
# 8     8 3     2     2          1
# 9     9 3     1     1          1
#10    10 2     1     1          1

Is there a function in R to change several similar factor levels at once?

If the vectors are all the same length you can put them in a data frame; if they have different lengths, put them in a list. Either way, use lapply to apply the same function to each of them. You can use forcats::fct_collapse to collapse multiple levels into one.

# Gather the vectors in a list, then collapse the many spellings found in
# each of them into the two levels "yes" and "no".
list_vec <- list(A, B, C)

list_vec <- lapply(
  list_vec,
  forcats::fct_collapse,
  "yes" = c("Likely", "y", "Y", "Yes", "yes"),
  "no" = c("", "No", "UK", "no", "N", "n", "uk")
)

Summarising levels of multiple factor variables

Doing it in two steps will give you the desired result. First, calculate the n, then calculate the percentage by group:

library(dplyr)
# Step 1: count the rows for every group / sex / agegroup / hiv combination.
# Step 2: express each count as a share of its `group` total.
# ungroup() at the end returns a plain (ungrouped) table, so later operations
# on df.out are not silently performed per group.
df.out <- df %>%
  group_by(group, sex, agegroup, hiv) %>%
  tally() %>%
  group_by(group) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup()

A solution with data.table:

library(data.table)
# Convert df to a data.table by reference, count the rows per combination,
# then add each count's share of its `group` total as a new column.
setDT(df)
dt.out <- df[, .N, by = .(group, sex, agegroup, hiv)]
dt.out[, percent := N / sum(N), by = group]

library(microbenchmark)
# Time the dplyr pipeline (df.out) against the data.table chain (dt.out).
microbenchmark(
  df.out = df %>%
    group_by(group, sex, agegroup, hiv) %>%
    tally() %>%
    group_by(group) %>%
    mutate(percent = n / sum(n)),
  dt.out = df[, .N, by = .(group, sex, agegroup, hiv)][
    , percent := N / sum(N), by = group
  ]
)

# Unit: milliseconds
#   expr      min       lq     mean   median       uq       max neval cld
# df.out 8.299870 8.518590 8.894504 8.708315 8.931459 11.964930   100   b
# dt.out 2.346632 2.394788 2.540132 2.441777 2.551235  4.344442   100  a

Conclusion: the data.table solution is much faster (3.5x).

To get a table like you requested after the edit of your question, you can do the following:

library(data.table)

setDT(df)

# For each characteristic: count rows per (characteristic, group), add each
# count's share of all rows, then spread the groups into columns.
dt.sex <- dcast(
  df[, .N, by = .(sex, group)][, percent := N / sum(N)],
  sex ~ group,
  value.var = c("N", "percent")
)
dt.age <- dcast(
  df[, .N, by = .(agegroup, group)][, percent := N / sum(N)],
  agegroup ~ group,
  value.var = c("N", "percent")
)
dt.hiv <- dcast(
  df[, .N, by = .(hiv, group)][, percent := N / sum(N)],
  hiv ~ group,
  value.var = c("N", "percent")
)

# Stack the three tables by column position (their first columns are named
# differently), then give the combined table one common header.
dt.out.wide <- rbindlist(
  list(dt.sex, dt.age, dt.hiv),
  use.names = FALSE
)
names(dt.out.wide) <- c(
  "X",
  "N_Intervention",
  "N_Control",
  "percent_Intervention",
  "percent_Control"
)

this gives:

> dt.out.wide
             X N_Intervention N_Control percent_Intervention percent_Control
 1:       Male           2454      2488               0.2454          0.2488
 2:     Female           2561      2497               0.2561          0.2497
 3:      16-24            954       991               0.0954          0.0991
 4:      25-34           1033      1002               0.1033          0.1002
 5:      35-44           1051      1000               0.1051          0.1000
 6:      45-54            983       978               0.0983          0.0978
 7:        55+            994      1014               0.0994          0.1014
 8:   Positive           1717      1664               0.1717          0.1664
 9:   Negative           1637      1659               0.1637          0.1659
10: Not tested           1661      1662               0.1661          0.1662