Change Level of Multiple Factor Variables

Change the levels of multiple factors that start_with the same pattern in R

Use forcats::fct_inseq:

# Convert every column to a factor, then order the levels of col2 numerically.
# Inside mutate() refer to the bare column name: `df$col2` would be looked up
# in the calling environment and return the original column, bypassing the
# factor conversion done in the previous step.
df <- df %>%
  mutate(across(everything(), as.factor)) %>%
  mutate(col2 = fct_inseq(col2))

Output:

levels(df$col2)
[1] "10.01" "10.02" "10.03" "12.1" "12.2" "12.3" "100.1" "100.2" "100.3"

Reorder the levels of multiple categorical variables using a vector of variable names

Use lapply to change factor levels in multiple columns. Also make sure that the factor levels are the same as in your data; otherwise the result will contain NA. In your attempt you are using mixed upper and lower case, whereas in your data it is only lower case.

# Index with `dataset[myvars]` (list-style) rather than `dataset[, myvars]`:
# on a base data.frame the matrix-style form drops to a plain vector when
# `myvars` names a single column, and lapply() would then loop over the
# individual values instead of over columns.
dataset[myvars] <- lapply(
  dataset[myvars],
  factor,
  levels = c("not at all", "moderately", "a little bit", "a lot")
)

Using dplyr :

library(dplyr)

# Apply the same level order to every column named in `myvars`.
# all_of() makes it explicit that `myvars` is an external character vector,
# and the anonymous function avoids passing extra arguments through across().
dataset %>%
  mutate(across(
    all_of(myvars),
    function(x) {
      factor(x, levels = c("not at all", "moderately", "a little bit", "a lot"))
    }
  ))

# In older versions of dplyr use mutate_at instead:
# dataset %>%
#   mutate_at(vars(myvars), factor,
#             levels = c("not at all", "moderately", "a little bit", "a lot"))

data

# Combine the five response vectors into one data frame, one column each.
# NOTE(review): donuts, cookies, cupcakes, coffee and macarons are not defined
# in this snippet; presumably they come from the original question.
dataset <- data.frame(donuts,cookies,cupcakes,coffee,macarons)

How to overwrite levels of multiple factor variables in R using for loop?

In response to OP's request in comments...

library(tidyverse)

# Test data. 3 questions just to demonstrate the principle.
# `levels = 1:3` is given explicitly so that the three labels are always
# valid; without it factor() fails whenever a random draw happens not to
# contain all three values.
d <- tibble(
  Participant = 1:10,
  Q1 = factor(
    sample(1:3, 10, TRUE),
    levels = 1:3,
    labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
  ),
  Q2 = factor(
    sample(1:3, 10, TRUE),
    levels = 1:3,
    labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
  ),
  Q3 = factor(
    sample(1:3, 10, TRUE),
    levels = 1:3,
    labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
  )
)

To recode the factors

# Recode untidy data
# Collapse each question to two categories: level code 3 versus everything
# else. Explicit levels keep both labels valid even when a column contains no
# (or only) "Completely Agree (3)" responses.
d %>% mutate(
  across(
    starts_with("Q"),
    function(x) {
      factor(
        as.numeric(x) == 3,
        levels = c(FALSE, TRUE),
        labels = c("Do not completely agree (1&2)", "Completely agree (3)")
      )
    }
  )
)

# Tidy the data: reshape to long format, with the former column name in
# `Question` and its value in `Response`.
dTidy <- pivot_longer(
  d,
  cols = starts_with("Q"),
  names_to = "Question",
  values_to = "Response"
)
dTidy

# Recode tidy data
# Same two-category recode as for the untidy data, but only one column needs
# touching. Explicit levels keep both labels valid even if one of the two
# categories does not occur in the data.
dTidy %>%
  mutate(
    Response = factor(
      as.numeric(Response) == 3,
      levels = c(FALSE, TRUE),
      labels = c("Do not completely agree (1&2)", "Completely agree (3)")
    )
  )

Not much difference so far. The benefit of tidy data becomes more apparent when we try to do something with it. As a simple example, plot a histogram of the questions. The untidy data isn't particularly amenable. Here's a simple summary:

# Plot untidy data: draws one bar chart per question column. Each column has
# to be named explicitly, so only Q1, Q2 and Q3 are plotted.
doPlots <- function(data) {
  print(ggplot(data) + geom_bar(aes(x = Q1)))
  print(ggplot(data) + geom_bar(aes(x = Q2)))
  print(ggplot(data) + geom_bar(aes(x = Q3)))
}

d %>% doPlots()

Anything else is quite awkward. With tidy data, it's simple:

# Plot tidy data: one facet row per question
ggplot(dTidy, aes(x = Response)) +
  geom_bar() +
  facet_grid(rows = vars(Question))

# Or: a single panel, with bars filled by question
ggplot(dTidy, aes(x = Response, fill = Question)) +
  geom_bar()

In addition, suppose a different data set arrives, with more questions than the original.

# Now add another Question
# `levels = 1:3` keeps the three labels valid even when the random draw does
# not contain every value (factor() would otherwise fail).
d <- d %>%
  mutate(
    Q4 = factor(
      sample(1:3, 10, TRUE),
      levels = 1:3,
      labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
    )
  )

# The tidy version just gains ten more rows for the new question.
dTidy <- dTidy %>%
  bind_rows(
    tibble(
      Participant = 1:10,
      Question = "Q4",
      Response = factor(
        sample(1:3, 10, TRUE),
        levels = 1:3,
        labels = c("Do not Agree (1)", "Somewhat Agree (2)", "Completely Agree (3)")
      )
    )
  )

The doPlots function needs to be rewritten: it ignores Q4.

# Still only plots Q1 to Q3, because the columns are hard-coded in doPlots().
d %>% doPlots()

But the tidy code is robust and needs no changes

# The same faceted plot as before; the extra question appears as another facet
# row because faceting is driven by the values of `Question`.
ggplot(dTidy, aes(x = Response)) +
  geom_bar() +
  facet_grid(rows = vars(Question))

Using tidy data, in my opinion, means your code is

  • more compact
  • easier to understand
  • more robust
  • easier to maintain
  • more flexible

Applying same factor levels to multiple variables with differing amount of levels in R

Here is a way with set() called in a for loop.

library(data.table)

# Recode a vector to integer codes: "TRUE"/"1" become 2 and "FALSE"/"0"
# become 1. The input is first converted to character; any other value is
# passed to as.integer() unchanged.
f <- function(x) {
  values <- as.character(x)
  # Compute both masks before replacing anything, so the freshly written "1"
  # is not picked up by the "TRUE"/"1" test.
  truthy <- values %in% c("TRUE", "1")
  falsy <- values %in% c("FALSE", "0")
  values <- replace(values, truthy, "2")
  values <- replace(values, falsy, "1")
  as.integer(values)
}

# Overwrite each column of dt, by position, with its recoded version.
for (j in seq_along(dt)) {
  set(dt, j = j, value = f(dt[[j]]))
}

dt
# region1 region2 region3 region4
#1: 2 2 NA NA
#2: 1 2 1 1
#3: 1 1 2 1
#4: 2 NA NA NA
#5: NA NA 1 1

Thanks to jangorecki's comment a much simpler way is

# Recode every column in one step; .SD holds all columns of dt here because
# no `by` or `.SDcols` is given.
dt[, names(dt) := lapply(.SD, f)]

Creating a new factor variable from multiple factor variables, all with same levels

In dplyr you can specify the conditions in case_when :

library(dplyr)

# Work row by row: count how often each category occurs across f1..f3 and let
# the first rule with at least two hits decide the result (4 if none match).
df %>%
  rowwise() %>%
  mutate(result = {
    row_vals <- c_across(f1:f3)
    n_one_or_two <- sum(row_vals %in% 1:2)
    n_three <- sum(row_vals == 3)
    n_four <- sum(row_vals == 4)
    case_when(
      n_one_or_two >= 2 ~ 1,
      n_three >= 2 ~ 2,
      n_four >= 2 ~ 3,
      TRUE ~ 4
    )
  })

# id f1 f2 f3 result
# <int> <fct> <fct> <fct> <dbl>
# 1 1 4 2 1 1
# 2 2 1 1 1 1
# 3 3 4 2 2 1
# 4 4 4 3 1 4
# 5 5 2 2 1 1
# 6 6 3 4 2 4
# 7 7 4 2 4 3
# 8 8 3 2 2 1
# 9 9 3 1 1 1
#10 10 2 1 1 1

Is there a function in R to change several similar factor levels at once?

If the vectors are of same length you can put them in dataframe or if they are of different length put them in a list and then use lapply to apply the same function for all of them. You can use forcats::fct_collapse to collapse multiple levels into one.

# Collapse the spelling variants to "yes" / "no" in each of the factors.
list_vec <- lapply(
  list(A, B, C),
  function(x) {
    forcats::fct_collapse(
      x,
      yes = c("Likely", "y", "Y", "Yes", "yes"),
      no = c("", "No", "UK", "no", "N", "n", "uk")
    )
  }
)

Summarising levels of multiple factor variables

Doing it in two steps will give you the desired result. First, calculate the n, then calculate the percentage by group:

library(dplyr)
# Step 1: count rows per combination of group, sex, agegroup and hiv.
# Step 2: express each count as a share of its group's total.
df.out <- df %>%
  group_by(group, sex, agegroup, hiv) %>%
  summarise(n = n(), .groups = "drop_last") %>%
  group_by(group) %>%
  mutate(percent = n / sum(n))

A solution with data.table:

library(data.table)
# Convert df to a data.table by reference, count rows per combination, then
# add each count's share of its group's total.
setDT(df)
dt.out <- df[, .N, by = .(group, sex, agegroup, hiv)]
dt.out[, percent := N / sum(N), by = group]

library(microbenchmark)
# Time the dplyr pipeline (df.out) against the data.table chain (dt.out).
# NOTE(review): the dt.out expression indexes `df` with data.table syntax, so
# it presumably relies on `df` already having been converted with setDT() -
# confirm before running this on its own.
microbenchmark(df.out = df %>%
group_by(group, sex, agegroup, hiv) %>%
tally() %>%
group_by(group) %>%
mutate(percent=n/sum(n)),
dt.out = df[,.N,by=.(group, sex, agegroup, hiv)][,percent:=N/sum(N),by=group])

# Unit: milliseconds
# expr min lq mean median uq max neval cld
# df.out 8.299870 8.518590 8.894504 8.708315 8.931459 11.964930 100 b
# dt.out 2.346632 2.394788 2.540132 2.441777 2.551235 4.344442 100 a

Conclusion: the data.table solution is much faster (3.5x).


To get a table like you requested after the edit of your question, you can do the following:

library(data.table)

# Convert df to a data.table by reference.
setDT(df)
# For each characteristic: count rows per (characteristic, group), compute the
# share of each count in the overall total (no `by`, so the denominator is the
# sum over all rows of that table), then spread group into columns for both N
# and percent.
dt.sex <- dcast(df[,.N, by=.(sex,group)][,percent:=N/sum(N)], sex ~ group, value.var = c("N", "percent"))
dt.age <- dcast(df[,.N, by=.(agegroup,group)][,percent:=N/sum(N)], agegroup ~ group, value.var = c("N", "percent"))
dt.hiv <- dcast(df[,.N, by=.(hiv,group)][,percent:=N/sum(N)], hiv ~ group, value.var = c("N", "percent"))

# Stack the three tables by position (their first columns have different
# names), then assign common column names.
# NOTE(review): the hard-coded names assume dcast() puts the Intervention
# columns before the Control columns - verify against the level order of
# `group`.
dt.out.wide <- rbindlist(list(dt.sex, dt.age, dt.hiv), use.names=FALSE)
names(dt.out.wide) <- c("X","N_Intervention","N_Control","percent_Intervention","percent_Control")

This gives:

> dt.out.wide
X N_Intervention N_Control percent_Intervention percent_Control
1: Male 2454 2488 0.2454 0.2488
2: Female 2561 2497 0.2561 0.2497
3: 16-24 954 991 0.0954 0.0991
4: 25-34 1033 1002 0.1033 0.1002
5: 35-44 1051 1000 0.1051 0.1000
6: 45-54 983 978 0.0983 0.0978
7: 55+ 994 1014 0.0994 0.1014
8: Positive 1717 1664 0.1717 0.1664
9: Negative 1637 1659 0.1637 0.1659
10: Not tested 1661 1662 0.1661 0.1662


Related Topics



Leave a reply



Submit