Combining Factor Level in R

Combining factor level in R

One option is the `recode()` function from the car package:

library(car)
# The specification string maps each set of old levels to one new level
# ('A' and 'B' -> 'A+B', 'D' and 'E' -> 'D+E'); levels not listed are kept.
recode(x, "c('A', 'B')='A+B';c('D', 'E') = 'D+E'")
#[1] A+B A+B A+B C D+E D+E A+B D+E C
#Levels: A+B C D+E

It should also work with dplyr

library(dplyr)
# dplyr exports its own recode(); attaching dplyr after car masks car's
# version, and dplyr::recode() does not accept this specification string.
# Qualify the call so the car implementation is always the one used.
df %>%
  mutate(x = car::recode(x, "c('A', 'B')='A+B';c('D', 'E') = 'D+E'"))
# x
#1 A+B
#2 A+B
#3 A+B
#4 C
#5 D+E
#6 D+E
#7 A+B
#8 D+E
#9 C

data

# x is the factor being recoded above; its definition is not shown on this page.
df <- data.frame(x)

Grouping 2 levels of a factor in R

Assign to levels(x) to rename the levels of a factor; giving two or more old levels the same new name merges them into a single level. For example:

# Example factor with three levels, each occurring twice.
f <- factor(LETTERS[c(1:3, 3:1)])
f
# [1] A B C C B A
# Levels: A B C

Now combine "A" and "B" into a single level:

# Levels are replaced position by position; giving "A" and "B" the same new
# name merges them into one level.
levels(f) <- c("A", "A", "C")
f
# [1] A A C C A A
# Levels: A C

Combine factor variable levels with few observations, for all factor variables in a data frame

Your current function takes the threshold as a second argument, so you need to pass it along when calling the function through lapply():

# Extra arguments given to lapply() are forwarded to Merge.factors, so 0.15
# is passed as its second argument; assigning into x_df[] replaces the columns
# of x_df with the results.
x_df[] <- lapply(x_df, Merge.factors, 0.15)
#Or to be more specific
#x_df[] <- lapply(x_df, function(x) Merge.factors(x, 0.15))

Now check

# Tabulate every column to confirm that the rare levels were pooled into "Other".
lapply(x_df, table)

#$var
# b c d Other
# 1000 1000 1000 600

#$var1
# c d e Other
# 1000 1000 1000 600

To protect certain levels from being merged (here the level "e"), we can change the function to

# Pool the rare levels of a factor into a single "Other" level.
#
# Arguments:
#   x    A factor (or a vector coercible to one).
#   p    Proportion threshold: a level whose share of the non-missing
#        observations is strictly below p is merged into "Other".
#   keep Character vector of level names that are never merged, however rare.
#        Defaults to "e", the level that was previously hard-coded.
#
# Returns a factor of the same length as x, with observations in their
# original order and NA values left as NA.
Merge.factors <- function(x, p, keep = "e") {
  x <- as.factor(x)
  counts <- table(x)
  is_rare <- as.vector(prop.table(counts)) < p & !(names(counts) %in% keep)
  # which() drops NA comparisons (e.g. 0/0 when x has no non-missing values),
  # so an NA can never end up in the set of names to merge.
  rare_levels <- names(counts)[which(is_rare)]

  # Relabel element-wise instead of rebuilding the vector from the counts:
  # this keeps every observation in its original position, so the result can
  # be assigned back to a data frame column without scrambling rows.
  merged <- as.character(x)
  merged[merged %in% rare_levels] <- "Other"
  as.factor(merged)
}

# NOTE: apply this to the original, unmerged x_df. The output below still shows
# level "e" (400 rows) in $var, which the earlier merged result (no "e",
# Other = 600) no longer contains.
x_df[] <- lapply(x_df, Merge.factors, 0.15)
lapply(x_df, table)

#$var
# b c d e Other
# 1000 1000 1000 400 200

#$var1
# c d e Other
# 1000 1000 1000 600

How to relevel the factor that combines two levels with &

You could strip the non-digit text from each level and average the numbers that remain, so that a combined level such as "3 & 4" gets an in-between, quasi-numeric suffix (3.5).

# Numeric part of each level: replace every run of non-digits with a space,
# split on spaces, and average the numbers found (so "3 & 4" becomes 3.5).
# vapply() guarantees a plain numeric vector whatever the number of levels.
suffix <- vapply(
  strsplit(trimws(gsub("\\D+", " ", levels(Sample))), " "),
  function(x) mean(as.numeric(x)),
  numeric(1)
)

Then, to get the prefixes, map each category name to a larger number that encodes the desired order, using cat.df as a lookup table.

# Lookup table: category name in column 1, its rank (100, 200, 300) in column 2.
cat.df <- data.frame(
  c("Beginner", "intermediate", "Expert"),
  (1:3) * 100
)
# Reduce each level to its leading non-digit text (the category name) and
# look up the rank of that category in cat.df.
prefix <- sapply(
  gsub("(\\D+)\\s.*", "\\1", levels(Sample)),
  function(lbl, known) cat.df[match(lbl, known), 2],
  known = cat.df[, 1]
)

That is all we need to relevel the Sample factor.

# Keep the values, but reorder the levels by prefix + suffix (category rank
# plus the numeric part of the label).
new.Sample <- factor(Sample, levels=levels(Sample)[order(prefix + suffix)])
# [1] Beginner 1 intermediate 8 intermediate 7 & 8 Expert 2
# [5] Expert 10 Beginner 3 & 4 Beginner 5 Beginner 10
# [9] intermediate 1 Expert 1 <NA>
# 10 Levels: Beginner 1 Beginner 3 & 4 Beginner 5 Beginner 10 ... Expert 10

Check

# Show each sorted value next to its integer code to confirm the level order.
data.frame(sort(new.Sample), as.numeric(sort(new.Sample)))
# sort.new.Sample. as.numeric.sort.new.Sample..
# 1 Beginner 1 1
# 2 Beginner 3 & 4 2
# 3 Beginner 5 3
# 4 Beginner 10 4
# 5 intermediate 1 5
# 6 intermediate 7 & 8 6
# 7 intermediate 8 7
# 8 Expert 1 8
# 9 Expert 2 9
# 10 Expert 10 10

Conversion to numeric

# as.numeric() on a factor returns its integer level codes, which now follow
# the reordered levels; the missing value stays NA.
as.numeric(new.Sample)
# [1] 1 7 6 9 10 2 3 4 5 8 NA

Data

# Example data: a factor with ten alphabetically ordered levels and one
# missing observation. The integer codes index into the labels below.
Sample <- factor(
  c(1L, 10L, 9L, 7L, 6L, 3L, 4L, 2L, 8L, 5L, NA),
  levels = 1:10,
  labels = c(
    "Beginner 1", "Beginner 10", "Beginner 3 & 4", "Beginner 5",
    "Expert 1", "Expert 10", "Expert 2",
    "intermediate 1", "intermediate 7 & 8", "intermediate 8"
  )
)


Related Topics



Leave a reply



Submit