Consistent Factor Levels for Same Value Over Different Datasets

How to make the levels of a factor in a data frame consistent across all columns?

You can give every column of the dataset "df" the same level order by looping over the columns (lapply), converting each one to a factor again with the specified levels, and assigning the result back to the corresponding columns.

# Level order that every column should share.
lvls <- c("PASS", "WARN", "FAIL")
# Rebuild each column as a factor on that order; assigning into `df[]`
# replaces the columns in place, so df stays a data.frame.
df[] <- lapply(df, function(column) factor(column, levels = lvls))
str(df)
# 'data.frame': 5 obs. of  5 variables:
# $ Test1: Factor w/ 3 levels "PASS","WARN",..: 1 1 1 1 1
# $ Test2: Factor w/ 3 levels "PASS","WARN",..: 1 1 3 3 2
# $ Test3: Factor w/ 3 levels "PASS","WARN",..: 3 3 3 3 3
# $ Test4: Factor w/ 3 levels "PASS","WARN",..: 2 1 1 1 2
# $ Test5: Factor w/ 3 levels "PASS","WARN",..: 2 2 2 2 2

If you opt to use data.table

library(data.table)
# Turn df into a data.table, then overwrite every column (names(df)) with a
# factor rebuilt on the shared level order in lvls; .SD supplies the columns.
setDT(df)[, names(df):= lapply(.SD, factor, levels=lvls)]

setDT converts the "data.frame" to a "data.table". We then assign (:=) the reconverted factor columns (lapply(..)) to the column names of the dataset. .SD denotes "Subset of Data.table".

data

df <- structure(list(Test1 = structure(c(1L, 1L, 1L, 1L, 1L), 
.Label = "PASS", class = "factor"), 
  Test2 = structure(c(2L, 2L, 1L, 1L, 3L), .Label = c("FAIL", 
 "PASS", "WARN"), class = "factor"), Test3 = structure(c(1L, 
 1L, 1L, 1L, 1L), .Label = "FAIL", class = "factor"), Test4 = 
 structure(c(2L, 1L, 1L, 1L, 2L), .Label = c("PASS", "WARN", "FAIL"), 
 class = "factor"), Test5 = structure(c(1L, 1L, 1L, 1L, 1L), .Label = 
"WARN", class = "factor")), .Names = c("Test1", 
"Test2", "Test3", "Test4", "Test5"), row.names = c("Sample1", 
"Sample2", "Sample3", "Sample4", "Sample5"), class = "data.frame")

randomForest does not work when training set has more different factor levels than test set

R expects both the training and the test data to have the exact same levels (even if one of the sets has no observations for a given level or levels). In your case, since the test dataset is missing a level that the train has, you can do

# Re-create the test factor on the training factor's levels, so both share the
# same level set in the same order (and therefore the same integer coding).
test$val <- factor(test$val, levels=levels(train$val))

to make sure it has all the same levels and they are coded the same way.

(reposted here to close out the question)

Succinct way to set multiple factor levels of different variables to same color in ggplot scatterplots

As you suggested, using scale_fill_manual or scale_color_manual is a valid option.
You could write a function that matches the colors between two clusterings (e.g. relative to the clusters of the first or previous clustering).

Here is one way you could match up the colors and apply that to multiple clusters sequentially:

library(ggplot2)
# Coordinates of the nine example points.
x <- c(.35, .35, .37, .5, .55, .56, .9, .91, .89)
y <- c(.35, .36, .35, .22, .27, .25, .88, .9, .87)
# Four alternative clusterings of the same nine points (3, 4, 5 and 6 clusters).
clu3 <- factor(c(31, 31, 31, 32, 32, 32, 33, 33, 33))
clu4 <- factor(c(41, 41, 41, 42, 43, 43, 44, 44, 44))
clu5 <- factor(c(51, 51, 52, 53, 54, 54, 55, 55, 55)) # added a few more clusters
clu6 <- factor(c(61, 61, 62, 63, 64, 64, 65, 66, 65))
# One row per point: coordinates plus each clustering as a factor column.
df <- data.frame(x, y, clu3, clu4, clu5, clu6)

## Assign colors to the levels of two clusterings so that matching clusters
## share a color.
##
## fac1, fac2: factors of equal length (two clusterings of the same points).
## pal:        candidate colors; replaced by scales::hue_pal() (with a warning)
##             when it is shorter than the larger number of levels.
##
## A level of fac1 "matches" when all of its points fall into a single level of
## fac2; that fac2 level inherits the fac1 color. Unmatched fac2 levels receive
## the first palette colors not yet used in fac2.
##
## Returns a list with
##   fac1: named color vector for levels(fac1)
##   fac2: named color vector for levels(fac2)
##   f2n:  legend labels for levels(fac2); matched levels read "<fac1> | <fac2>"
matchCol <- function(fac1, fac2, pal=c("#999999", "#E69F00", "#56B4E9",
                                       "#009E73", "#F0E442", "#0072B2",
                                       "#D55E00", "#CC79A7")){
    lev1 <- levels(fac1)
    lev2 <- levels(fac2)

    # make sure there is one color per level of the larger clustering
    needed <- max(length(lev1), length(lev2))
    if(length(pal) < needed) {
        warning("Not enough colors; using scales::hue_pal")
        pal <- scales::hue_pal()(needed)
    }

    # overlap[i, j] is TRUE when level i of fac1 and level j of fac2 co-occur
    overlap <- as.matrix(table(fac1, fac2)) > 0
    # fac1 levels contained in exactly one fac2 level, and that level's index
    one_to_one <- which(rowSums(overlap) == 1)
    partner <- apply(overlap[one_to_one, , drop=FALSE], 1, which.max)

    col1 <- setNames(pal[seq_along(lev1)], lev1)
    col2 <- setNames(rep(NA, length(lev2)), lev2)
    col2[lev2[partner]] <- col1[lev1[one_to_one]]   # carry over matching colors

    lab2 <- names(col2)
    if(!identical(fac1, fac2)) {
        lab2[partner] <- paste0(lev1[one_to_one], " | ", lev2[partner])
    }

    # remaining fac2 levels: first palette colors not already present in col2
    unfilled <- is.na(col2)
    col2[unfilled] <- setdiff(pal, col2)[seq_len(sum(unfilled))]

    list(fac1=col1, fac2=col2, f2n=lab2)
}

# then plot using matchCol function, e.g.:
# Compute the mapping once: $fac2 holds the colors for clu4, $f2n the legend
# labels. (The element is named f2n; writing $f2 would only resolve to it via
# partial matching of `$`.)
mc <- matchCol(clu3, clu4)
ggplot(df, aes(x=x, y=y, color=clu4)) +
    geom_point(size=4)+
    theme_bw()+
    ggtitle("Four-cluster scatterplot")+
    theme(plot.title = element_text(hjust = 0.5)) +
    scale_color_manual(values=mc$fac2,
                       labels=mc$f2n)

Sample Image

# or generalized: one plot per clustering column, with colors matched against
# the first clustering
clusts <- grep("clu", colnames(df), value=TRUE)
p1 <- lapply(clusts, function(z){
    # take the factors from df itself (the data being plotted) rather than
    # looking up same-named objects in the calling environment
    mc <- matchCol(df[[clusts[1]]], df[[z]])
    ggplot(df, aes_string(x="x", y="y", color=z)) +
        geom_point(size=4)+
        theme_bw()+
        ggtitle(paste0(gsub("clu", "", z),"-cluster scatterplot"))+
        theme(plot.title = element_text(hjust = 0.5)) +
        # f2n holds the legend labels; name it in full instead of relying on
        # `$` partial matching
        scale_color_manual(values=mc$fac2, labels=mc$f2n)
    }
)
cowplot::plot_grid(plotlist = p1)

Sample Image

# same, relative to previous clustering:
p2 <- lapply(seq_along(clusts), function(z){
    # compare clustering z with the one before it (the first with itself);
    # the factors are taken from df, the data being plotted
    mc <- matchCol(df[[clusts[max(1, z-1)]]], df[[clusts[z]]])
    ggplot(df, aes_string(x="x", y="y", color=clusts[z])) +
        geom_point(size=4)+
        theme_bw()+
        ggtitle(paste0(gsub("clu", "", clusts[z]),"-cluster scatterplot"))+
        theme(plot.title = element_text(hjust = 0.5)) +
        # f2n holds the legend labels; name it in full instead of relying on
        # `$` partial matching
        scale_color_manual(values=mc$fac2, labels=mc$f2n)
  }
)

cowplot::plot_grid(plotlist = p2)

Sample Image

Created on 2020-12-17 by the reprex package (v0.3.0)

lm() looped over factor variable while dropping single-level factor variables from the model

Your model's formula is conditional on whether or not there are enough levels in each independent variable to be included.

You can create a formula based on these conditions (e.g., using ifelse()) and then feed the formula to the model inside lapply().

Here is a solution:

# Fit one lm() per location; x4 and x5 enter the formula only when they take
# more than one value within that location.
lapply(unique(df$location), function(z) {
    # rows belonging to this location
    sub_df <- dplyr::filter(df, location == z)
    # optional terms: an empty string drops the variable from the formula
    form_x4 <- ifelse(length(unique(sub_df$x4)) > 1, "+ x4", "")
    form_x5 <- ifelse(length(unique(sub_df$x5)) > 1, "+ x5", "")
    form <- as.formula(paste("y ~ x1 + x2 + x3", form_x4, form_x5))
    lm(formula = form, data = sub_df)
})

The form inside the above lapply(...) combines the consistent part of the lm() formula with multiple variables that meet the conditions to be used in the formula. If a variable only has a single level, the ifelse() statement allows you to treat it as if it's not there when putting it in the formula.

Subset a dataframe by multiple factor levels

You can use %in%

  data[data$Code %in% selected,]
  Code Value
1    A     1
2    B     2
7    A     3
8    A     4

Consistent Factor Levels for Same Value Over Different Datasets

How to make the levels of a factor in a data frame consistent across all columns?

data

randomForest does not work when training set has more different factor levels than test set

Succinct way to set multiple factor levels of different variables to same color in ggplot scatterplots

lm() looped over factor variable while dropping single-level factor variables from the model

Subset a dataframe by multiple factor levels

Related Topics

Leave a reply