How to make the levels of a factor in a data frame consistent across all columns?
You can give every column of the dataset "df" the same levels, in the same order, by looping over the columns (lapply),
converting each column to a factor
again with the specified levels,
and assigning the result back to the corresponding columns.
# Common level set, in the order every column should use.
lvls <- c("PASS", "WARN", "FAIL")
# Rebuild each column as a factor over `lvls`; `df[] <-` keeps the
# data.frame structure and only replaces the column contents.
df[] <- lapply(df, function(column) factor(column, levels = lvls))
str(df)
# 'data.frame': 5 obs. of 5 variables:
# $ Test1: Factor w/ 3 levels "PASS","WARN",..: 1 1 1 1 1
# $ Test2: Factor w/ 3 levels "PASS","WARN",..: 1 1 3 3 2
# $ Test3: Factor w/ 3 levels "PASS","WARN",..: 3 3 3 3 3
# $ Test4: Factor w/ 3 levels "PASS","WARN",..: 2 1 1 1 2
# $ Test5: Factor w/ 3 levels "PASS","WARN",..: 2 2 2 2 2
If you opt to use data.table
library(data.table)
# Convert `df` to a data.table, then overwrite every column with a factor
# that uses the shared level set `lvls`.
setDT(df)
df[, (names(df)) := lapply(.SD, factor, levels = lvls)]
setDT
converts the "data.frame" to a "data.table" by reference. The assignment operator (:=
) then assigns the re-converted factor columns (lapply(..)
) back to the column names of the dataset. .SD
denotes "Subset of Data.table", i.e. the columns being operated on.
data
# Example data: five samples (rows) scored on five tests (columns). Every
# column is a factor, but the level sets and their order differ by column.
df <- data.frame(
  Test1 = factor(rep("PASS", 5)),
  Test2 = factor(
    c("PASS", "PASS", "FAIL", "FAIL", "WARN"),
    levels = c("FAIL", "PASS", "WARN")
  ),
  Test3 = factor(rep("FAIL", 5)),
  Test4 = factor(
    c("WARN", "PASS", "PASS", "PASS", "WARN"),
    levels = c("PASS", "WARN", "FAIL")
  ),
  Test5 = factor(rep("WARN", 5)),
  row.names = paste0("Sample", 1:5)
)
randomForest does not work when training set has more different factor levels than test set
R expects both the training and the test data to have the exact same levels (even if one of the sets has no observations for a given level or levels). In your case, since the test dataset is missing a level that the train has, you can do
# Re-create the test factor using the training factor's levels, so both
# share the same level set and the same integer coding.
test$val <- factor(test$val, levels=levels(train$val))
to make sure it has all the same levels and that they are coded the same way.
(reposted here to close out the question)
Succinct way to set multiple factor levels of different variables to same color in ggplot scatterplots
As you suggested, using scale_fill_manual
or scale_color_manual
is a valid option.
You could write a function that matches the colors between two clusterings (e.g. relative to the clusters of the first or previous clustering).
Here is one way you could match up the colors and apply that to multiple clusters sequentially:
library(ggplot2)
# Toy data: nine points and four alternative cluster assignments for them.
x <- c(0.35, 0.35, 0.37, 0.50, 0.55, 0.56, 0.90, 0.91, 0.89)
y <- c(0.35, 0.36, 0.35, 0.22, 0.27, 0.25, 0.88, 0.90, 0.87)

# Cluster ids are numeric codes stored as factors, one vector per clustering.
clu3 <- factor(c(31, 31, 31, 32, 32, 32, 33, 33, 33))
clu4 <- factor(c(41, 41, 41, 42, 43, 43, 44, 44, 44))
clu5 <- factor(c(51, 51, 52, 53, 54, 54, 55, 55, 55)) # added a few more clusters
clu6 <- factor(c(61, 61, 62, 63, 64, 64, 65, 66, 65))

df <- data.frame(
  x = x, y = y,
  clu3 = clu3, clu4 = clu4, clu5 = clu5, clu6 = clu6
)
#' Match colours between two clusterings
#'
#' Levels of `fac1` receive the first palette colours in level order. A
#' level of `fac2` inherits the colour of a `fac1` level when that `fac1`
#' level co-occurs with exactly one `fac2` level. The remaining `fac2`
#' levels are filled with palette colours not yet present in the `fac2`
#' mapping.
#'
#' @param fac1,fac2 Factors that are cross-tabulated with `table()`.
#' @param pal Character vector of colours. When it holds fewer colours than
#'   the larger number of levels, a warning is raised and
#'   `scales::hue_pal()` supplies the palette instead.
#' @return A list with `fac1` and `fac2` (colour vectors named by level)
#'   and `f2n` (labels for the levels of `fac2`; matched levels read
#'   "<fac1 level> | <fac2 level>" unless `fac1` and `fac2` are identical).
matchCol <- function(fac1, fac2, pal=c("#999999", "#E69F00", "#56B4E9",
                                       "#009E73", "#F0E442", "#0072B2",
                                       "#D55E00", "#CC79A7")){
  lev1 <- levels(fac1)
  lev2 <- levels(fac2)

  # make sure you have enough colors
  needed <- max(length(lev1), length(lev2))
  if (length(pal) < needed) {
    warning("Not enough colors; using scales::hue_pal")
    pal <- scales::hue_pal()(needed)
  }

  # TRUE where a fac1 level (row) and a fac2 level (column) share a point.
  overlap <- as.matrix(table(fac1, fac2)) > 0
  # fac1 levels that fall into exactly one fac2 level, and that partner.
  single_rows <- which(rowSums(overlap) == 1)
  partner_cols <- apply(overlap[single_rows, , drop = FALSE], 1, which.max)

  col1 <- setNames(pal[seq_along(lev1)], lev1)
  col2 <- setNames(rep(NA, length(lev2)), lev2)
  col2[lev2[partner_cols]] <- col1[lev1[single_rows]] # add matching colors

  lab2 <- names(col2)
  if (!identical(fac1, fac2)) {
    lab2[partner_cols] <- paste0(lev1[single_rows], " | ", lev2[partner_cols])
  }

  # fill in remaining colors
  unassigned <- is.na(col2)
  col2[unassigned] <- setdiff(pal, col2)[seq_len(sum(unassigned))]

  list(fac1 = col1, fac2 = col2, f2n = lab2)
}
# then plot using matchCol function, e.g.:
# Compute the mapping once: `fac2` holds the colours for the clu4 levels and
# `f2n` the legend labels. The element name is spelled out in full, because
# `$f2` only reached `f2n` through partial matching of list names.
mc4 <- matchCol(clu3, clu4)
ggplot(df, aes(x = x, y = y, color = clu4)) +
  geom_point(size = 4) +
  theme_bw() +
  ggtitle("Four-cluster scatterplot") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = mc4$fac2, labels = mc4$f2n)
# or generalized
# One plot per clustering column, with colours matched against the first
# clustering.
clusts <- grep("clu", colnames(df), value = TRUE)
p1 <- lapply(clusts, function(z) {
  # Read the factors from `df` itself rather than from same-named globals.
  mc <- matchCol(df[[clusts[1]]], df[[z]])
  # `.data[[z]]` replaces the deprecated aes_string(); the scale name keeps
  # the column name as the legend title.
  ggplot(df, aes(x = x, y = y, color = .data[[z]])) +
    geom_point(size = 4) +
    theme_bw() +
    ggtitle(paste0(gsub("clu", "", z), "-cluster scatterplot")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    # `f2n` is spelled out; `$f2` only worked through partial matching.
    scale_color_manual(name = z, values = mc$fac2, labels = mc$f2n)
})
cowplot::plot_grid(plotlist = p1)
# same, relative to previous clustering:
# Each clustering is colour-matched against the one before it; the first
# clustering is matched against itself.
p2 <- lapply(seq_along(clusts), function(z) {
  current <- clusts[z]
  previous <- clusts[max(1, z - 1)]
  # Read the factors from `df` itself rather than from same-named globals.
  mc <- matchCol(df[[previous]], df[[current]])
  # `.data[[current]]` replaces the deprecated aes_string(); the scale name
  # keeps the column name as the legend title.
  ggplot(df, aes(x = x, y = y, color = .data[[current]])) +
    geom_point(size = 4) +
    theme_bw() +
    ggtitle(paste0(gsub("clu", "", current), "-cluster scatterplot")) +
    theme(plot.title = element_text(hjust = 0.5)) +
    # `f2n` is spelled out; `$f2` only worked through partial matching.
    scale_color_manual(name = current, values = mc$fac2, labels = mc$f2n)
})
cowplot::plot_grid(plotlist = p2)
Created on 2020-12-17 by the reprex package (v0.3.0)
lm() looped over factor variable while dropping single-level factor variables from the model
Your model's formula is conditional on whether or not there are enough levels in each independent variable to be included.
You can build the formula from these conditions (e.g., using ifelse()
) and then pass that formula to the model inside lapply()
, so that each subset gets its own formula.
Here is a solution:
# Fit one linear model per location. x1, x2 and x3 are always in the model;
# x4 and x5 are added only when the location's subset contains more than one
# distinct observed value for them.
lapply(unique(df$location), function(z) {
  sub_df <- dplyr::filter(df, location == z) # subset by location
  # Count observed values only: NA is not a usable level, and lm() by default
  # drops rows with missing values, so a variable with one real level plus NA
  # would still fail if it were included.
  n_observed <- function(v) length(unique(v[!is.na(v)]))
  form_x4 <- ifelse(n_observed(sub_df$x4) > 1, "+ x4", "")
  form_x5 <- ifelse(n_observed(sub_df$x5) > 1, "+ x5", "")
  form <- as.formula(paste("y ~ x1 + x2 + x3", form_x4, form_x5))
  lm(data = sub_df, formula = form)
})
The form
object inside the above lapply(...)
combines the fixed part of the lm()
formula with whichever optional variables meet the condition. If a variable has only a single level, the ifelse()
statement contributes an empty string, so that variable is simply left out of the formula.
Subset a dataframe by multiple factor levels
You can use %in%
# Keep the rows whose Code value is one of the values in `selected`.
data[data$Code %in% selected,]
Code Value
1 A 1
2 B 2
7 A 3
8 A 4
Related Topics
How to Install R-Packages Not in the Conda Repositories
Select Columns by Class (E.G. Numeric) from a Data.Table
Check to See If a Value Is Within a Range
Update Rows of Data Frame in R
R: Why Kable Doesn't Print Inside a for Loop
How to Always Display 3 Decimal Places in Datatables in R Shiny
Using Pivot_Longer with Multiple Paired Columns in the Wide Dataset
Create Link to the Other Part of the Shiny App
Ggplot Geom_Bar: Stack and Center
Combine Multiple .Rdata Files Containing Objects with the Same Name into One Single .Rdata File
Check If a String Contains at Least One Numeric Character in R
Using Rollmean When There Are Missing Values (Na)
List and Description of All Packages in Cran from Within R
Add Titles to Ggplots Created with Map()
How to Reverse Legend (Labels and Color) So High Value Starts at Bottom