Ignore Outliers in Ggplot2 Boxplot

Ignore outliers in ggplot2 boxplot

Here is a solution using boxplot.stats

# create a dummy data frame with outliers
df = data.frame(y = c(-100, rnorm(100), 100))

# create boxplot that includes outliers
p0 = ggplot(df, aes(y = y)) + geom_boxplot(aes(x = factor(1)))


# compute lower and upper whiskers
ylim1 = boxplot.stats(df$y)$stats[c(1, 5)]

# scale y limits based on ylim1
p1 = p0 + coord_cartesian(ylim = ylim1*1.05)

How to exclude outliers when using geom_boxplot() + geom_jitter() in R

geom_jitter() does not have argument for discarding the outliers on its own. You need to manually filter the data points to be plotted by defining which points are outliers.

library(dplyr)
library(ggplot2)

mpg %>%
group_by(drv) %>%
mutate(cty_filtered = case_when(cty - quantile(cty)[4] > 1.5*IQR(cty) ~ NA_real_,
quantile(cty)[2] - cty > 1.5*IQR(cty) ~ NA_real_,
TRUE ~ cty)) %>%
ggplot() + geom_boxplot(aes(drv, cty)) + geom_jitter(aes(drv, cty_filtered))

Remove outliers fully from multiple boxplots made with ggplot2 in R and display the boxplots in expanded format

Based on suggestions by @Sven Hohenstein, @Roland and @lukeA I have solved the problem for displaying multiple boxplots in expanded form without outliers.

First plot the box plots without outliers by using outlier.colour=NA in geom_boxplot()

plt_wool <- ggplot(subset(df_mlt, value > 0), aes(x=ID1,y=value)) + 
geom_boxplot(aes(color=factor(ID1)),outlier.colour = NA) +
scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) +
theme_bw() +
theme(legend.text=element_text(size=14), legend.title=element_text(size=14))+
theme(axis.text=element_text(size=20)) +
theme(axis.title=element_text(size=20,face="bold")) +
labs(x = "x", y = "y",colour="legend" ) +
annotation_logticks(sides = "rl") +
theme(panel.grid.minor = element_blank()) +
guides(title.hjust=0.5) +
theme(plot.margin=unit(c(0,1,0,0),"mm"))

Then compute the lower, upper whiskers using boxplot.stats() as the code below. Since I only take into account positive values, I choose them using the condition in the subset().

yp <- subset(df, x>0)             # Choosing only +ve values in col x
sts <- boxplot.stats(yp$x)$stats # Compute lower and upper whisker limits

Now to achieve full expanded view of the multiple boxplots, it is useful to modify the y-axis limit of the plot inside coord_cartesian() function as below,

p1 = plt_wool + coord_cartesian(ylim = c(sts[2]/2,max(sts)*1.05))

Note: The limits of y should be adjusted according to the specific case. In this case I have chosen half of lower whisker limit for ymin.

The resulting plot is below,

Sample Image

Remove outliers and reduce yLim appropriately for each facet in ggplot2

As noted above, you pretty much have to filter before plotting, but this doesn't need to be done by editing any files, or even by creating new dataframes. Using dplyr you can just chain this into the processing of your data. I've done a hopefully reproducible example below with some made-up data (as I don't have yours). I created a function to filter by the same procedures as the boxplot is using. It's a bit hacky, but hopefully works as one potential solution:

require(ggplot2)
require(dplyr)

data_frame <- data.frame(value = c(rnorm(2000, mean = 100, sd = 20), rnorm(2000, mean = 1000, sd = 500)),
level = c(rep(1,2000), rep(2, 2000)),
policy = factor(c(rep(c(rep(1, 500), rep(2, 500), rep(3, 500), rep(4, 500)), 2))))

# filtering function - turns outliers into NAs to be removed
filter_lims <- function(x){
l <- boxplot.stats(x)$stats[1]
u <- boxplot.stats(x)$stats[5]

for (i in 1:length(x)){
x[i] <- ifelse(x[i]>l & x[i]<u, x[i], NA)
}
return(x)
}

data_frame %>%
group_by(level, policy) %>% # do the same calcs for each box
mutate(value2 = filter_lims(value)) %>% # new variable (value2) so as not to displace first one)
ggplot(aes(x='', y=value2, fill = policy)) +
geom_boxplot(na.rm = TRUE, coef = 5) + # remove NAs, and set the whisker length to all included points
facet_wrap( ~ level, scales="free") +
xlab("") + ylab("Authorisation Time (ms)") + ggtitle("Title")

Resulting in the following (simplified) plot:

Graph from synthetic data

No outliers in ggplot boxplot with facet_wrap

Ok, I figured out a more easy way to do this by commenting out some lines in the original ggplot boxplot function and calling the modified function.

I am not a programmer, no idea if this is a good or robust thing to do, but it seems to work fine for now.

This is the modified function I am using:

#modified version of geom_boxplot

require(ggplot2)
geom_boxplot_noOutliers <- function (mapping = NULL, data = NULL, stat = "boxplot",
position = "dodge", outlier.colour = NULL,
outlier.shape = NULL, outlier.size = NULL,
notch = FALSE, notchwidth = .5, varwidth = FALSE,
...) {

#outlier_defaults <- ggplot2:::Geom$find('point')$default_aes()

#outlier.colour <- outlier.colour %||% outlier_defaults$colour
#outlier.shape <- outlier.shape %||% outlier_defaults$shape
#outlier.size <- outlier.size %||% outlier_defaults$size

GeomBoxplot_noOutliers$new(mapping = mapping, data = data, stat = stat,
position = position, outlier.colour = outlier.colour,
outlier.shape = outlier.shape, outlier.size = outlier.size, notch = notch,
notchwidth = notchwidth, varwidth = varwidth, ...)
}

GeomBoxplot_noOutliers <- proto(ggplot2:::Geom, {
objname <- "boxplot_noOutliers"

reparameterise <- function(., df, params) {
df$width <- df$width %||%
params$width %||% (resolution(df$x, FALSE) * 0.9)

# if (!is.null(df$outliers)) {
# suppressWarnings({
# out_min <- vapply(df$outliers, min, numeric(1))
# out_max <- vapply(df$outliers, max, numeric(1))
# })
#
# df$ymin_final <- pmin(out_min, df$ymin)
# df$ymax_final <- pmax(out_max, df$ymax)
# }

# if `varwidth` not requested or not available, don't use it
if (is.null(params) || is.null(params$varwidth) || !params$varwidth || is.null(df$relvarwidth)) {
df$xmin <- df$x - df$width / 2
df$xmax <- df$x + df$width / 2
} else {
# make `relvarwidth` relative to the size of the largest group
df$relvarwidth <- df$relvarwidth / max(df$relvarwidth)
df$xmin <- df$x - df$relvarwidth * df$width / 2
df$xmax <- df$x + df$relvarwidth * df$width / 2
}
df$width <- NULL
if (!is.null(df$relvarwidth)) df$relvarwidth <- NULL

df
}

draw <- function(., data, ..., fatten = 2, outlier.colour = NULL, outlier.shape = NULL, outlier.size = 2,
notch = FALSE, notchwidth = .5, varwidth = FALSE) {
common <- data.frame(
colour = data$colour,
size = data$size,
linetype = data$linetype,
fill = alpha(data$fill, data$alpha),
group = data$group,
stringsAsFactors = FALSE
)

whiskers <- data.frame(
x = data$x,
xend = data$x,
y = c(data$upper, data$lower),
yend = c(data$ymax, data$ymin),
alpha = NA,
common)

box <- data.frame(
xmin = data$xmin,
xmax = data$xmax,
ymin = data$lower,
y = data$middle,
ymax = data$upper,
ynotchlower = ifelse(notch, data$notchlower, NA),
ynotchupper = ifelse(notch, data$notchupper, NA),
notchwidth = notchwidth,
alpha = data$alpha,
common)

# if (!is.null(data$outliers) && length(data$outliers[[1]] >= 1)) {
# outliers <- data.frame(
# y = data$outliers[[1]],
# x = data$x[1],
# colour = outlier.colour %||% data$colour[1],
# shape = outlier.shape %||% data$shape[1],
# size = outlier.size %||% data$size[1],
# fill = NA,
# alpha = NA,
# stringsAsFactors = FALSE)
# outliers_grob <- GeomPoint$draw(outliers, ...)
# } else {
outliers_grob <- NULL
# }

ggname(.$my_name(), grobTree(
outliers_grob,
GeomSegment$draw(whiskers, ...),
GeomCrossbar$draw(box, fatten = fatten, ...)
))
}

guide_geom <- function(.) "boxplot_noOutliers"
draw_legend <- function(., data, ...) {
data <- aesdefaults(data, .$default_aes(), list(...))
gp <- with(data, gpar(col=colour, fill=alpha(fill, alpha), lwd=size * .pt, lty = linetype))
gTree(gp = gp, children = gList(
linesGrob(0.5, c(0.1, 0.25)),
linesGrob(0.5, c(0.75, 0.9)),
rectGrob(height=0.5, width=0.75),
linesGrob(c(0.125, 0.875), 0.5)
))
}

default_stat <- function(.) StatBoxplot
default_pos <- function(.) PositionDodge
default_aes <- function(.) aes(weight=1, colour="grey20", fill="white", size=0.5, alpha = NA, shape = 16, linetype = "solid")
required_aes <- c("x", "lower", "upper", "middle", "ymin", "ymax")

})

I saved it as an r file and use source to load it:

library(ggplot2)
library(scales)

#load functions
source("D:/Eigene Dateien/Scripte/R-Scripte/myfunctions/geomBoxplot_noOutliers.r")

Now I can just plot without outliers using geom_boxplot_noOutliers and everything works fine even with facets :-)

p1 <- ggplot(diamonds, aes(x=cut, y=price, fill=cut))
p1 + geom_boxplot_noOutliers() + facet_wrap(~clarity, scales="free")

Sample Image

How to remove outliers from this box plotting r command

If you just want to remove the outliers from your graph, you can just add the argument outlier.shape = NA to geom_boxplot()

Example:

require(tidyverse)

mtcars %>%
ggplot(aes(1, wt)) +
geom_boxplot(outlier.shape = NA)


Related Topics



Leave a reply



Submit