New ones to install:
install.packages("ggbeeswarm")
install.packages("skimr")
install.packages("janitor")
install.packages("ggridges")
To load:
library(tidyverse)
library(ggbeeswarm)
library(skimr)
library(janitor)
library(ggridges)
Below are simulated four distributions (n = 100 each), all with similar measures of center (mean = 0) and spread (s.d. = 1), but with distinctly different shapes.1
n
);s
, Johnson distribution with skewness 2.2 and kurtosis 13);k
, Johnson distribution with skewness 0 and kurtosis 30);mm
, two normals with mean -0.95 and 0.95 and standard deviation 0.31).#install.packages("SuppDists")
library(SuppDists)
# this is used later to generate the s and k distributions
findParams <- function(mu, sigma, skew, kurt) {
value <- .C("JohnsonMomentFitR", as.double(mu), as.double(sigma),
as.double(skew), as.double(kurt - 3), gamma = double(1),
delta = double(1), xi = double(1), lambda = double(1),
type = integer(1), PACKAGE = "SuppDists")
list(gamma = value$gamma, delta = value$delta,
xi = value$xi, lambda = value$lambda,
type = c("SN", "SL", "SU", "SB")[value$type])
}
# Generate sample data -------------------------------------------------------
set.seed(141079)
# normal
n <- rnorm(100)
# right-skew
s <- rJohnson(100, findParams(0, 1, 2.2, 13))
Error in .C("JohnsonMomentFitR", as.double(mu), as.double(sigma), as.double(skew), : "JohnsonMomentFitR" not available for .C() for package "SuppDists"
# leptikurtic
k <- rJohnson(100, findParams(0, 1, 0, 30))
Error in .C("JohnsonMomentFitR", as.double(mu), as.double(sigma), as.double(skew), : "JohnsonMomentFitR" not available for .C() for package "SuppDists"
# mixture
mm <- rnorm(100, rep(c(-1, 1), each = 50) * sqrt(0.9), sqrt(0.1))
four <- data.frame(
dist = factor(rep(c("n", "s", "k", "mm"),
each = 100),
c("n", "s", "k", "mm")),
vals = c(n, s, k, mm)
)
Error in data.frame(dist = factor(rep(c("n", "s", "k", "mm"), each = 100), : object 's' not found
glimpse(four)
Error in glimpse(four): object 'four' not found
Let’s see what our descriptive statistics look like:
skim(four)
Error in is.data.frame(data): object 'four' not found
four %>%
group_by(dist) %>%
skim()
Error in eval(lhs, parent, parent): object 'four' not found
What you want to look for:
The nice thing about ggplot
is that we can use facet_wrap
, and the x- and y-axes are the same, and the size of the binwidths are also the same.
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) + #no y needed for visualization of univariate distributions
geom_histogram(fill = "white", colour = "black") + #easier to see for me
coord_cartesian(xlim = c(-5, 5)) + #use this to change x/y limits!
facet_wrap(~ dist) #this is one factor variable with 4 levels
Error in ggplot(four, aes(x = vals)): object 'four' not found
Always change the binwidths on a histogram. Sometimes the default in ggplot
works great, sometimes it does not.
Super narrow:
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = .1, fill = "white", colour = "black") + #super narrow bins
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Error in ggplot(four, aes(x = vals)): object 'four' not found
Super wide:
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = 2, fill = "white", colour = "black") + #super wide bins
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Error in ggplot(four, aes(x = vals)): object 'four' not found
Just right? Pretty close to the default for this data.
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = .2, fill = "white", colour = "black") +
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Error in ggplot(four, aes(x = vals)): object 'four' not found
#2 x 2 histograms in ggplot
ggplot(four, aes(x = vals)) +
geom_histogram(binwidth = .2, fill = "white", colour = "black") +
geom_rug() +
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Error in ggplot(four, aes(x = vals)): object 'four' not found
What you want to look for:
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot() +
scale_x_discrete(name="") +
scale_y_continuous(name="") +
coord_cartesian(ylim = c(-4,4))
Error in ggplot(four, aes(y = vals, x = dist)): object 'four' not found
ggplot notches: “Notches are used to compare groups; if the notches of two boxes do not overlap, this is strong evidence that the medians differ.” (Chambers et al., 1983, p. 62)
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot(notch = T) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4,4))
Error in ggplot(four, aes(y = vals, x = dist)): object 'four' not found
Here we add a diamond for the mean (see other possible shape codes here).
ggplot(four, aes(x = dist, y = vals)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
shape = 18,
size = 4,
colour = "lightseagreen") +
coord_cartesian(ylim = c(-4, 4))
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
Options:
Combining geom_jitter() + stat_summary()
is the ggplot corollary to a stripchart.
ggplot(four, aes(x = dist, y = vals)) +
geom_jitter(position = position_jitter(height = 0, width = .1),
fill = "lightseagreen",
colour = "lightseagreen",
alpha = .5) +
stat_summary(fun.y = median,
fun.ymin = median,
fun.ymax = median,
geom = "crossbar",
width = 0.5) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
This is a beeswarm-like ggplot- not exactly the same, but gives you the same idea.
ggplot(four, aes(x = dist, y = vals)) +
geom_dotplot(stackdir = "center",
binaxis = "y",
binwidth = .1,
binpositions = "all",
stackratio = 1.5,
fill = "lightseagreen",
colour = "lightseagreen") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
https://github.com/eclarke/ggbeeswarm
install.packages("ggbeeswarm")
library(ggbeeswarm)
ggplot(four, aes(x = dist, y = vals)) +
geom_quasirandom(fill = "lightseagreen",
colour = "lightseagreen") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
ggplot(four, aes(x = dist, y = vals)) +
geom_quasirandom(fill = "lightseagreen",
colour = "lightseagreen",
method = "smiley") +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
Note that these recommendations do not apply if your data is “big”. You will know your data is too big if you try the below methods and you can’t see many of the individual points (typically, N > 100).
Combining geom_boxplot() + geom_dotplot()
is my personal pick for EDA when I have small - medium data (N < 100).
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot(outlier.shape = NA) +
geom_dotplot(binaxis = 'y',
stackdir = 'center',
stackratio = 1.5,
binwidth = .1,
binpositions = "all",
dotsize = 1,
alpha = .75,
fill = "lightseagreen",
colour = "lightseagreen",
na.rm = TRUE) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
Error in ggplot(four, aes(y = vals, x = dist)): object 'four' not found
You can also combin geom_boxplot() + geom_jitter()
. I left the outliers in to demonstrate the jittered points only go left to right because I set the jitter height = 0
.
ggplot(four, aes(y = vals, x = dist)) +
geom_boxplot(width = .5) + #left the outliers in here, so they are double-plotted
geom_jitter(fill = "lightseagreen",
colour = "lightseagreen",
na.rm = TRUE,
position = position_jitter(height = 0, width = .1),
alpha = .5) +
scale_x_discrete(name = "") +
scale_y_continuous(name = "") +
coord_cartesian(ylim = c(-4, 4))
Error in ggplot(four, aes(y = vals, x = dist)): object 'four' not found
A few ways to do this:
ggplot(four, aes(x = vals)) +
geom_density(fill = "lightseagreen") +
coord_cartesian(xlim = c(-5, 5)) +
facet_wrap(~ dist)
Error in ggplot(four, aes(x = vals)): object 'four' not found
Instead of doing a facet_wrap
, I could make just one plot showing all four distributions. To make each distribution a different color, set the fill
within the aes
, and assign it to the factor variable dist
. Since now all four will be plotted on top of each other, add an alpha
level to make the color fill transparent (0 = completely transparent; 1 = completely opaque).
# Density plots with semi-transparent fill
ggplot(four, aes(x = vals, fill = dist)) +
geom_density(alpha = .5)
Error in ggplot(four, aes(x = vals, fill = dist)): object 'four' not found
These are pretty easy to make in ggplot
. However, note that the y-axis is different from if you just plotted the histogram. In fact, when interpreting this plot, the y-axis is only meaningful for reading density. It is meaningless for interpreting the histogram.
ggplot(four, aes(x = vals)) +
geom_histogram(aes(y = ..density..),
binwidth = .5,
colour = "black",
fill = "white") +
geom_density(alpha = .5, fill = "lightseagreen") +
coord_cartesian(xlim = c(-5,5)) +
facet_wrap(~ dist)
Error in ggplot(four, aes(x = vals)): object 'four' not found
My advice: always set color = NA
for geom_violin
. For fill, always set alpha
.
ggplot(four, aes(x = dist, y = vals)) +
geom_violin(color = NA,
fill = "lightseagreen",
alpha = .5,
na.rm = TRUE,
scale = "count") + # max width proportional to sample size
coord_cartesian(ylim = c(-4, 4))
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
Combination geom_violin() + geom_boxplot()
is my personal pick for EDA when I have large data (N > 100).
ggplot(four, aes(x = dist, y = vals)) +
geom_boxplot(outlier.size = 2,
colour = "lightseagreen",
fill = "black",
na.rm = TRUE,
width = .1) +
geom_violin(alpha = .2,
fill = "lightseagreen",
colour = NA,
na.rm = TRUE) +
coord_cartesian(ylim = c(-4, 4))
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
Note that it is just as easy to layer a univariate scatterplot over a violin plot, just by replacing the geom_boxplot
with a different geom as shown abobe. Lots of combination plots are possible!
Using David Robinson’s code: https://gist.github.com/dgrtwo/eb7750e74997891d7c20
ggplot(four, aes(x = dist, y = vals)) +
geom_flat_violin(alpha = .5,
fill = "lightseagreen",
colour = NA,
na.rm = TRUE) +
coord_flip()
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
Typically makes the most sense to have the factor variable like dist
on the y-axis for these.
https://cran.r-project.org/web/packages/ggridges/vignettes/introduction.html
# play with scale
ggplot(four, aes(x = vals, y = dist)) +
geom_density_ridges(scale = 0.9,
fill = "lightseagreen",
alpha = .5)
Error in ggplot(four, aes(x = vals, y = dist)): object 'four' not found
https://cran.r-project.org/web/packages/ggridges/vignettes/introduction.html
ggplot(four, aes(x = vals, y = dist)) +
geom_density_ridges(jittered_points = TRUE,
position = "raincloud",
fill = "lightseagreen",
alpha = 0.7,
scale = 0.7)
Error in ggplot(four, aes(x = vals, y = dist)): object 'four' not found
The more general stat_summary
function applies a summary function to the variable mapped to y at each x value.
The simplest summary function is mean_se
, which returns the mean and the mean plus its standard error on each side. Thus, stat_summary
will calculate and plot the mean and standard errors for the y variable at each x value.
The default geom is “pointrange” which places a dot at the central y value and extends lines to the minimum and maximum y values. Other geoms you might consider to display summarized data:
geom_errorbar
geom_pointrange
geom_linerange
geom_crossbar
There are a few summary functions from the Hmisc
package which are reformatted for use in stat_summary()
. They all return aesthetics for y
, ymax
, and ymin
.
mean_cl_normal()
mean_sdl()
mean_cl_boot()
median_hilow()
ggplot(four, aes(x = dist, y = vals)) +
stat_summary(fun.data = "mean_se")
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
ggplot(four, aes(x = dist, y = vals)) +
stat_summary(fun.y = "mean", geom = "point") +
stat_summary(fun.y = "max", geom = "point", shape = 21)
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
ggplot(four, aes(x = dist, y = vals)) +
stat_summary(fun.data = median_hilow)
Error in ggplot(four, aes(x = dist, y = vals)): object 'four' not found
You may have noticed two different arguments that are potentially confusing: fun.data
and fun.y
. If the function returns three values, specify the function with the argument fun.data
. If the function returns one value, specify fun.y
. See below.
x <- c(1, 2, 3)
mean(x) # use fun.y
[1] 2
mean_cl_normal(x) # use fun.data
y ymin ymax
1 2 -0.4841377 4.484138
Confidence limits may give us a better idea than standard error limits of whether two means would be deemed statistically different when modeling, so we frequently use mean_cl_normal
or mean_cl_boot
in addition to mean_se
.
Using the ToothGrowth
dataset
data(ToothGrowth)
tg <- ToothGrowth
# Standard error of the mean
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_se") +
ggtitle("Mean +/- SE")
# Connect the points with lines
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_se") +
stat_summary(fun.y = mean, geom = "line") +
ggtitle("Mean +/- SE")
# Use 95% confidence interval instead of SEM
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_cl_normal") +
stat_summary(fun.y = mean, geom = "line") +
ggtitle("Mean with 95% confidence intervals")
# The errorbars overlapped, so use position_dodge to move them horizontally
pd <- position_dodge(0.1) # move them .05 to the left and right
ggplot(tg, aes(x = dose, y = len, colour = supp)) +
stat_summary(fun.data = "mean_cl_normal", position = pd) +
stat_summary(fun.y = mean, geom = "line", position = pd) +
ggtitle("Mean with 95% confidence intervals")
Not the best example for this geom, but another good one for showing variability…
# ribbon geom
ggplot(tg, aes(x = dose, y = len, colour = supp, fill = supp)) +
stat_summary(fun.y = mean, geom = "line") +
stat_summary(fun.data = mean_se, geom = "ribbon", alpha = .5)
If you must…
# Standard error of the mean; note positioning
ggplot(tg, aes(x = factor(dose), y = len, fill = supp)) +
stat_summary(fun.y = mean, geom = "bar", position = position_dodge(width = .9)) +
stat_summary(fun.data = mean_se, geom = "linerange", position = position_dodge(width = .9)) +
ggtitle("Mean +/- SE")
# Use 95% confidence interval instead of SEM
ggplot(tg, aes(x = factor(dose), y = len, fill = supp)) +
stat_summary(fun.y = mean, geom = "bar", position = position_dodge(width = .9)) +
stat_summary(fun.data = mean_cl_boot, geom = "linerange", position = position_dodge(width = .9)) +
ggtitle("Mean with 95% confidence intervals")
More help here: http://www.cookbook-r.com/Graphs/Plotting_means_and_error_bars_(ggplot2)/
The code for these distributions comes from Hadley Wickham’s paper on “40 years of boxplots”↩︎