Some example data science plots in R using `ggplot2`. See https://github.com/WinVector/WVPlots for code/details.
# Build the example data frame `frm` used by the scatter-plot examples.
# The seed is fixed so the example output is reproducible.
set.seed(34903490)
x <- rnorm(50)
# y is a noisy quadratic function of x.
y <- 0.5 * x^2 + 2 * x + rnorm(length(x))
frm <- data.frame(
  x = x,
  y = y,
  # yC flags rows whose y is at or above the 80th percentile of y.
  yC = y >= as.numeric(quantile(y, probs = 0.8)),
  stringsAsFactors = FALSE
)
frm$absY <- abs(frm$y)
frm$posY <- frm$y > 0
Scatterplot with smoothing line through points.
# Scatter plot of y against x with a smoothing line through the points.
WVPlots::ScatterHist(frm, "x", "y", title = "Example Fit")
Scatterplot with best linear fit through points. Also report the R-squared and significance of the linear fit.
# Same scatter plot, but with a linear ("lm") fit; estimate_sig = TRUE also
# reports the significance of the fit.
WVPlots::ScatterHist(
  frm, "x", "y",
  smoothmethod = "lm",
  title = "Example Linear Fit",
  estimate_sig = TRUE
)
Scatterplot compared to the line `x = y`. Also report the coefficient of determination between `x` and `y` (where `y` is “true outcome” and `x` is “predicted outcome”).
# Compare the points to the identity line (smoothmethod = "identity"),
# treating x as the prediction and y as the true outcome.
WVPlots::ScatterHist(
  frm, "x", "y",
  smoothmethod = "identity",
  title = "Example Relation Plot",
  estimate_sig = TRUE
)
Scatterplot of (x, y) color-coded by category/group, with marginal distributions of x and y conditioned on group.
# Two independent standard-normal columns plus a logical group label `cat`
# that is TRUE when x + y is positive; plotted colour-coded by `cat`.
set.seed(34903490)
fmScatterHistC <- data.frame(
  x = rnorm(50),
  y = rnorm(50),
  stringsAsFactors = FALSE
)
fmScatterHistC$cat <- (fmScatterHistC$x + fmScatterHistC$y) > 0
WVPlots::ScatterHistC(
  fmScatterHistC, "x", "y", "cat",
  title = "Example Conditional Distribution"
)
Scatterplot of (x, y) color-coded by discretized z. The continuous variable z is binned into three groups, and then plotted as by `ScatterHistC`.
# Two independent standard-normal columns plus a continuous column z = x + y;
# ScatterHistN colour-codes the points by z.
set.seed(34903490)
frmScatterHistN <- data.frame(
  x = rnorm(50),
  y = rnorm(50),
  stringsAsFactors = FALSE
)
frmScatterHistN$z <- frmScatterHistN$x + frmScatterHistN$y
WVPlots::ScatterHistN(
  frmScatterHistN, "x", "y", "z",
  title = "Example Joint Distribution"
)
Plot the relationship y as a function of x with a smoothing curve that estimates \(E[y | x]\). If y is a 0/1 variable as below (binary classification, where 1 is the target class), then the smoothing curve estimates \(P(y | x)\). Since \(y \in \{0,1\}\) with \(y\) intended to be monotone in \(x\) is the most common use of this graph, `BinaryYScatterPlot` uses a `glm` smoother by default (`use_glm=TRUE`; this is essentially Platt scaling), as the best estimate of \(P(y | x)\).
# Plot the logical outcome posY against x twice: first with the glm smoother
# switched off, then with it on (use_glm = TRUE).
WVPlots::BinaryYScatterPlot(
  frm, "x", "posY",
  use_glm = FALSE,
  title = "Example 'Probability of Y' Plot (ggplot2 smoothing)"
)
WVPlots::BinaryYScatterPlot(
  frm, "x", "posY",
  use_glm = TRUE,
  title = "Example 'Probability of Y' Plot (GLM smoothing)"
)
# Hex-bin plot of a mixture of two Gaussian clouds. The example only runs when
# the optional "hexbin" package is installed.
if (requireNamespace("hexbin", quietly = TRUE)) {
  set.seed(5353636)

  # First cloud is centred at (1, 1), second at (-1, -1).
  df <- rbind(
    data.frame(
      x = rnorm(1000, mean = 1),
      y = rnorm(1000, mean = 1, sd = 0.5),
      stringsAsFactors = FALSE
    ),
    data.frame(
      x = rnorm(1000, mean = -1, sd = 0.5),
      y = rnorm(1000, mean = -1, sd = 0.5),
      stringsAsFactors = FALSE
    ),
    stringsAsFactors = FALSE
  )

  # Inside the if-block the plot must be printed explicitly to be shown.
  print(WVPlots::HexBinPlot(df, "x", "y", "Two gaussians"))
}
# Build the example data frame `frm` for the gain-curve examples:
# `value` is the true value, `model` is a noisy score derived from it.
set.seed(34903490)
y <- abs(rnorm(20)) + 0.1
x <- abs(y + 0.5 * rnorm(20))

frm <- data.frame(
  model = x,
  value = y,
  stringsAsFactors = FALSE
)

# Every item costs 1 except the first, which costs 5.
frm$costs <- 1
frm$costs[1] <- 5
frm$rate <- with(frm, value / costs)

# Flag rows whose value is at or above the 80th percentile of value.
frm$isValuable <- frm$value >= as.numeric(quantile(frm$value, probs = 0.8))
Basic curve: each item “costs” the same. The wizard sorts by true value, the x axis sorts by the model, and plots the fraction of the total population.
# Basic gain curve: rows are ordered by the model score and every row counts
# the same.
WVPlots::GainCurvePlot(
  frm, "model", "value",
  title = "Example Continuous Gain Curve"
)
We can annotate a point of the model at a specific x value
# get the top 10% most valuable points as sorted by the model
gainx <- 0.10

# Build the label for the annotated point of the gain curve.
#
# gx: x position of the annotated point, as a fraction (0.10 means 10%).
# gy: y position of the annotated point, as a fraction.
# Returns a two-line character string with both fractions shown as percentages.
labelfun <- function(gx, gy) {
  pctx <- gx * 100
  pcty <- gy * 100

  paste0(
    "The top ", pctx, "% most valuable points by the model\n",
    "are ", pcty, "% of total actual value"
  )
}
# Gain curve annotated at x = gainx; labelfun builds the annotation text.
WVPlots::GainCurvePlotWithNotation(
  frm, "model", "value",
  title = "Example Gain Curve with annotation",
  gainx = gainx,
  labelfun = labelfun
)
When the `x` values have different costs, take that into account in the gain curve. The wizard now sorts by value/cost, and the x axis is sorted by the model, but plots the fraction of total cost, rather than total count.
# Cost-aware gain curve: the "costs" column gives the cost of each row.
WVPlots::GainCurvePlotC(
  frm, "model", "costs", "value",
  title = "Example Continuous Gain CurveC"
)
set.seed(34903490)
# data with two different regimes of behavior
frm <- rbind(
  # Regime 1: 1000 rows, scores centred at 0, TRUE drawn with probability 0.02.
  data.frame(
    model = rnorm(1000),
    isValuable = sample(
      c(TRUE, FALSE),
      prob = c(0.02, 0.98),
      size = 1000,
      replace = TRUE
    )
  ),
  # Regime 2: 200 rows, scores shifted up by 5, TRUE/FALSE equally likely.
  data.frame(
    model = rnorm(200) + 5,
    isValuable = sample(c(TRUE, FALSE), size = 200, replace = TRUE)
  )
)
# ROC curve of the model score against the TRUE class of isValuable.
WVPlots::ROCPlot(frm, "model", "isValuable", TRUE, title = "Example ROC plot")
Plotting the ROC of two models on the same data, where predictions and true outcome all in the same data frame.
# Two candidate scores (x1, x2) and a logical outcome yC in one data frame,
# so both ROC curves can be drawn from the same data.
set.seed(34903490)
x1 <- rnorm(50)
x2 <- rnorm(length(x1))
y <- 0.2 * x2^2 + 0.5 * x2 + x1 + rnorm(length(x1))
frmP <- data.frame(
  x1 = x1,
  x2 = x2,
  # yC flags rows whose y is at or above the 80th percentile of y.
  yC = y >= as.numeric(quantile(y, probs = 0.8)),
  stringsAsFactors = FALSE
)
# WVPlots::ROCPlot(frmP, "x1", "yC", TRUE, title="Example ROC plot")
# WVPlots::ROCPlot(frmP, "x2", "yC", TRUE, title="Example ROC plot")
WVPlots::ROCPlotPair(
  frmP, "x1", "x2", "yC", TRUE,
  title = "Example ROC pair plot"
)
Plotting the results from two data sets, for example the results of a model on training and test sets, where predictions/outcome for the two data sets are in different data frames.
set.seed(2342458)

# Generate a synthetic data set for the training-vs-test example.
#
# nrows: number of rows to generate.
# Returns a data frame with columns:
#   x  - standard-normal draws
#   y  - sin(x) plus normal noise scaled by 0.25
#   x2 - standard-normal draws, generated independently of x and y
#   yc - logical, TRUE where y > 0.5
make_data <- function(nrows) {
  d <- data.frame(x = rnorm(nrows))
  d[["y"]] <- sin(d[["x"]]) + 0.25 * rnorm(n = nrows)
  d[["x2"]] <- rnorm(n = nrows)
  d[["yc"]] <- d[["y"]] > 0.5
  d
}
# Separate training and test sets drawn from the same generator.
training <- make_data(500)
test <- make_data(200)

# Logistic regression (binomial glm) fitted on the training set only.
model <- glm(yc ~ x + x2, data = training, family = binomial)

# Predictions on the response scale for both data sets.
training$pred <- predict(model, newdata = training, type = "response")
test$pred <- predict(model, newdata = test, type = "response")
# Compare ROC curves from two different data frames (training vs test).
WVPlots::ROCPlotPair2(
  nm1 = "Training", # model 1
  frame1 = training,
  xvar1 = "pred", truthVar1 = "yc", truthTarget1 = TRUE,
  nm2 = "Test", # model 2
  frame2 = test,
  xvar2 = "pred", truthVar2 = "yc", truthTarget2 = TRUE,
  title = "Model performance, training vs test",
  estimate_sig = FALSE
)
Many ROC plots on the same graph.
# Three candidate scores and one logical outcome. y is built from x1 and x2
# only; x3 does not enter it.
set.seed(34903490)
x1 <- rnorm(50)
x2 <- rnorm(length(x1))
x3 <- rnorm(length(x1))
y <- 0.2 * x2^2 + 0.5 * x2 + x1 + rnorm(length(x1))
frm_m <- data.frame(
  x1 = x1,
  x2 = x2,
  x3 = x3,
  # yC flags rows whose y is at or above the 80th percentile of y.
  yC = y >= as.numeric(quantile(y, probs = 0.8))
)
WVPlots::ROCPlotPairList(
  frame = frm_m,
  xvar_names = c("x1", "x2", "x3"),
  truthVar = "yC", truthTarget = TRUE,
  title = "Example ROC list plot"
)
Plots precision and recall as functions of different classifier thresholds.
# Precision and recall as functions of the classifier threshold.
WVPlots::PRTPlot(
  frm, "model", "isValuable", TRUE,
  title = "Example Precision-Recall plot"
)
`ThresholdPlot()` plots a variety of functions of different classifier thresholds.
# replicate PRTPlot. Looks a little different because ThresholdPlot does different smoothing
WVPlots::ThresholdPlot(
  frm, "model", "isValuable",
  title = "Reproduce PRTPlot",
  truth_target = TRUE, # default
  metrics = c("precision", "recall")
)
## Warning: Removed 1 row(s) containing missing values (geom_path).
# default: sensitivity/specificity
WVPlots::ThresholdPlot(
  frm, "model", "isValuable",
  title = "Sensitivity and Specificity as a Function of Threshold"
)
One useful application of `ThresholdPlot` is to “unroll” an ROC plot: if the ROC shows that your model can meet an acceptable trade-off of true positive rate and false positive rate, then `ThresholdPlot` can tell you which threshold achieves that goal.
# "Unroll" the ROC curve: plot its two axes, true and false positive rate,
# against the threshold.
WVPlots::ThresholdPlot(
  frm, "model", "isValuable",
  title = "ROC 'unrolled'",
  metrics = c("true_positive_rate", "false_positive_rate")
)
An extended example can be found here.
# Distribution of the model score split by the isValuable outcome, first as
# densities and then as histograms.
WVPlots::DoubleDensityPlot(
  frm, "model", "isValuable",
  title = "Example double density plot"
)
WVPlots::DoubleHistogramPlot(
  frm, "model", "isValuable",
  title = "Example double histogram plot"
)
set.seed(34903490)
# discrete variable: letters of the alphabet
# frequencies of letters in English
# source: http://en.algoritmy.net/article/40379/Letter-frequency-English
letterFreqs <- c(
  8.167, 1.492, 2.782, 4.253, 12.702, 2.228,
  2.015, 6.094, 6.966, 0.153, 0.772, 4.025, 2.406, 6.749, 7.507, 1.929,
  0.095, 5.987, 6.327, 9.056, 2.758, 0.978, 2.360, 0.150, 1.974, 0.074
)
# The table is in percent; convert to proportions.
letterFreqs <- letterFreqs / 100
letterFrame <- data.frame(
  letter = letters,
  freq = letterFreqs,
  stringsAsFactors = FALSE
)
# now let's generate letters according to their letter frequencies
N <- 1000
randomDraws <- data.frame(
  draw = seq_len(N),
  letter = sample(
    letterFrame$letter,
    size = N,
    replace = TRUE,
    prob = letterFrame$freq
  ),
  stringsAsFactors = FALSE
)
WVPlots::ClevelandDotPlot(
  randomDraws, "letter",
  title = "Example Cleveland-style dot plot"
)
WVPlots::ClevelandDotPlot(
  randomDraws, "letter",
  limit_n = 10,
  title = "Top 10 most frequent letters"
)
WVPlots::ClevelandDotPlot(
  randomDraws, "letter",
  sort = 0,
  title = "Example Cleveland-style dot plot, unsorted"
)
WVPlots::ClevelandDotPlot(
  randomDraws, "letter",
  sort = 1, stem = FALSE,
  title = "Example with increasing sort order + coord_flip, no stem"
) +
  ggplot2::coord_flip()
`ClevelandDotPlot` also accepts an integral x variable. You probably want `sort = 0` in this case.
# Integer-valued example: number of cars per household, drawn from 0:5 with
# unequal weights.
set.seed(34903490)
N <- 1000
ncar_vec <- 0:5
# Relative weights, normalised so they sum to 1.
prob <- c(1.5, 3, 3.5, 2, 1, 0.75)
prob <- prob / sum(prob)

df <- data.frame(
  num_cars = sample(ncar_vec, size = N, replace = TRUE, prob = prob),
  stringsAsFactors = FALSE
)
WVPlots::ClevelandDotPlot(
  df, "num_cars",
  sort = 0,
  title = "Distribution of household vehicle ownership"
)
Plot a bar chart of row counts conditioned on the categorical variable `condvar`, faceted on a second categorical variable, `refinevar`. Each faceted plot also shows a “shadow plot” of the totals conditioned on `condvar` alone.
This plot enables comparisons of sub-population totals across both `condvar` and `refinevar` simultaneously.
set.seed(354534)
N <- 100

# rough proportions of eye colors
eprobs <- c(0.37, 0.36, 0.16, 0.11)

eye_color <- sample(
  c("Brown", "Blue", "Hazel", "Green"),
  size = N,
  replace = TRUE,
  prob = eprobs
)
sex <- sample(c("Male", "Female"), size = N, replace = TRUE)

# A data frame of eye color by sex
dframe <- data.frame(
  eye_color = eye_color,
  sex = sex,
  stringsAsFactors = FALSE
)

WVPlots::ShadowPlot(
  dframe, "eye_color", "sex",
  title = "Shadow plot of eye colors by sex"
)
Plot a histogram of a continuous variable `xvar`, faceted on a categorical conditioning variable, `condvar`. Each faceted plot also shows a “shadow plot” of the unconditioned histogram for comparison.
set.seed(354534)
N <- 100

# Every row starts in "region 2"; rows are then relabelled by their x value.
dframe <- data.frame(
  x = rnorm(N),
  gp = "region 2",
  stringsAsFactors = FALSE
)
# x < -0.5 -> "region 1"; x > 0.5 -> "region 3"; otherwise keep "region 2".
dframe$gp <- with(
  dframe,
  ifelse(x < -0.5, "region 1", ifelse(x > 0.5, "region 3", gp))
)
WVPlots::ShadowHist(dframe, "x", "gp", title = "X values by region")
`ShadowHist` uses the Brewer Dark2 palette by default. You can pass in another Brewer palette to change the color scheme. If you prefer all the histograms to be the same color, set `monochrome=TRUE`.
# Same shadow histogram with every facet drawn in a single colour.
WVPlots::ShadowHist(
  dframe, "x", "gp",
  title = "X values by region",
  monochrome = TRUE
)
To use a non-Brewer palette, such as viridis, or a manual color map, set `palette=NULL`. Here’s an example of setting the color palette manually.
# Manual colour map with one colour per region.
colormap <- c("#1F968BFF", "#29AF7FFF", "#55C667FF")

# palette = NULL turns off the built-in Brewer palette so that the manual
# fill scale added afterwards takes effect.
WVPlots::ShadowHist(
  dframe, "x", "gp",
  title = "X values by region",
  palette = NULL
) +
  ggplot2::scale_fill_manual(values = colormap)
# Three classes with different means; each measurement is its class mean plus
# standard-normal noise.
classes <- c("a", "b", "c")
means <- c(2, 4, 3)
names(means) <- classes
label <- sample(classes, size = 1000, replace = TRUE)
# Indexing the named vector `means` by label looks up each row's class mean.
meas <- means[label] + rnorm(1000)
frm2 <- data.frame(
  label = label,
  meas = meas,
  stringsAsFactors = FALSE
)
WVPlots::ScatterBoxPlot(
  frm2, "label", "meas",
  pt_alpha = 0.2,
  title = "Example Scatter/Box plot"
)
# Second variant; note the measurement and label columns are passed in the
# opposite order.
WVPlots::ScatterBoxPlotH(
  frm2, "meas", "label",
  pt_alpha = 0.2,
  title = "Example Scatter/Box plot"
)
# 1000 draws from Binomial(size = 20, prob = 0.5), plotted as a discrete
# distribution.
frmx <- data.frame(
  x = rbinom(1000, 20, 0.5),
  stringsAsFactors = FALSE
)
WVPlots::DiscreteDistribution(frmx, "x", "Discrete example")
# 100 normal draws scaled by 100, plotted two ways with the PlotDist*Normal
# functions.
set.seed(52523)
d <- data.frame(
  wt = 100 * rnorm(100),
  stringsAsFactors = FALSE
)
WVPlots::PlotDistCountNormal(d, "wt", "example")
WVPlots::PlotDistDensityNormal(d, "wt", "example")
Compare to a binomial with the same success rate as the observed data
set.seed(13951)
trial_size <- 20 # one trial is 20 flips
ntrial <- 100 # run 100 trials
true_frate <- 0.4 # true heads probability
# One row per trial: the number of heads observed in that trial.
fdata <- data.frame(
  n_heads = rbinom(ntrial, trial_size, true_frate),
  stringsAsFactors = FALSE
)

title <- paste("Distribution of head counts, trial size =", trial_size)
# compare to empirical p
WVPlots::PlotDistCountBinomial(fdata, "n_heads", trial_size, title)
Compare to a binomial with a specified success rate
# compare to theoretical p of 0.5
WVPlots::PlotDistCountBinomial(
  fdata, "n_heads", trial_size, title,
  p = 0.5
)
set.seed(349521)
N <- 100 # number of cohorts
psucc <- 0.15 # true success rate in population
group_size <- round(runif(N, min = 25, max = 50)) # sizes of observed sample groups
nsucc <- rbinom(N, group_size, psucc) # successes in each group
hdata <- data.frame(
  n_success = nsucc,
  group_size = group_size,
  stringsAsFactors = FALSE
)

# observed rate of successes in each group
hdata$rate_success <- with(hdata, n_success / group_size)

title <- "Observed prevalence of success in population"

WVPlots::PlotDistHistBeta(hdata, "rate_success", title)
WVPlots::PlotDistDensityBeta(hdata, "rate_success", title)
# Single-group example: y against its own index, smoothed with each of the
# three window alignments (default, "left", "right").
y <- c(1, 2, 3, 4, 5, 10, 15, 18, 20, 25)
x <- seq_along(y)
df <- data.frame(
  x = x,
  y = y,
  stringsAsFactors = FALSE
)
# NULL as the fourth argument means no grouping column.
WVPlots::ConditionalSmoothedScatterPlot(
  df, "x", "y", NULL,
  title = "centered smooth, one group"
)
WVPlots::ConditionalSmoothedScatterPlot(
  df, "x", "y", NULL,
  title = "left smooth, one group",
  align = "left"
)
WVPlots::ConditionalSmoothedScatterPlot(
  df, "x", "y", NULL,
  title = "right smooth, one group",
  align = "right"
)
# Multi-group example: three noisy copies of y (scaled by 1, 1/2 and 2),
# stacked into one data frame and labelled by `gp`.
n <- length(x)
df <- rbind(
  data.frame(
    x = x, y = y + rnorm(n), gp = "times 1",
    stringsAsFactors = FALSE
  ),
  data.frame(
    x = x, y = 0.5 * y + rnorm(n), gp = "times 1/2",
    stringsAsFactors = FALSE
  ),
  data.frame(
    x = x, y = 2 * y + rnorm(n), gp = "times 2",
    stringsAsFactors = FALSE
  ),
  stringsAsFactors = FALSE
)
WVPlots::ConditionalSmoothedScatterPlot(
  df, "x", "y", "gp",
  title = "centered smooth, multigroup"
)
WVPlots::ConditionalSmoothedScatterPlot(
  df, "x", "y", "gp",
  title = "left smooth, multigroup",
  align = "left"
)
WVPlots::ConditionalSmoothedScatterPlot(
  df, "x", "y", "gp",
  title = "right smooth, multigroup",
  align = "right"
)
# Density of 100 standard-normal draws with one tail shaded: first the left
# tail at `threshold`, then the right tail at `-threshold`.
set.seed(52523)
d <- data.frame(meas = rnorm(100), stringsAsFactors = FALSE)
threshold <- -1.5
WVPlots::ShadedDensity(
  d, "meas", threshold,
  title = "Example shaded density plot, left tail"
)
WVPlots::ShadedDensity(
  d, "meas", -threshold,
  tail = "right",
  title = "Example shaded density plot, right tail"
)
set.seed(52523)
d <- data.frame(meas = rnorm(100), stringsAsFactors = FALSE)
# first and third quartiles of the data (central 50%)
boundaries <- quantile(d$meas, c(0.25, 0.75))
# Shade the region of the density between the two boundaries.
WVPlots::ShadedDensityCenter(
  d, "meas", boundaries,
  title = "Example center-shaded density plot"
)