"""Plot ecdf or ecdf-difference plot with confidence bands."""
import numpy as np
from scipy.stats import uniform, binom
from ..rcparams import rcParams
from .plot_utils import get_plotting_function
[docs]def plot_ecdf(
values,
values2=None,
cdf=None,
difference=False,
pit=False,
confidence_bands=None,
pointwise=False,
npoints=100,
num_trials=500,
fpr=0.05,
figsize=None,
fill_band=True,
plot_kwargs=None,
fill_kwargs=None,
plot_outline_kwargs=None,
ax=None,
show=None,
backend=None,
backend_kwargs=None,
**kwargs
):
"""Plot ECDF or ECDF-Difference Plot with Confidence bands.
This plot uses the simulated based algorithm presented in the paper "Graphical Test for
Discrete Uniformity and its Applications in Goodness of Fit Evaluation and
Multiple Sample Comparison" [1]_.
Parameters
----------
values : array-like
Values to plot from an unknown continuous or discrete distribution
values2 : array-like, optional
Values to compare to the original sample
cdf : function, optional
Cumulative distribution function of the distribution to compare the original sample to
difference : bool, optional, Defaults False
If true then plot ECDF-difference plot otherwise ECDF plot
pit : bool, optional
If True plots the ECDF or ECDF-diff of PIT of sample
confidence_bands : bool, optional, Defaults True
If True plots the simultaneous or pointwise confidence bands with 1 - fpr confidence level
pointwise : bool, optional, Defaults False
If True plots pointwise confidence bands otherwise simultaneous bands
npoints : int, optional, Defaults 100
This denotes the granularity size of our plot
i.e the number of evaluation points for our ecdf or ecdf-difference plot
num_trials : int, optional, Defaults 500
The number of random ECDFs to generate to construct simultaneous confidence bands
fpr : float, optional, Defaults 0.05
The type I error rate s.t 1 - fpr denotes the confidence level of bands
figsize : tuple, optional
Figure size. If None it will be defined automatically.
fill_band : bool, optional
Use fill_between to mark the area inside the credible interval.
Otherwise, plot the border lines.
plot_kwargs : dict, optional
Additional kwargs passed to :func:`mpl:matplotlib.pyplot.step` or
:meth:`bokeh:bokeh.plotting.Figure.step`
fill_kwargs : dict, optional
Additional kwargs passed to :func:`mpl:matplotlib.pyplot.fill_between` or
:meth:`bokeh:bokeh.plotting.Figure.varea`
plot_outline_kwargs : dict, optional
Additional kwargs passed to :meth:`mpl:matplotlib.axes.Axes.plot` or
:meth:`bokeh:bokeh.plotting.Figure.line`
ax : axes, optional
Matplotlib axes or bokeh figures.
show : bool, optional
Call backend show function.
backend : str, optional
Select plotting backend {"matplotlib","bokeh"}. Default "matplotlib".
backend_kwargs : dict, optional
These are kwargs specific to the backend being used, passed to
:func:`mpl:matplotlib.pyplot.subplots` or
:meth:`bokeh:bokeh.plotting.figure`.
Returns
-------
axes : matplotlib axes or bokeh figures
References
----------
.. [1] Säilynoja, T., Bürkner, P.C. and Vehtari, A., 2021. Graphical Test for
Discrete Uniformity and its Applications in Goodness of Fit Evaluation and
Multiple Sample Comparison. arXiv preprint arXiv:2103.10522.
Examples
--------
Plot ecdf plot for a given sample
.. plot::
:context: close-figs
>>> import arviz as az
>>> from scipy.stats import uniform, binom, norm
>>> sample = norm(0,1).rvs(1000)
>>> az.plot_ecdf(sample)
Plot ecdf plot with confidence bands for comparing a given sample w.r.t a given distribution
.. plot::
:context: close-figs
>>> distribution = norm(0,1)
>>> az.plot_ecdf(sample, cdf = distribution.cdf, confidence_bands = True)
Plot ecdf-difference plot with confidence bands for comparing a given sample
w.r.t a given distribution
.. plot::
:context: close-figs
>>> az.plot_ecdf(sample, cdf = distribution.cdf,
>>> confidence_bands = True, difference = True)
Plot ecdf plot with confidence bands for PIT of sample for comparing a given sample
w.r.t a given distribution
.. plot::
:context: close-figs
>>> az.plot_ecdf(sample, cdf = distribution.cdf,
>>> confidence_bands = True, pit = True)
Plot ecdf-difference plot with confidence bands for PIT of sample for comparing a given
sample w.r.t a given distribution
.. plot::
:context: close-figs
>>> az.plot_ecdf(sample, cdf = distribution.cdf,
>>> confidence_bands = True, difference = True, pit = True)
You could also plot the above w.r.t another sample rather than a given distribution.
For eg: Plot ecdf-difference plot with confidence bands for PIT of sample for
comparing a given sample w.r.t a given sample
.. plot::
:context: close-figs
>>> sample2 = norm(0,1).rvs(5000)
>>> az.plot_ecdf(sample, sample2, confidence_bands = True, difference = True, pit = True)
"""
if confidence_bands is None:
confidence_bands = (values2 is not None) or (cdf is not None)
if values2 is None and cdf is None and confidence_bands is True:
raise ValueError("For confidence bands you need to specify values2 or the cdf")
if cdf is not None and values2 is not None:
raise ValueError("To compare sample you need either cdf or values2 and not both")
if values2 is None and cdf is None and pit is True:
raise ValueError("For PIT specify either cdf or values2")
if values2 is None and cdf is None and difference is True:
raise ValueError("For ECDF difference plot need either cdf or values2")
if values2 is not None:
values2 = np.ravel(values2)
values2.sort()
values = np.ravel(values)
values.sort()
## This block computes gamma and uses it to get the upper and lower confidence bands
## Here we check if we want confidence bands or not
if confidence_bands:
## If plotting PIT then we find the PIT values of sample.
## Basically here we generate the evaluation points(x) and find the PIT values.
## z is the evaluation point for our uniform distribution in compute_gamma()
if pit:
x = np.linspace(1 / npoints, 1, npoints)
z = x
## Finding PIT for our sample
probs = cdf(values) if cdf else compute_ecdf(values2, values) / len(values2)
else:
## If not PIT use sample for plots and for evaluation points(x) use equally spaced
## points between minimum and maximum of sample
## For z we have used cdf(x)
x = np.linspace(values[0], values[-1], npoints)
z = cdf(x) if cdf else compute_ecdf(values2, x)
probs = values
n = len(values) # number of samples
## Computing gamma
gamma = fpr if pointwise else compute_gamma(n, z, npoints, num_trials, fpr)
## Using gamma to get the confidence intervals
lower, higher = get_lims(gamma, n, z)
## This block is for whether to plot ECDF or ECDF-difference
if not difference:
## We store the coordinates of our ecdf in x_coord, y_coord
x_coord, y_coord = get_ecdf_points(x, probs, difference)
else:
## Here we subtract the ecdf value as here we are plotting the ECDF-difference
x_coord, y_coord = get_ecdf_points(x, probs, difference)
for i, x_i in enumerate(x):
y_coord[i] = y_coord[i] - (
x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
)
## Similarly we subtract from the upper and lower bounds
if pit:
lower = lower - x
higher = higher - x
else:
lower = lower - (cdf(x) if cdf else compute_ecdf(values2, x))
higher = higher - (cdf(x) if cdf else compute_ecdf(values2, x))
else:
if pit:
x = np.linspace(1 / npoints, 1, npoints)
probs = cdf(values)
else:
x = np.linspace(values[0], values[-1], npoints)
probs = values
lower, higher = None, None
## This block is for whether to plot ECDF or ECDF-difference
if not difference:
x_coord, y_coord = get_ecdf_points(x, probs, difference)
else:
## Here we subtract the ecdf value as here we are plotting the ECDF-difference
x_coord, y_coord = get_ecdf_points(x, probs, difference)
for i, x_i in enumerate(x):
y_coord[i] = y_coord[i] - (
x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i)
)
ecdf_plot_args = dict(
x_coord=x_coord,
y_coord=y_coord,
x_bands=x,
lower=lower,
higher=higher,
confidence_bands=confidence_bands,
figsize=figsize,
fill_band=fill_band,
plot_kwargs=plot_kwargs,
fill_kwargs=fill_kwargs,
plot_outline_kwargs=plot_outline_kwargs,
ax=ax,
show=show,
backend_kwargs=backend_kwargs,
**kwargs
)
if backend is None:
backend = rcParams["plot.backend"]
backend = backend.lower()
plot = get_plotting_function("plot_ecdf", "ecdfplot", backend)
ax = plot(**ecdf_plot_args)
return ax
def compute_ecdf(sample, z):
"""Compute ECDF.
This function computes the ecdf value at the evaluation point
or a sorted set of evaluation points.
"""
return np.searchsorted(sample, z, side="right") / len(sample)
def get_ecdf_points(x, probs, difference):
"""Compute the coordinates for the ecdf points using compute_ecdf."""
y = compute_ecdf(probs, x)
if not difference:
x = np.insert(x, 0, x[0])
y = np.insert(y, 0, 0)
return x, y
def compute_gamma(n, z, npoints=None, num_trials=1000, fpr=0.05):
"""Compute gamma for confidence interval calculation.
This function simulates an adjusted value of gamma to account for multiplicity
when forming an 1-fpr level confidence envelope for the ECDF of a sample.
"""
if npoints is None:
npoints = n
gamma = []
for _ in range(num_trials):
unif_samples = uniform.rvs(0, 1, n)
unif_samples = np.sort(unif_samples)
gamma_m = 1000
## Can compute ecdf for all the z together or one at a time.
f_z = compute_ecdf(unif_samples, z)
f_z = compute_ecdf(unif_samples, z)
gamma_m = 2 * min(
np.amin(binom.cdf(n * f_z, n, z)), np.amin(1 - binom.cdf(n * f_z - 1, n, z))
)
gamma.append(gamma_m)
return np.quantile(gamma, fpr)
def get_lims(gamma, n, z):
"""Compute the simultaneous 1 - fpr level confidence bands."""
lower = binom.ppf(gamma / 2, n, z)
upper = binom.ppf(1 - gamma / 2, n, z)
return lower / n, upper / n