Source code for arviz.plots.ecdfplot

"""Plot ecdf or ecdf-difference plot with confidence bands."""
import numpy as np
from scipy.stats import uniform, binom

from ..rcparams import rcParams
from .plot_utils import get_plotting_function


def plot_ecdf(
    values,
    values2=None,
    cdf=None,
    difference=False,
    pit=False,
    confidence_bands=None,
    pointwise=False,
    npoints=100,
    num_trials=500,
    fpr=0.05,
    figsize=None,
    fill_band=True,
    plot_kwargs=None,
    fill_kwargs=None,
    plot_outline_kwargs=None,
    ax=None,
    show=None,
    backend=None,
    backend_kwargs=None,
    **kwargs
):
    """Plot ECDF or ECDF-Difference Plot with Confidence bands.

    This plot uses the simulated based algorithm presented in the paper "Graphical Test for
    Discrete Uniformity and its Applications in Goodness of Fit Evaluation and
    Multiple Sample Comparison" [1]_.

    Parameters
    ----------
    values : array-like
        Values to plot from an unknown continuous or discrete distribution
    values2 : array-like, optional
        Values to compare to the original sample
    cdf : function, optional
        Cumulative distribution function of the distribution to compare the original sample to
    difference : bool, optional, Defaults False
        If true then plot ECDF-difference plot otherwise ECDF plot
    pit : bool, optional
        If True plots the ECDF or ECDF-diff of PIT of sample
    confidence_bands : bool, optional
        If True plots the simultaneous or pointwise confidence bands with 1 - fpr confidence
        level. If None (the default), bands are drawn whenever ``values2`` or ``cdf`` is given.
    pointwise : bool, optional, Defaults False
        If True plots pointwise confidence bands otherwise simultaneous bands
    npoints : int, optional, Defaults 100
        This denotes the granularity size of our plot
        i.e the number of evaluation points for our ecdf or ecdf-difference plot
    num_trials : int, optional, Defaults 500
        The number of random ECDFs to generate to construct simultaneous confidence bands
    fpr : float, optional, Defaults 0.05
        The type I error rate s.t 1 - fpr denotes the confidence level of bands
    figsize : tuple, optional
        Figure size. If None it will be defined automatically.
    fill_band : bool, optional
        Use fill_between to mark the area inside the credible interval.
        Otherwise, plot the border lines.
    plot_kwargs : dict, optional
        Additional kwargs passed to :func:`mpl:matplotlib.pyplot.step` or
        :meth:`bokeh:bokeh.plotting.Figure.step`
    fill_kwargs : dict, optional
        Additional kwargs passed to :func:`mpl:matplotlib.pyplot.fill_between` or
        :meth:`bokeh:bokeh.plotting.Figure.varea`
    plot_outline_kwargs : dict, optional
        Additional kwargs passed to :meth:`mpl:matplotlib.axes.Axes.plot` or
        :meth:`bokeh:bokeh.plotting.Figure.line`
    ax : axes, optional
        Matplotlib axes or bokeh figures.
    show : bool, optional
        Call backend show function.
    backend : str, optional
        Select plotting backend {"matplotlib","bokeh"}. Default "matplotlib".
    backend_kwargs : dict, optional
        These are kwargs specific to the backend being used, passed to
        :func:`mpl:matplotlib.pyplot.subplots` or
        :meth:`bokeh:bokeh.plotting.figure`.
    **kwargs
        Forwarded unchanged to the backend plotting function.

    Returns
    -------
    axes : matplotlib axes or bokeh figures

    Raises
    ------
    ValueError
        If confidence bands, PIT or a difference plot are requested without ``values2`` or
        ``cdf``, or if both ``values2`` and ``cdf`` are given.

    References
    ----------
    .. [1] Säilynoja, T., Bürkner, P.C. and Vehtari, A., 2021. Graphical Test for
        Discrete Uniformity and its Applications in Goodness of Fit Evaluation and
        Multiple Sample Comparison. arXiv preprint arXiv:2103.10522.

    Examples
    --------
    Plot ecdf plot for a given sample

    .. plot::
        :context: close-figs

        >>> import arviz as az
        >>> from scipy.stats import uniform, binom, norm

        >>> sample = norm(0,1).rvs(1000)
        >>> az.plot_ecdf(sample)

    Plot ecdf plot with confidence bands for comparing a given sample w.r.t a given distribution

    .. plot::
        :context: close-figs

        >>> distribution = norm(0,1)
        >>> az.plot_ecdf(sample, cdf = distribution.cdf, confidence_bands = True)

    Plot ecdf-difference plot with confidence bands for comparing a given sample
    w.r.t a given distribution

    .. plot::
        :context: close-figs

        >>> az.plot_ecdf(sample, cdf = distribution.cdf,
        >>>              confidence_bands = True, difference = True)

    Plot ecdf plot with confidence bands for PIT of sample for comparing a given sample
    w.r.t a given distribution

    .. plot::
        :context: close-figs

        >>> az.plot_ecdf(sample, cdf = distribution.cdf,
        >>>              confidence_bands = True, pit = True)

    Plot ecdf-difference plot with confidence bands for PIT of sample for comparing a given
    sample w.r.t a given distribution

    .. plot::
        :context: close-figs

        >>> az.plot_ecdf(sample, cdf = distribution.cdf,
        >>>              confidence_bands = True, difference = True, pit = True)

    You could also plot the above w.r.t another sample rather than a given distribution.
    For eg: Plot ecdf-difference plot with confidence bands for PIT of sample for
    comparing a given sample w.r.t a given sample

    .. plot::
        :context: close-figs

        >>> sample2 = norm(0,1).rvs(5000)
        >>> az.plot_ecdf(sample, sample2, confidence_bands = True, difference = True, pit = True)

    """
    has_reference = (values2 is not None) or (cdf is not None)
    if confidence_bands is None:
        confidence_bands = has_reference

    if not has_reference and confidence_bands:
        raise ValueError("For confidence bands you need to specify values2 or the cdf")

    if cdf is not None and values2 is not None:
        raise ValueError("To compare sample you need either cdf or values2 and not both")

    if not has_reference and pit:
        raise ValueError("For PIT specify either cdf or values2")

    if not has_reference and difference:
        raise ValueError("For ECDF difference plot need either cdf or values2")

    # np.sort returns a sorted copy; sorting the result of np.ravel in place
    # could silently reorder the caller's own array when ravel returns a view.
    if values2 is not None:
        values2 = np.sort(np.ravel(values2))
    values = np.sort(np.ravel(values))

    def reference_cdf(points):
        """Evaluate the comparison distribution: ``cdf`` if given, else the ECDF of ``values2``."""
        if cdf is not None:
            return cdf(points)
        return compute_ecdf(values2, points)

    if pit:
        # PIT: evaluate on an even grid over (0, 1] and transform the sample through
        # the reference CDF. compute_ecdf already returns proportions, so no further
        # normalisation by len(values2) is applied.
        x = np.linspace(1 / npoints, 1, npoints)
        probs = reference_cdf(values)
    else:
        # Otherwise use the sample itself and an even grid over its observed range.
        x = np.linspace(values[0], values[-1], npoints)
        probs = values

    lower, higher = None, None
    z = None
    if confidence_bands:
        # z holds the reference CDF at the evaluation points; for PIT that is x itself.
        z = x if pit else reference_cdf(x)
        n = len(values)  # number of samples
        # Pointwise bands use fpr directly; simultaneous bands need the simulated gamma.
        gamma = fpr if pointwise else compute_gamma(n, z, npoints, num_trials, fpr)
        lower, higher = get_lims(gamma, n, z)

    x_coord, y_coord = get_ecdf_points(x, probs, difference)

    if difference:
        if confidence_bands:
            expected = z
        elif pit:
            expected = x
        else:
            # Evaluated point by point so that a cdf callable which only accepts
            # scalars keeps working when no bands are requested.
            expected = np.array([reference_cdf(x_i) for x_i in x], dtype=float)

        # Subtract the reference CDF from the ECDF (and from the bands, if any).
        y_coord = y_coord - expected
        if confidence_bands:
            lower = lower - expected
            higher = higher - expected

    ecdf_plot_args = dict(
        x_coord=x_coord,
        y_coord=y_coord,
        x_bands=x,
        lower=lower,
        higher=higher,
        confidence_bands=confidence_bands,
        figsize=figsize,
        fill_band=fill_band,
        plot_kwargs=plot_kwargs,
        fill_kwargs=fill_kwargs,
        plot_outline_kwargs=plot_outline_kwargs,
        ax=ax,
        show=show,
        backend_kwargs=backend_kwargs,
        **kwargs
    )

    if backend is None:
        backend = rcParams["plot.backend"]
    backend = backend.lower()

    plot = get_plotting_function("plot_ecdf", "ecdfplot", backend)
    ax = plot(**ecdf_plot_args)

    return ax


def compute_ecdf(sample, z):
    """Compute ECDF.

    This function computes the ecdf value at the evaluation point or a sorted set
    of evaluation points.

    Parameters
    ----------
    sample : array-like
        Sample values; assumed sorted in ascending order, as ``np.searchsorted`` requires.
    z : scalar or array-like
        Evaluation point(s).

    Returns
    -------
    float or ndarray
        Proportion of ``sample`` that is less than or equal to each evaluation point.
    """
    return np.searchsorted(sample, z, side="right") / len(sample)


def get_ecdf_points(x, probs, difference):
    """Compute the coordinates for the ecdf points using compute_ecdf.

    Parameters
    ----------
    x : ndarray
        Evaluation points.
    probs : array-like
        Sorted sample whose ECDF is evaluated at ``x``.
    difference : bool
        If False, a starting point ``(x[0], 0)`` is prepended to the coordinates.
        If True the coordinates are returned without the extra point.

    Returns
    -------
    x, y : ndarray
        Coordinates of the ECDF points.
    """
    y = compute_ecdf(probs, x)

    if not difference:
        x = np.insert(x, 0, x[0])
        y = np.insert(y, 0, 0)
    return x, y


def compute_gamma(n, z, npoints=None, num_trials=1000, fpr=0.05):
    """Compute gamma for confidence interval calculation.

    This function simulates an adjusted value of gamma to account for multiplicity
    when forming an 1-fpr level confidence envelope for the ECDF of a sample.

    Parameters
    ----------
    n : int
        Number of samples in each simulated uniform draw.
    z : array-like
        Evaluation points, used as success probabilities of the binomial distribution.
    npoints : int, optional
        Not used in the computation; kept so existing callers keep working.
    num_trials : int, optional
        Number of uniform samples to simulate.
    fpr : float, optional
        Quantile of the simulated gamma values that is returned.

    Returns
    -------
    float
        The ``fpr`` quantile of the simulated gamma values.
    """
    gamma = []
    for _ in range(num_trials):
        unif_samples = np.sort(uniform.rvs(0, 1, n))
        # The ECDF is evaluated once, for all evaluation points together.
        f_z = compute_ecdf(unif_samples, z)
        # Two-sided binomial tail probability at the most extreme evaluation point.
        gamma_m = 2 * min(
            np.amin(binom.cdf(n * f_z, n, z)), np.amin(1 - binom.cdf(n * f_z - 1, n, z))
        )
        gamma.append(gamma_m)
    return np.quantile(gamma, fpr)


def get_lims(gamma, n, z):
    """Compute the simultaneous 1 - fpr level confidence bands.

    Parameters
    ----------
    gamma : float
        Total tail probability, split evenly between the lower and upper band.
    n : int
        Number of samples.
    z : array-like
        Evaluation points, used as success probabilities of the binomial distribution.

    Returns
    -------
    lower, upper : ndarray
        Binomial quantiles at ``gamma / 2`` and ``1 - gamma / 2``, divided by ``n``.
    """
    lower = binom.ppf(gamma / 2, n, z)
    upper = binom.ppf(1 - gamma / 2, n, z)
    return lower / n, upper / n