Source code for arviz.plots.ecdfplot

"""Plot ecdf or ecdf-difference plot with confidence bands."""
import numpy as np
from scipy.stats import uniform, binom

from ..rcparams import rcParams
from .plot_utils import get_plotting_function


[docs]def plot_ecdf( values, values2=None, cdf=None, difference=False, pit=False, confidence_bands=None, pointwise=False, npoints=100, num_trials=500, fpr=0.05, figsize=None, fill_band=True, plot_kwargs=None, fill_kwargs=None, plot_outline_kwargs=None, ax=None, show=None, backend=None, backend_kwargs=None, **kwargs ): r"""Plot ECDF or ECDF-Difference Plot with Confidence bands. Plots of the empirical CDF estimates of an array. When `values2` argument is provided, the two empirical CDFs are overlaid with the distribution of `values` on top (in a darker shade) and confidence bands in a more transparent shade. Optionally, the difference between the two empirical CDFs can be computed, and the PIT for a single dataset or a comparison between two samples. Notes ----- This plot computes the confidence bands with the simulated based algorithm presented in [1]_. Parameters ---------- values : array-like Values to plot from an unknown continuous or discrete distribution. values2 : array-like, optional Values to compare to the original sample. cdf : callable, optional Cumulative distribution function of the distribution to compare the original sample. difference : bool, default False If True then plot ECDF-difference plot otherwise ECDF plot. pit : bool, default False If True plots the ECDF or ECDF-diff of PIT of sample. confidence_bands : bool, default None If True plots the simultaneous or pointwise confidence bands with `1 - fpr` confidence level. pointwise : bool, default False If True plots pointwise confidence bands otherwise simultaneous bands. npoints : int, default 100 This denotes the granularity size of our plot i.e the number of evaluation points for the ecdf or ecdf-difference plots. num_trials : int, default 500 The number of random ECDFs to generate for constructing simultaneous confidence bands. fpr : float, default 0.05 The type I error rate s.t `1 - fpr` denotes the confidence level of bands. figsize : (float,float), optional Figure size. If `None` it will be defined automatically. fill_band : bool, default True If True it fills in between to mark the area inside the confidence interval. Otherwise, plot the border lines. plot_kwargs : dict, optional Additional kwargs passed to :func:`mpl:matplotlib.pyplot.step` or :meth:`bokeh.plotting.figure.step` fill_kwargs : dict, optional Additional kwargs passed to :func:`mpl:matplotlib.pyplot.fill_between` or :meth:`bokeh:bokeh.plotting.Figure.varea` plot_outline_kwargs : dict, optional Additional kwargs passed to :meth:`mpl:matplotlib.axes.Axes.plot` or :meth:`bokeh:bokeh.plotting.Figure.line` ax :axes, optional Matplotlib axes or bokeh figures. show : bool, optional Call backend show function. backend : {"matplotlib", "bokeh"}, default "matplotlib" Select plotting backend. backend_kwargs : dict, optional These are kwargs specific to the backend being used, passed to :func:`matplotlib.pyplot.subplots` or :class:`bokeh.plotting.figure`. For additional documentation check the plotting method of the backend. Returns ------- axes : matplotlib_axes or bokeh_figure References ---------- .. [1] Säilynoja, T., Bürkner, P.C. and Vehtari, A., 2021. Graphical Test for Discrete Uniformity and its Applications in Goodness of Fit Evaluation and Multiple Sample Comparison. arXiv preprint arXiv:2103.10522. Examples -------- Plot ecdf plot for a given sample .. plot:: :context: close-figs >>> import arviz as az >>> from scipy.stats import uniform, binom, norm >>> sample = norm(0,1).rvs(1000) >>> az.plot_ecdf(sample) Plot ecdf plot with confidence bands for comparing a given sample w.r.t a given distribution .. plot:: :context: close-figs >>> distribution = norm(0,1) >>> az.plot_ecdf(sample, cdf = distribution.cdf, confidence_bands = True) Plot ecdf-difference plot with confidence bands for comparing a given sample w.r.t a given distribution .. plot:: :context: close-figs >>> az.plot_ecdf(sample, cdf = distribution.cdf, >>> confidence_bands = True, difference = True) Plot ecdf plot with confidence bands for PIT of sample for comparing a given sample w.r.t a given distribution .. plot:: :context: close-figs >>> az.plot_ecdf(sample, cdf = distribution.cdf, >>> confidence_bands = True, pit = True) Plot ecdf-difference plot with confidence bands for PIT of sample for comparing a given sample w.r.t a given distribution .. plot:: :context: close-figs >>> az.plot_ecdf(sample, cdf = distribution.cdf, >>> confidence_bands = True, difference = True, pit = True) You could also plot the above w.r.t another sample rather than a given distribution. For eg: Plot ecdf-difference plot with confidence bands for PIT of sample for comparing a given sample w.r.t a given sample .. plot:: :context: close-figs >>> sample2 = norm(0,1).rvs(5000) >>> az.plot_ecdf(sample, sample2, confidence_bands = True, difference = True, pit = True) """ if confidence_bands is None: confidence_bands = (values2 is not None) or (cdf is not None) if values2 is None and cdf is None and confidence_bands is True: raise ValueError("For confidence bands you need to specify values2 or the cdf") if cdf is not None and values2 is not None: raise ValueError("To compare sample you need either cdf or values2 and not both") if values2 is None and cdf is None and pit is True: raise ValueError("For PIT specify either cdf or values2") if values2 is None and cdf is None and difference is True: raise ValueError("For ECDF difference plot need either cdf or values2") if values2 is not None: values2 = np.ravel(values2) values2.sort() values = np.ravel(values) values.sort() ## This block computes gamma and uses it to get the upper and lower confidence bands ## Here we check if we want confidence bands or not if confidence_bands: ## If plotting PIT then we find the PIT values of sample. ## Basically here we generate the evaluation points(x) and find the PIT values. ## z is the evaluation point for our uniform distribution in compute_gamma() if pit: x = np.linspace(1 / npoints, 1, npoints) z = x ## Finding PIT for our sample probs = cdf(values) if cdf else compute_ecdf(values2, values) / len(values2) else: ## If not PIT use sample for plots and for evaluation points(x) use equally spaced ## points between minimum and maximum of sample ## For z we have used cdf(x) x = np.linspace(values[0], values[-1], npoints) z = cdf(x) if cdf else compute_ecdf(values2, x) probs = values n = len(values) # number of samples ## Computing gamma gamma = fpr if pointwise else compute_gamma(n, z, npoints, num_trials, fpr) ## Using gamma to get the confidence intervals lower, higher = get_lims(gamma, n, z) ## This block is for whether to plot ECDF or ECDF-difference if not difference: ## We store the coordinates of our ecdf in x_coord, y_coord x_coord, y_coord = get_ecdf_points(x, probs, difference) else: ## Here we subtract the ecdf value as here we are plotting the ECDF-difference x_coord, y_coord = get_ecdf_points(x, probs, difference) for i, x_i in enumerate(x): y_coord[i] = y_coord[i] - ( x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i) ) ## Similarly we subtract from the upper and lower bounds if pit: lower = lower - x higher = higher - x else: lower = lower - (cdf(x) if cdf else compute_ecdf(values2, x)) higher = higher - (cdf(x) if cdf else compute_ecdf(values2, x)) else: if pit: x = np.linspace(1 / npoints, 1, npoints) probs = cdf(values) else: x = np.linspace(values[0], values[-1], npoints) probs = values lower, higher = None, None ## This block is for whether to plot ECDF or ECDF-difference if not difference: x_coord, y_coord = get_ecdf_points(x, probs, difference) else: ## Here we subtract the ecdf value as here we are plotting the ECDF-difference x_coord, y_coord = get_ecdf_points(x, probs, difference) for i, x_i in enumerate(x): y_coord[i] = y_coord[i] - ( x_i if pit else cdf(x_i) if cdf else compute_ecdf(values2, x_i) ) ecdf_plot_args = dict( x_coord=x_coord, y_coord=y_coord, x_bands=x, lower=lower, higher=higher, confidence_bands=confidence_bands, figsize=figsize, fill_band=fill_band, plot_kwargs=plot_kwargs, fill_kwargs=fill_kwargs, plot_outline_kwargs=plot_outline_kwargs, ax=ax, show=show, backend_kwargs=backend_kwargs, **kwargs ) if backend is None: backend = rcParams["plot.backend"] backend = backend.lower() plot = get_plotting_function("plot_ecdf", "ecdfplot", backend) ax = plot(**ecdf_plot_args) return ax
def compute_ecdf(sample, z): """Compute ECDF. This function computes the ecdf value at the evaluation point or a sorted set of evaluation points. """ return np.searchsorted(sample, z, side="right") / len(sample) def get_ecdf_points(x, probs, difference): """Compute the coordinates for the ecdf points using compute_ecdf.""" y = compute_ecdf(probs, x) if not difference: x = np.insert(x, 0, x[0]) y = np.insert(y, 0, 0) return x, y def compute_gamma(n, z, npoints=None, num_trials=1000, fpr=0.05): """Compute gamma for confidence interval calculation. This function simulates an adjusted value of gamma to account for multiplicity when forming an 1-fpr level confidence envelope for the ECDF of a sample. """ if npoints is None: npoints = n gamma = [] for _ in range(num_trials): unif_samples = uniform.rvs(0, 1, n) unif_samples = np.sort(unif_samples) gamma_m = 1000 ## Can compute ecdf for all the z together or one at a time. f_z = compute_ecdf(unif_samples, z) f_z = compute_ecdf(unif_samples, z) gamma_m = 2 * min( np.amin(binom.cdf(n * f_z, n, z)), np.amin(1 - binom.cdf(n * f_z - 1, n, z)) ) gamma.append(gamma_m) return np.quantile(gamma, fpr) def get_lims(gamma, n, z): """Compute the simultaneous 1 - fpr level confidence bands.""" lower = binom.ppf(gamma / 2, n, z) upper = binom.ppf(1 - gamma / 2, n, z) return lower / n, upper / n