Source code for arviz.plots.loopitplot

"""Plot LOO-PIT predictive checks of inference data."""

import numpy as np
from scipy import stats

from ..labels import BaseLabeller
from ..rcparams import rcParams
from ..stats import loo_pit as _loo_pit
from ..stats.density_utils import kde
from .plot_utils import get_plotting_function



[docs]
def plot_loo_pit(
    idata=None,
    y=None,
    y_hat=None,
    log_weights=None,
    ecdf=False,
    ecdf_fill=True,
    n_unif=100,
    use_hdi=False,
    hdi_prob=None,
    figsize=None,
    textsize=None,
    labeller=None,
    color="C0",
    legend=True,
    ax=None,
    plot_kwargs=None,
    plot_unif_kwargs=None,
    hdi_kwargs=None,
    fill_kwargs=None,
    backend=None,
    backend_kwargs=None,
    show=None,
):
    """Plot Leave-One-Out (LOO) probability integral transformation (PIT) predictive checks.

    Parameters
    ----------
    idata : InferenceData
        :class:`arviz.InferenceData` object.
    y : array, DataArray or str
        Observed data. If str, ``idata`` must be present and contain the observed data group
    y_hat : array, DataArray or str
        Posterior predictive samples for ``y``. It must have the same shape as y plus an
        extra dimension at the end of size n_samples (chains and draws stacked). If str or
        None, ``idata`` must contain the posterior predictive group. If None, ``y_hat`` is taken
        equal to y, thus, y must be str too.
    log_weights : array or DataArray
        Smoothed log_weights. It must have the same shape as ``y_hat``
    ecdf : bool, optional
        Plot the difference between the LOO-PIT Empirical Cumulative Distribution Function
        (ECDF) and the uniform CDF instead of LOO-PIT kde.
        In this case, instead of overlaying uniform distributions, the beta ``hdi_prob``
        around the theoretical uniform CDF is shown. This approximation only holds
        for large S and ECDF values not very close to 0 nor 1. For more information, see
        `Vehtari et al. (2019)`, `Appendix G <https://avehtari.github.io/rhat_ess/rhat_ess.html>`_.
    ecdf_fill : bool, optional
        Use :meth:`matplotlib.axes.Axes.fill_between` to mark the area
        inside the credible interval. Otherwise, plot the
        border lines.
    n_unif : int, optional
        Number of datasets to simulate and overlay from the uniform distribution.
    use_hdi : bool, optional
        Compute expected hdi values instead of overlaying the sampled uniform distributions.
    hdi_prob : float, optional
        Probability for the highest density interval. Works with ``use_hdi=True`` or ``ecdf=True``.
    figsize : (float, float), optional
        If None, size is (8 + numvars, 8 + numvars)
    textsize : int, optional
        Text size for labels. If None it will be autoscaled based on ``figsize``.
    labeller : Labeller, optional
        Class providing the method ``make_pp_label`` to generate the labels in the plot titles.
        Read the :ref:`label_guide` for more details and usage examples.
    color : str or array_like, optional
        Color of the LOO-PIT estimated pdf plot. If ``plot_unif_kwargs`` has no "color" key,
        a slightly lighter color than this argument will be used for the uniform kde lines.
        This will ensure that LOO-PIT kde and uniform kde have different default colors.
    legend : bool, optional
        Show the legend of the figure.
    ax : axes, optional
        Matplotlib axes or bokeh figures.
    plot_kwargs : dict, optional
        Additional keywords passed to :meth:`matplotlib.axes.Axes.plot`
        for LOO-PIT line (kde or ECDF)
    plot_unif_kwargs : dict, optional
        Additional keywords passed to :meth:`matplotlib.axes.Axes.plot` for
        overlaid uniform distributions or for beta credible interval
        lines if ``ecdf=True``
    hdi_kwargs : dict, optional
        Additional keywords passed to :meth:`matplotlib.axes.Axes.axhspan`
    fill_kwargs : dict, optional
        Additional kwargs passed to :meth:`matplotlib.axes.Axes.fill_between`
    backend : str, optional
        Select plotting backend {"matplotlib","bokeh"}. Default "matplotlib".
    backend_kwargs : bool, optional
        These are kwargs specific to the backend being used, passed to
        :func:`matplotlib.pyplot.subplots` or
        :func:`bokeh.plotting.figure`. For additional documentation
        check the plotting method of the backend.
    show : bool, optional
        Call backend show function.

    Returns
    -------
    axes : matplotlib_axes or bokeh_figures

    See Also
    --------
    plot_bpv : Plot Bayesian p-value for observed data and Posterior/Prior predictive.
    loo_pit : Compute leave one out (PSIS-LOO) probability integral transform (PIT) values.

    References
    ----------
    * Gabry et al. (2017) see https://arxiv.org/abs/1709.01449
    * https://mc-stan.org/bayesplot/reference/PPC-loo.html
    * Gelman et al. BDA (2014) Section 6.3

    Examples
    --------
    Plot LOO-PIT predictive checks overlaying the KDE of the LOO-PIT values to several
    realizations of uniform variable sampling with the same number of observations.

    .. plot::
        :context: close-figs

        >>> import arviz as az
        >>> idata = az.load_arviz_data("radon")
        >>> az.plot_loo_pit(idata=idata, y="y")

    Fill the area containing the 94% highest density interval of the difference between uniform
    variables empirical CDF and the real uniform CDF. A LOO-PIT ECDF clearly outside of these
    theoretical boundaries indicates that the observations and the posterior predictive
    samples do not follow the same distribution.

    .. plot::
        :context: close-figs

        >>> az.plot_loo_pit(idata=idata, y="y", ecdf=True)

    """
    if ecdf and use_hdi:
        raise ValueError("use_hdi is incompatible with ecdf plot")

    if labeller is None:
        labeller = BaseLabeller()

    loo_pit = _loo_pit(idata=idata, y=y, y_hat=y_hat, log_weights=log_weights)
    loo_pit = loo_pit.flatten() if isinstance(loo_pit, np.ndarray) else loo_pit.values.flatten()

    loo_pit_ecdf = None
    unif_ecdf = None
    p975 = None
    p025 = None
    loo_pit_kde = None
    hdi_odds = None
    unif = None
    x_vals = None

    if hdi_prob is None:
        hdi_prob = rcParams["stats.hdi_prob"]
    elif not 1 >= hdi_prob > 0:
        raise ValueError("The value of hdi_prob should be in the interval (0, 1]")

    if ecdf:
        loo_pit.sort()
        n_data_points = loo_pit.size
        loo_pit_ecdf = np.arange(n_data_points) / n_data_points
        # ideal unnormalized ECDF of uniform distribution with n_data_points points
        # it is used indistinctively as x or p(u<x) because for u~U(0,1) they are equal
        unif_ecdf = np.arange(n_data_points + 1)
        p975 = stats.beta.ppf(0.5 + hdi_prob / 2, unif_ecdf + 1, n_data_points - unif_ecdf + 1)
        p025 = stats.beta.ppf(0.5 - hdi_prob / 2, unif_ecdf + 1, n_data_points - unif_ecdf + 1)
        unif_ecdf = unif_ecdf / n_data_points
    else:
        x_vals, loo_pit_kde = kde(loo_pit)

        unif = np.random.uniform(size=(n_unif, loo_pit.size))
        if use_hdi:
            n_obs = loo_pit.size
            hdi_ = stats.beta(n_obs / 2, n_obs / 2).ppf((1 - hdi_prob) / 2)
            hdi_odds = (hdi_ / (1 - hdi_), (1 - hdi_) / hdi_)

    loo_pit_kwargs = dict(
        ax=ax,
        figsize=figsize,
        ecdf=ecdf,
        loo_pit=loo_pit,
        loo_pit_ecdf=loo_pit_ecdf,
        unif_ecdf=unif_ecdf,
        p975=p975,
        p025=p025,
        fill_kwargs=fill_kwargs,
        ecdf_fill=ecdf_fill,
        use_hdi=use_hdi,
        x_vals=x_vals,
        hdi_kwargs=hdi_kwargs,
        hdi_odds=hdi_odds,
        n_unif=n_unif,
        unif=unif,
        plot_unif_kwargs=plot_unif_kwargs,
        loo_pit_kde=loo_pit_kde,
        textsize=textsize,
        labeller=labeller,
        color=color,
        legend=legend,
        y_hat=y_hat,
        y=y,
        hdi_prob=hdi_prob,
        plot_kwargs=plot_kwargs,
        backend_kwargs=backend_kwargs,
        show=show,
    )

    if backend is None:
        backend = rcParams["plot.backend"]
    backend = backend.lower()

    # TODO: Add backend kwargs
    plot = get_plotting_function("plot_loo_pit", "loopitplot", backend)
    axes = plot(**loo_pit_kwargs)

    return axes