Source code for mars.tensor.stats.ks

# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import math
import warnings
from math import gcd
from collections import namedtuple
from typing import Callable, Tuple, Union

import numpy as np
from scipy import special
from scipy.stats import distributions

from ... import tensor as mt
from ...core import ExecutableTuple
from ...typing import TileableType


KstestResult = namedtuple("KstestResult", ("statistic", "pvalue"))
Ks_2sampResult = KstestResult


def _compute_prob_inside_method(m, n, g, h):  # pragma: no cover
    """
    Compute the proportion of paths that stay strictly inside two diagonal lines.

    Parameters
    ----------
    m : integer
        m > 0
    n : integer
        n > 0
    g : integer
        g is greatest common divisor of m and n
    h : integer
        0 <= h <= lcm(m,n)

    Returns
    -------
    p : float
        The proportion of paths that stay inside the two lines.


    Count the integer lattice paths from (0, 0) to (m, n) which satisfy
    |x/m - y/n| < h / lcm(m, n).
    The paths make steps of size +1 in either positive x or positive y directions.

    We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
    Hodges, J.L. Jr.,
    "The Significance Probability of the Smirnov Two-Sample Test,"
    Arkiv för Matematik, 3, No. 43 (1958), 469-86.

    """
    # Probability is symmetrical in m, n.  Computation below uses m >= n.
    if m < n:
        m, n = n, m
    mg = m // g
    ng = n // g

    # Count the integer lattice paths from (0, 0) to (m, n) which satisfy
    # |nx/g - my/g| < h.
    # Compute matrix A such that:
    #  A(x, 0) = A(0, y) = 1
    #  A(x, y) = A(x, y-1) + A(x-1, y), for x,y>=1, except that
    #  A(x, y) = 0 if |x/m - y/n|>= h
    # Probability is A(m, n)/binom(m+n, n)
    # Optimizations exist for m==n, m==n*p.
    # Only need to preserve a single column of A, and only a sliding window of it.
    # minj keeps track of the slide.
    minj, maxj = 0, min(int(np.ceil(h / mg)), n + 1)
    curlen = maxj - minj
    # Make a vector long enough to hold maximum window needed.
    lenA = min(2 * maxj + 2, n + 1)
    # This is an integer calculation, but the entries are essentially
    # binomial coefficients, hence grow quickly.
    # Scaling after each column is computed avoids dividing by a
    # large binomial coefficient at the end, but is not sufficient to avoid
    # the large dynamic range which appears during the calculation.
    # Instead we rescale based on the magnitude of the right most term in
    # the column and keep track of an exponent separately and apply
    # it at the end of the calculation.  Similarly when multiplying by
    # the binomial coefficient
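    # (For reference: math.frexp(v) returns (mantissa, exp) with v == mantissa * 2**exp,
    # and np.ldexp(a, -e) multiplies by 2**-e, so `expnt` accumulates the powers of two
    # factored out here and reapplied once at the very end.)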
    dtype = np.float64
    A = np.zeros(lenA, dtype=dtype)
    # Initialize the first column
    A[minj:maxj] = 1
    expnt = 0
    for i in range(1, m + 1):
        # Generate the next column.
        # First calculate the sliding window
        lastminj, lastlen = minj, curlen
        minj = max(int(np.floor((ng * i - h) / mg)) + 1, 0)
        minj = min(minj, n)
        maxj = min(int(np.ceil((ng * i + h) / mg)), n + 1)
        if maxj <= minj:
            return 0
        # Now fill in the values
        A[0 : maxj - minj] = np.cumsum(A[minj - lastminj : maxj - lastminj])
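        # The cumulative sum realizes the recurrence A(x, y) = A(x, y-1) + A(x-1, y)
        # for the new column: entries below the window are zero, so each new value is
        # the running sum of the previous column over the sliding window.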
        curlen = maxj - minj
        if lastlen > curlen:
            # Set some carried-over elements to 0
            A[maxj - minj : maxj - minj + (lastlen - curlen)] = 0
        # Rescale if the right most value is over 2**900
        val = A[maxj - minj - 1]
        _, valexpt = math.frexp(val)
        if valexpt > 900:
            # Scaling to bring down to about 2**800 appears
            # sufficient for sizes under 10000.
            valexpt -= 800
            A = np.ldexp(A, -valexpt)
            expnt += valexpt

    val = A[maxj - minj - 1]
    # Now divide by the binomial (m+n)!/m!/n!
    for i in range(1, n + 1):
        val = (val * i) / (m + i)
        _, valexpt = math.frexp(val)
        if valexpt < -128:
            val = np.ldexp(val, -valexpt)
            expnt += valexpt
    # Finally scale if needed.
    return np.ldexp(val, expnt)


def _compute_prob_outside_square(n, h):  # pragma: no cover
    """
    Compute the proportion of paths that pass outside the two diagonal lines.

    Parameters
    ----------
    n : integer
        n > 0
    h : integer
        0 <= h <= n

    Returns
    -------
    p : float
        The proportion of paths that pass outside the lines x-y = +/-h.

    """
    # Compute Pr(D_{n,n} >= h/n)
    # Prob = 2 * ( binom(2n, n-h) - binom(2n, n-2h) + binom(2n, n-3h) - ... )  / binom(2n, n)
    # This formulation exhibits subtractive cancellation.
    # Instead divide each term by binom(2n, n), then factor common terms
    # and use a Horner-like algorithm
    # P = 2 * A0 * (1 - A1*(1 - A2*(1 - A3*(1 - A4*(...)))))
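    # Each ratio Ak equals binom(2n, n-(k+1)*h) / binom(2n, n-k*h); the inner loop
    # below evaluates it factor by factor so no huge binomial coefficient is formed.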

    P = 0.0
    k = int(np.floor(n / h))
    while k >= 0:
        p1 = 1.0
        # Each of the Ai terms has numerator and denominator with h simple terms.
        for j in range(h):
            p1 = (n - k * h - j) * p1 / (n + k * h + j + 1)
        P = p1 * (1.0 - P)
        k -= 1
    return 2 * P


def _count_paths_outside_method(m, n, g, h):  # pragma: no cover
    """
    Count the number of paths that pass outside the specified diagonal.

    Parameters
    ----------
    m : integer
        m > 0
    n : integer
        n > 0
    g : integer
        g is greatest common divisor of m and n
    h : integer
        0 <= h <= lcm(m,n)

    Returns
    -------
    p : float
        The number of paths that dip below the specified diagonal.
        The calculation may overflow - check for a finite answer.

    Raises
    ------
    FloatingPointError: Raised if the intermediate computation goes outside
    the range of a float.

    Notes
    -----
    Count the integer lattice paths from (0, 0) to (m, n), which at some
    point (x, y) along the path, satisfy:
      m*y <= n*x - h*g
    The paths make steps of size +1 in either positive x or positive y directions.

    We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk.
    Hodges, J.L. Jr.,
    "The Significance Probability of the Smirnov Two-Sample Test,"
    Arkiv för Matematik, 3, No. 43 (1958), 469-86.

    """
    # Compute #paths which stay lower than x/m-y/n = h/lcm(m,n)
    # B(x, y) = #{paths from (0,0) to (x,y) without previously crossing the boundary}
    #         = binom(x, y) - #{paths which already reached the boundary}
    # Multiply by the number of path extensions going from (x, y) to (m, n)
    # Sum.

    # Probability is symmetrical in m, n.  Computation below assumes m >= n.
    if m < n:
        m, n = n, m
    mg = m // g
    ng = n // g

    # Not every x needs to be considered.
    # xj holds the list of x values to be checked.
    # Wherever n*x/m + ng*h crosses an integer
    lxj = n + (mg - h) // mg
    xj = [(h + mg * j + ng - 1) // ng for j in range(lxj)]
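    # xj[j] = ceil((h + mg*j) / ng) is the smallest x for which the point (x, j)
    # already satisfies m*y <= n*x - h*g, i.e. lies on or below the boundary.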
    # B is an array just holding a few values of B(x,y), the ones needed.
    # B[j] == B(x_j, j)
    if lxj == 0:
        return np.round(special.binom(m + n, n))
    B = np.zeros(lxj)
    B[0] = 1
    # Compute the B(x, y) terms
    # The binomial coefficient is an integer, but special.binom() may return a float.
    # Round it to the nearest integer.
    for j in range(1, lxj):
        Bj = np.round(special.binom(xj[j] + j, j))
        if not np.isfinite(Bj):
            raise FloatingPointError()
        for i in range(j):
            bin = np.round(
                special.binom(xj[j] - xj[i] + j - i, j - i)
            )  # pylint: disable=redefined-builtin
            Bj -= bin * B[i]
        B[j] = Bj
        if not np.isfinite(Bj):
            raise FloatingPointError()
    # Compute the number of path extensions...
    num_paths = 0
    for j in range(lxj):
        bin = np.round(special.binom((m - xj[j]) + (n - j), n - j))
        term = B[j] * bin
        if not np.isfinite(term):
            raise FloatingPointError()
        num_paths += term
    return np.round(num_paths)


def _attempt_exact_2kssamp(n1, n2, g, d, alternative):  # pragma: no cover
    """Attempts to compute the exact 2sample probability.

    n1, n2 are the sample sizes
    g is the gcd(n1, n2)
    d is the computed max difference in ECDFs

    Returns (success, d, probability)
    """
    lcm = (n1 // g) * n2
    h = int(np.round(d * lcm))
    d = h * 1.0 / lcm
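    # d has been snapped to the nearest multiple of 1/lcm(n1, n2); h is that same
    # distance expressed in lattice steps, which is what the path counting expects.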
    if h == 0:
        return True, d, 1.0
    saw_fp_error, prob = False, np.nan
    try:
        if alternative == "two-sided":
            if n1 == n2:
                prob = _compute_prob_outside_square(n1, h)
            else:
                prob = 1 - _compute_prob_inside_method(n1, n2, g, h)
        else:
            if n1 == n2:
                # prob = binom(2n, n-h) / binom(2n, n)
                # Evaluating in that form incurs roundoff errors
                # from special.binom. Instead calculate directly
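                # using binom(2n, n-h) / binom(2n, n) == prod_{j=0..h-1} (n-j) / (n+j+1)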
                jrange = np.arange(h)
                prob = np.prod((n1 - jrange) / (n1 + jrange + 1.0))
            else:
                num_paths = _count_paths_outside_method(n1, n2, g, h)
                bin = special.binom(n1 + n2, n1)  # pylint: disable=redefined-builtin
                if (
                    not np.isfinite(bin)
                    or not np.isfinite(num_paths)
                    or num_paths > bin
                ):
                    saw_fp_error = True
                else:
                    prob = num_paths / bin

    except FloatingPointError:
        saw_fp_error = True

    if saw_fp_error:
        return False, d, np.nan
    if not (0 <= prob <= 1):
        return False, d, prob
    return True, d, prob


def _calc_prob_2samp(d, n1, n2, alternative, mode):  # pragma: no cover
    MAX_AUTO_N = 10000  # 'auto' will attempt to be exact if n1,n2 <= MAX_AUTO_N

    g = gcd(n1, n2)
    n1g = n1 // g
    n2g = n2 // g
    prob = -mt.inf
    original_mode = mode
    if mode == "auto":
        mode = "exact" if max(n1, n2) <= MAX_AUTO_N else "asymp"
    elif mode == "exact":
        # If lcm(n1, n2) is too big, switch from exact to asymp
        if n1g >= np.iinfo(np.int_).max / n2g:
            mode = "asymp"
            warnings.warn(
                f"Exact ks_2samp calculation not possible with samples sizes "
                f"{n1} and {n2}. Switching to 'asymp'.",
                RuntimeWarning,
            )

    if mode == "exact":
        success, d, prob = _attempt_exact_2kssamp(n1, n2, g, d, alternative)
        if not success:
            mode = "asymp"
            if original_mode == "exact":
                warnings.warn(
                    f"ks_2samp: Exact calculation unsuccessful. "
                    f"Switching to mode={mode}.",
                    RuntimeWarning,
                )

    if mode == "asymp":
        # The product n1*n2 is large.  Use Smirnov's asymptotic formula.
        # Ensure float to avoid overflow in multiplication
        # sorted because the one-sided formula is not symmetric in n1, n2
        m, n = sorted([float(n1), float(n2)], reverse=True)
        en = m * n / (m + n)
        if alternative == "two-sided":
            prob = distributions.kstwo.sf(d, np.round(en))
        else:
            z = np.sqrt(en) * d
            # Use Hodges' suggested approximation Eqn 5.3
            # Requires m to be the larger of (n1, n2)
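            # i.e. prob ~= exp(-2*z**2 - 2*z*(m + 2*n) / (3*sqrt(m*n*(m + n))))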
            expt = -2 * z**2 - 2 * z * (m + 2 * n) / np.sqrt(m * n * (m + n)) / 3.0
            prob = np.exp(expt)

    return np.clip(prob, 0, 1)


def _compute_dplus(cdfvals, n):
    """Computes D+ as used in the Kolmogorov-Smirnov test.

    Parameters
    ----------
    cdfvals : array_like
      Sorted array of CDF values between 0 and 1
    n : int
      Number of observations

    Returns
    -------
      Maximum distance of the CDF values below Uniform(0, 1)
    """
    return (mt.arange(1.0, n + 1) / n - cdfvals).max()


def _compute_dminus(cdfvals, n):
    """Computes D- as used in the Kolmogorov-Smirnov test.

    Parameters
    ----------
    cdfvals : array_like
      Sorted array of CDF values between 0 and 1
    n : int
      Number of observations

    Returns
    -------
      Maximum distance of the CDF values above Uniform(0, 1)
    """
    return (cdfvals - mt.arange(0.0, n) / n).max()


def ks_1samp(
    x: Union[np.ndarray, list, TileableType],
    cdf: Callable,
    args: Tuple = (),
    alternative: str = "two-sided",
    mode: str = "auto",
):
    """
    Performs the one-sample Kolmogorov-Smirnov test for goodness of fit.

    This test compares the underlying distribution F(x) of a sample
    against a given continuous distribution G(x). See Notes for a description
    of the available null and alternative hypotheses.

    Parameters
    ----------
    x : array_like
        a 1-D array of observations of iid random variables.
    cdf : callable
        callable used to calculate the cdf.
    args : tuple, sequence, optional
        Distribution parameters, used with `cdf`.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the null and alternative hypotheses. Default is 'two-sided'.
        Please see explanations in the Notes below.
    mode : {'auto', 'exact', 'approx', 'asymp'}, optional
        Defines the distribution used for calculating the p-value.
        The following options are available (default is 'auto'):

          * 'auto' : selects one of the other options.
          * 'exact' : uses the exact distribution of test statistic.
          * 'approx' : approximates the two-sided probability with twice
            the one-sided probability
          * 'asymp': uses asymptotic distribution of test statistic

    Returns
    -------
    statistic : float
        KS test statistic, either D, D+ or D- (depending on the value
        of 'alternative')
    pvalue : float
        One-tailed or two-tailed p-value.

    See Also
    --------
    ks_2samp, kstest

    Notes
    -----
    There are three options for the null and corresponding alternative
    hypothesis that can be selected using the `alternative` parameter.

    - `two-sided`: The null hypothesis is that the two distributions are
      identical, F(x)=G(x) for all x; the alternative is that they are not
      identical.

    - `less`: The null hypothesis is that F(x) >= G(x) for all x; the
      alternative is that F(x) < G(x) for at least one x.

    - `greater`: The null hypothesis is that F(x) <= G(x) for all x; the
      alternative is that F(x) > G(x) for at least one x.

    Note that the alternative hypotheses describe the *CDFs* of the
    underlying distributions, not the observed values. For example,
    suppose x1 ~ F and x2 ~ G. If F(x) > G(x) for all x, the values in
    x1 tend to be less than those in x2.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import mars.tensor as mt
    >>> from mars.tensor.stats import ks_1samp

    >>> np.random.seed(12345678) #fix random seed to get the same result
    >>> x = mt.linspace(-15, 15, 9, chunk_size=5)
    >>> ks_1samp(x, stats.norm.cdf).execute()
    (0.44435602715924361, 0.038850142705171065)

    >>> ks_1samp(stats.norm.rvs(size=100), stats.norm.cdf).execute()
    KstestResult(statistic=0.165471391799..., pvalue=0.007331283245...)

    *Test against one-sided alternative hypothesis*

    Shift distribution to larger values, so that ``CDF(x) < norm.cdf(x)``:

    >>> x = stats.norm.rvs(loc=0.2, size=100)
    >>> ks_1samp(x, stats.norm.cdf, alternative='less').execute()
    KstestResult(statistic=0.235488541678..., pvalue=1.158315030683...)

    Reject null hypothesis in favor of alternative hypothesis: less

    >>> ks_1samp(x, stats.norm.cdf, alternative='greater').execute()
    KstestResult(statistic=0.010167165616..., pvalue=0.972494973653...)

    Don't reject null hypothesis in favor of alternative hypothesis: greater

    >>> ks_1samp(x, stats.norm.cdf).execute()
    KstestResult(statistic=0.235488541678..., pvalue=2.316630061366...)

    Don't reject null hypothesis in favor of alternative hypothesis: two-sided

    *Testing t distributed random variables against normal distribution*

    With 100 degrees of freedom the t distribution looks close to the normal
    distribution, and the K-S test does not reject the hypothesis that the
    sample came from the normal distribution:

    >>> ks_1samp(stats.t.rvs(100, size=100), stats.norm.cdf).execute()
    KstestResult(statistic=0.077844250253..., pvalue=0.553155412513...)

    With 3 degrees of freedom the t distribution looks sufficiently different
    from the normal distribution, that we can reject the hypothesis that the
    sample came from the normal distribution at the 10% level:

    >>> ks_1samp(stats.t.rvs(3, size=100), stats.norm.cdf).execute()
    KstestResult(statistic=0.118967105356..., pvalue=0.108627114578...)
    """
    alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get(
        alternative.lower()[0], alternative
    )
    if alternative not in ["two-sided", "greater", "less"]:
        raise ValueError("Unexpected alternative %s" % alternative)

    x = mt.asarray(x)
    N = x.shape[0]
    x = mt.sort(x)
    cdfvals = x.map_chunk(cdf, args=args, elementwise=True)

    if alternative == "greater":
        Dplus = _compute_dplus(cdfvals, N)
        return ExecutableTuple(
            KstestResult(Dplus, Dplus.map_chunk(distributions.ksone.sf, args=(N,)))
        )
    if alternative == "less":
        Dminus = _compute_dminus(cdfvals, N)
        return ExecutableTuple(
            KstestResult(Dminus, Dminus.map_chunk(distributions.ksone.sf, args=(N,)))
        )

    # alternative == 'two-sided':
    Dplus = _compute_dplus(cdfvals, N)
    Dminus = _compute_dminus(cdfvals, N)
    D = mt.stack([Dplus, Dminus]).max()
    if mode == "auto":  # Always select exact
        mode = "exact"
    if mode == "exact":
        prob = D.map_chunk(distributions.kstwo.sf, args=(N,), elementwise=True)
    elif mode == "asymp":
        prob = (D * np.sqrt(N)).map_chunk(distributions.kstwobign.sf, elementwise=True)
    else:
        # mode == 'approx'
        prob = 2 * D.map_chunk(distributions.ksone.sf, args=(N,), elementwise=True)
    prob = mt.clip(prob, 0, 1)
    return ExecutableTuple(KstestResult(D, prob))
def ks_2samp(
    data1: Union[np.ndarray, list, TileableType],
    data2: Union[np.ndarray, list, TileableType],
    alternative: str = "two-sided",
    mode: str = "auto",
):
    """
    Compute the Kolmogorov-Smirnov statistic on 2 samples.

    This is a two-sided test for the null hypothesis that 2 independent samples
    are drawn from the same continuous distribution. The alternative hypothesis
    can be either 'two-sided' (default), 'less' or 'greater'.

    Parameters
    ----------
    data1, data2 : array_like, 1-Dimensional
        Two arrays of sample observations assumed to be drawn from a continuous
        distribution, sample sizes can be different.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis.
        The following options are available (default is 'two-sided'):

          * 'two-sided'
          * 'less': one-sided, see explanation in Notes
          * 'greater': one-sided, see explanation in Notes
    mode : {'auto', 'exact', 'asymp'}, optional
        Defines the method used for calculating the p-value.
        The following options are available (default is 'auto'):

          * 'auto' : use 'exact' for small size arrays, 'asymp' for large
          * 'exact' : use exact distribution of test statistic
          * 'asymp' : use asymptotic distribution of test statistic

    Returns
    -------
    statistic : float
        KS statistic.
    pvalue : float
        Two-tailed p-value.

    See Also
    --------
    kstest, ks_1samp, epps_singleton_2samp, anderson_ksamp

    Notes
    -----
    This tests whether 2 samples are drawn from the same distribution. Note
    that, like in the case of the one-sample KS test, the distribution is
    assumed to be continuous.

    In the one-sided test, the alternative is that the empirical
    cumulative distribution function F(x) of the data1 variable is "less"
    or "greater" than the empirical cumulative distribution function G(x)
    of the data2 variable, ``F(x)<=G(x)``, resp. ``F(x)>=G(x)``.

    If the KS statistic is small or the p-value is high, then we cannot
    reject the hypothesis that the distributions of the two samples
    are the same.

    If the mode is 'auto', the computation is exact if the sample sizes are
    less than 10000. For larger sizes, the computation uses the
    Kolmogorov-Smirnov distributions to compute an approximate value.

    The 'two-sided' 'exact' computation computes the complementary probability
    and then subtracts from 1. As such, the minimum probability it can return
    is about 1e-16. While the algorithm itself is exact, numerical
    errors may accumulate for large sample sizes. It is most suited to
    situations in which one of the sample sizes is only a few thousand.

    We generally follow Hodges' treatment of Drion/Gnedenko/Korolyuk [1]_.

    References
    ----------
    .. [1] Hodges, J.L. Jr., "The Significance Probability of the Smirnov
       Two-Sample Test," Arkiv för Matematik, 3, No. 43 (1958), 469-86.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import mars.tensor as mt
    >>> from mars.tensor.stats import ks_2samp

    >>> np.random.seed(12345678) #fix random seed to get the same result
    >>> n1 = 200  # size of first sample
    >>> n2 = 300  # size of second sample

    For a different distribution, we can reject the null hypothesis since the
    pvalue is below 1%:

    >>> rvs1 = stats.norm.rvs(size=n1, loc=0., scale=1)
    >>> rvs2 = stats.norm.rvs(size=n2, loc=0.5, scale=1.5)
    >>> ks_2samp(rvs1, rvs2).execute()
    KstestResult(statistic=0.20833333333333337, pvalue=5.1292795978041816e-05)

    For a slightly different distribution, we cannot reject the null hypothesis
    at a 10% or lower alpha since the p-value at 0.144 is higher than 10%:

    >>> rvs3 = stats.norm.rvs(size=n2, loc=0.01, scale=1.0)
    >>> ks_2samp(rvs1, rvs3).execute()
    KstestResult(statistic=0.10333333333333333, pvalue=0.14691437867433788)

    For an identical distribution, we cannot reject the null hypothesis since
    the p-value is high, 41%:

    >>> rvs4 = stats.norm.rvs(size=n2, loc=0.0, scale=1.0)
    >>> ks_2samp(rvs1, rvs4).execute()
    KstestResult(statistic=0.07999999999999996, pvalue=0.4115432028915931)
    """
    if mode not in ["auto", "exact", "asymp"]:
        raise ValueError(f"Invalid value for mode: {mode}")
    alternative = {"t": "two-sided", "g": "greater", "l": "less"}.get(
        alternative.lower()[0], alternative
    )
    if alternative not in ["two-sided", "less", "greater"]:
        raise ValueError(f"Invalid value for alternative: {alternative}")

    data1 = mt.asarray(data1)
    data2 = mt.asarray(data2)
    data1 = mt.sort(data1)
    data2 = mt.sort(data2)
    n1 = data1.shape[0]
    n2 = data2.shape[0]
    if min(n1, n2) == 0:
        raise ValueError("Data passed to ks_2samp must not be empty")

    data_all = mt.concatenate([data1, data2])
    # using searchsorted solves equal data problem
    cdf1 = mt.searchsorted(data1, data_all, side="right") / n1
    cdf2 = mt.searchsorted(data2, data_all, side="right") / n2
    cddiffs = cdf1 - cdf2
    minS = mt.clip(-mt.min(cddiffs), 0, 1)  # Ensure sign of minS is not negative.
    maxS = mt.max(cddiffs)
    alt2Dvalue = {"less": minS, "greater": maxS, "two-sided": mt.maximum(minS, maxS)}
    d = alt2Dvalue[alternative]
    prob = d.map_chunk(
        _calc_prob_2samp,
        args=(n1, n2, alternative, mode),
        elementwise=True,
        dtype=d.dtype,
    )
    return ExecutableTuple(Ks_2sampResult(d, prob))