Source code for mars.tensor.stats.power_divergence

# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple

import numpy as np

try:
    from scipy.stats import distributions as sp_distributions
except ImportError:
    sp_distributions = None

from ...core import ExecutableTuple
from ...utils import require_not_none
from .. import special
from ..datasource import asarray


# Map from names to lambda_ values used in power_divergence().
_power_div_lambda_names = {
    "pearson": 1,
    "log-likelihood": 0,
    "freeman-tukey": -0.5,
    "mod-log-likelihood": -1,
    "neyman": -2,
    "cressie-read": 2 / 3,
}


def _count(a, axis=None):
    if axis is None:
        return a.size
    else:
        return a.shape[axis]


Power_divergenceResult = namedtuple("Power_divergenceResult", ("statistic", "pvalue"))


[docs]@require_not_none(sp_distributions) def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None): """ Cressie-Read power divergence statistic and goodness of fit test. This function tests the null hypothesis that the categorical data has the given frequencies, using the Cressie-Read power divergence statistic. Parameters ---------- f_obs : array_like Observed frequencies in each category. f_exp : array_like, optional Expected frequencies in each category. By default the categories are assumed to be equally likely. ddof : int, optional "Delta degrees of freedom": adjustment to the degrees of freedom for the p-value. The p-value is computed using a chi-squared distribution with ``k - 1 - ddof`` degrees of freedom, where `k` is the number of observed frequencies. The default value of `ddof` is 0. axis : int or None, optional The axis of the broadcast result of `f_obs` and `f_exp` along which to apply the test. If axis is None, all values in `f_obs` are treated as a single data set. Default is 0. lambda_ : float or str, optional The power in the Cressie-Read power divergence statistic. The default is 1. For convenience, `lambda_` may be assigned one of the following strings, in which case the corresponding numerical value is used:: String Value Description "pearson" 1 Pearson's chi-squared statistic. In this case, the function is equivalent to `stats.chisquare`. "log-likelihood" 0 Log-likelihood ratio. Also known as the G-test [3]_. "freeman-tukey" -1/2 Freeman-Tukey statistic. "mod-log-likelihood" -1 Modified log-likelihood ratio. "neyman" -2 Neyman's statistic. "cressie-read" 2/3 The power recommended in [5]_. Returns ------- statistic : float or ndarray The Cressie-Read power divergence test statistic. The value is a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D. pvalue : float or ndarray The p-value of the test. The value is a float if `ddof` and the return value `stat` are scalars. See Also -------- chisquare Notes ----- This test is invalid when the observed or expected frequencies in each category are too small. A typical rule is that all of the observed and expected frequencies should be at least 5. When `lambda_` is less than zero, the formula for the statistic involves dividing by `f_obs`, so a warning or error may be generated if any value in `f_obs` is 0. Similarly, a warning or error may be generated if any value in `f_exp` is zero when `lambda_` >= 0. The default degrees of freedom, k-1, are for the case when no parameters of the distribution are estimated. If p parameters are estimated by efficient maximum likelihood then the correct degrees of freedom are k-1-p. If the parameters are estimated in a different way, then the dof can be between k-1-p and k-1. However, it is also possible that the asymptotic distribution is not a chisquare, in which case this test is not appropriate. This function handles masked arrays. If an element of `f_obs` or `f_exp` is masked, then data at that position is ignored, and does not count towards the size of the data set. .. versionadded:: 0.13.0 References ---------- .. [1] Lowry, Richard. "Concepts and Applications of Inferential Statistics". Chapter 8. https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html .. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test .. [3] "G-test", https://en.wikipedia.org/wiki/G-test .. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and practice of statistics in biological research", New York: Freeman (1981) .. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984), pp. 440-464. Examples -------- (See `chisquare` for more examples.) When just `f_obs` is given, it is assumed that the expected frequencies are uniform and given by the mean of the observed frequencies. Here we perform a G-test (i.e. use the log-likelihood ratio statistic): >>> import mars.tensor as mt >>> from mars.tensor.stats import power_divergence >>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood').execute() (2.006573162632538, 0.84823476779463769) The expected frequencies can be given with the `f_exp` argument: >>> power_divergence([16, 18, 16, 14, 12, 12], ... f_exp=[16, 16, 16, 16, 16, 8], ... lambda_='log-likelihood').execute() (3.3281031458963746, 0.6495419288047497) When `f_obs` is 2-D, by default the test is applied to each column. >>> obs = mt.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T >>> obs.shape (6, 2) >>> power_divergence(obs, lambda_="log-likelihood").execute() (array([ 2.00657316, 6.77634498]), array([ 0.84823477, 0.23781225])) By setting ``axis=None``, the test is applied to all data in the array, which is equivalent to applying the test to the flattened array. >>> power_divergence(obs, axis=None).execute() (23.31034482758621, 0.015975692534127565) >>> power_divergence(obs.ravel()).execute() (23.31034482758621, 0.015975692534127565) `ddof` is the change to make to the default degrees of freedom. >>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1).execute() (2.0, 0.73575888234288467) `f_obs` and `f_exp` are also broadcast. In the following, `f_obs` has shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting `f_obs` and `f_exp` has shape (2, 6). To compute the desired chi-squared statistics, we must use ``axis=1``: >>> power_divergence([16, 18, 16, 14, 12, 12], ... f_exp=[[16, 16, 16, 16, 16, 8], ... [8, 20, 20, 16, 12, 12]], ... axis=1) (array([ 3.5 , 9.25]), array([ 0.62338763, 0.09949846])) """ # Convert the input argument `lambda_` to a numerical value. if isinstance(lambda_, str): if lambda_ not in _power_div_lambda_names: names = repr(list(_power_div_lambda_names.keys()))[1:-1] raise ValueError( "invalid string for lambda_: {0!r}. Valid strings " "are {1}".format(lambda_, names) ) lambda_ = _power_div_lambda_names[lambda_] elif lambda_ is None: lambda_ = 1 f_obs = asarray(f_obs) if f_exp is not None: f_exp = asarray(f_exp) else: f_exp = f_obs.mean(axis=axis, keepdims=True) # `terms` is the array of terms that are summed along `axis` to create # the test statistic. We use some specialized code for a few special # cases of lambda_. if lambda_ == 1: # Pearson's chi-squared statistic terms = (f_obs.astype(np.float64) - f_exp) ** 2 / f_exp elif lambda_ == 0: # Log-likelihood ratio (i.e. G-test) terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp) elif lambda_ == -1: # Modified log-likelihood ratio terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs) else: # General Cressie-Read power divergence. terms = f_obs * ((f_obs / f_exp) ** lambda_ - 1) terms /= 0.5 * lambda_ * (lambda_ + 1) stat = terms.sum(axis=axis) num_obs = _count(terms, axis=axis) # we decide not to support ddof for multiple dimensions # ddof = asarray(ddof) p = stat.map_chunk( sp_distributions.chi2.sf, (num_obs - 1 - ddof,), elementwise=True ) return ExecutableTuple(Power_divergenceResult(stat, p))