# Source code for mars.tensor.stats.power_divergence

```# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#
# Unless required by applicable law or agreed to in writing, software
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and

from collections import namedtuple

import numpy as np

try:
from scipy.stats import distributions as sp_distributions
except ImportError:
sp_distributions = None

from ...core import ExecutableTuple
from ...utils import require_not_none
from .. import special
from ..datasource import asarray

# Map from names to lambda_ values used in power_divergence().
_power_div_lambda_names = {
"pearson": 1,
"log-likelihood": 0,
"freeman-tukey": -0.5,
"mod-log-likelihood": -1,
"neyman": -2,
}

def _count(a, axis=None):
if axis is None:
return a.size
else:
return a.shape[axis]

Power_divergenceResult = namedtuple("Power_divergenceResult", ("statistic", "pvalue"))

[docs]@require_not_none(sp_distributions)
def power_divergence(f_obs, f_exp=None, ddof=0, axis=0, lambda_=None):
"""
Cressie-Read power divergence statistic and goodness of fit test.

This function tests the null hypothesis that the categorical data
has the given frequencies, using the Cressie-Read power divergence
statistic.

Parameters
----------
f_obs : array_like
Observed frequencies in each category.
f_exp : array_like, optional
Expected frequencies in each category.  By default the categories are
assumed to be equally likely.
ddof : int, optional
"Delta degrees of freedom": adjustment to the degrees of freedom
for the p-value.  The p-value is computed using a chi-squared
distribution with ``k - 1 - ddof`` degrees of freedom, where `k`
is the number of observed frequencies.  The default value of `ddof`
is 0.
axis : int or None, optional
The axis of the broadcast result of `f_obs` and `f_exp` along which to
apply the test.  If axis is None, all values in `f_obs` are treated
as a single data set.  Default is 0.
lambda_ : float or str, optional
The power in the Cressie-Read power divergence statistic.  The default
is 1.  For convenience, `lambda_` may be assigned one of the following
strings, in which case the corresponding numerical value is used::

String              Value   Description
"pearson"             1     Pearson's chi-squared statistic.
In this case, the function is
equivalent to `stats.chisquare`.
"log-likelihood"      0     Log-likelihood ratio. Also known as
the G-test [3]_.
"freeman-tukey"      -1/2   Freeman-Tukey statistic.
"mod-log-likelihood" -1     Modified log-likelihood ratio.
"neyman"             -2     Neyman's statistic.
"cressie-read"        2/3   The power recommended in [5]_.

Returns
-------
statistic : float or ndarray
The Cressie-Read power divergence test statistic.  The value is
a float if `axis` is None or if` `f_obs` and `f_exp` are 1-D.
pvalue : float or ndarray
The p-value of the test.  The value is a float if `ddof` and the
return value `stat` are scalars.

--------
chisquare

Notes
-----
This test is invalid when the observed or expected frequencies in each
category are too small.  A typical rule is that all of the observed
and expected frequencies should be at least 5.

When `lambda_` is less than zero, the formula for the statistic involves
dividing by `f_obs`, so a warning or error may be generated if any value
in `f_obs` is 0.

Similarly, a warning or error may be generated if any value in `f_exp` is
zero when `lambda_` >= 0.

The default degrees of freedom, k-1, are for the case when no parameters
of the distribution are estimated. If p parameters are estimated by
efficient maximum likelihood then the correct degrees of freedom are
k-1-p. If the parameters are estimated in a different way, then the
dof can be between k-1-p and k-1. However, it is also possible that
the asymptotic distribution is not a chisquare, in which case this
test is not appropriate.

This function handles masked arrays.  If an element of `f_obs` or `f_exp`
is masked, then data at that position is ignored, and does not count
towards the size of the data set.

References
----------
.. [1] Lowry, Richard.  "Concepts and Applications of Inferential
Statistics". Chapter 8.
https://web.archive.org/web/20171015035606/http://faculty.vassar.edu/lowry/ch8pt1.html
.. [2] "Chi-squared test", https://en.wikipedia.org/wiki/Chi-squared_test
.. [3] "G-test", https://en.wikipedia.org/wiki/G-test
.. [4] Sokal, R. R. and Rohlf, F. J. "Biometry: the principles and
practice of statistics in biological research", New York: Freeman
(1981)
.. [5] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
pp. 440-464.

Examples
--------
(See `chisquare` for more examples.)

When just `f_obs` is given, it is assumed that the expected frequencies
are uniform and given by the mean of the observed frequencies.  Here we
perform a G-test (i.e. use the log-likelihood ratio statistic):

>>> import mars.tensor as mt
>>> from mars.tensor.stats import power_divergence
>>> power_divergence([16, 18, 16, 14, 12, 12], lambda_='log-likelihood').execute()
(2.006573162632538, 0.84823476779463769)

The expected frequencies can be given with the `f_exp` argument:

>>> power_divergence([16, 18, 16, 14, 12, 12],
...                  f_exp=[16, 16, 16, 16, 16, 8],
...                  lambda_='log-likelihood').execute()
(3.3281031458963746, 0.6495419288047497)

When `f_obs` is 2-D, by default the test is applied to each column.

>>> obs = mt.array([[16, 18, 16, 14, 12, 12], [32, 24, 16, 28, 20, 24]]).T
>>> obs.shape
(6, 2)
>>> power_divergence(obs, lambda_="log-likelihood").execute()
(array([ 2.00657316,  6.77634498]), array([ 0.84823477,  0.23781225]))

By setting ``axis=None``, the test is applied to all data in the array,
which is equivalent to applying the test to the flattened array.

>>> power_divergence(obs, axis=None).execute()
(23.31034482758621, 0.015975692534127565)
>>> power_divergence(obs.ravel()).execute()
(23.31034482758621, 0.015975692534127565)

`ddof` is the change to make to the default degrees of freedom.

>>> power_divergence([16, 18, 16, 14, 12, 12], ddof=1).execute()
(2.0, 0.73575888234288467)

`f_obs` and `f_exp` are also broadcast.  In the following, `f_obs` has
shape (6,) and `f_exp` has shape (2, 6), so the result of broadcasting
`f_obs` and `f_exp` has shape (2, 6).  To compute the desired chi-squared
statistics, we must use ``axis=1``:

>>> power_divergence([16, 18, 16, 14, 12, 12],
...                  f_exp=[[16, 16, 16, 16, 16, 8],
...                         [8, 20, 20, 16, 12, 12]],
...                  axis=1)
(array([ 3.5 ,  9.25]), array([ 0.62338763,  0.09949846]))
"""
# Convert the input argument `lambda_` to a numerical value.
if isinstance(lambda_, str):
if lambda_ not in _power_div_lambda_names:
names = repr(list(_power_div_lambda_names.keys()))[1:-1]
raise ValueError(
"invalid string for lambda_: {0!r}.  Valid strings "
"are {1}".format(lambda_, names)
)
lambda_ = _power_div_lambda_names[lambda_]
elif lambda_ is None:
lambda_ = 1

f_obs = asarray(f_obs)

if f_exp is not None:
f_exp = asarray(f_exp)
else:
f_exp = f_obs.mean(axis=axis, keepdims=True)

# `terms` is the array of terms that are summed along `axis` to create
# the test statistic.  We use some specialized code for a few special
# cases of lambda_.
if lambda_ == 1:
# Pearson's chi-squared statistic
terms = (f_obs.astype(np.float64) - f_exp) ** 2 / f_exp
elif lambda_ == 0:
# Log-likelihood ratio (i.e. G-test)
terms = 2.0 * special.xlogy(f_obs, f_obs / f_exp)
elif lambda_ == -1:
# Modified log-likelihood ratio
terms = 2.0 * special.xlogy(f_exp, f_exp / f_obs)
else:
terms = f_obs * ((f_obs / f_exp) ** lambda_ - 1)
terms /= 0.5 * lambda_ * (lambda_ + 1)

stat = terms.sum(axis=axis)

num_obs = _count(terms, axis=axis)
# we decide not to support ddof for multiple dimensions
# ddof = asarray(ddof)
p = stat.map_chunk(
sp_distributions.chi2.sf, (num_obs - 1 - ddof,), elementwise=True
)

return ExecutableTuple(Power_divergenceResult(stat, p))
```