Source code for mars.tensor.stats.ttest

# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections import namedtuple

import numpy as np
from scipy import __version__ as sp_version
from scipy.stats import (
    ttest_ind as sp_ttest_ind,
    ttest_ind_from_stats as sp_ttest_ind_from_stats,
    ttest_rel as sp_ttest_rel,
    ttest_1samp as sp_ttest_1samp,
)
from scipy.stats import distributions as sp_distributions

from ...core import ExecutableTuple
from ...lib.version import parse as parse_version
from ..arithmetic import (
    divide as mt_divide,
    sqrt as mt_sqrt,
    absolute as mt_abs,
    isnan as mt_isnan,
)
from ..base import where as mt_where
from ..reduction import (
    var as mt_var,
    mean as mt_mean,
)
from ..utils import implement_scipy


def _equal_var_ttest_denom(v1, n1, v2, n2):
    df = n1 + n2 - 2.0
    svar = ((n1 - 1) * v1 + (n2 - 1) * v2) / df
    denom = mt_sqrt(svar * (1.0 / n1 + 1.0 / n2))  # XXX: np -> da
    return df, denom


def _unequal_var_ttest_denom(v1, n1, v2, n2):
    vn1 = v1 / n1
    vn2 = v2 / n2
    with np.errstate(divide="ignore", invalid="ignore"):
        df = (vn1 + vn2) ** 2 / (vn1**2 / (n1 - 1) + vn2**2 / (n2 - 1))

    # If df is undefined, variances are zero (assumes n1 > 0 & n2 > 0).
    # Hence it doesn't matter what df is as long as it's not NaN.
    df = mt_where(mt_isnan(df), 1, df)
    denom = mt_sqrt(vn1 + vn2)
    return df, denom


def _ttest_ind_from_stats(mean1, mean2, denom, df, alternative):
    d = mean1 - mean2
    with np.errstate(divide="ignore", invalid="ignore"):
        t = mt_divide(d, denom)
    t, prob = _ttest_finish(df, t, alternative)

    return t, prob


def _ttest_finish(df, t, alternative):
    """Common code between all 3 t-test functions."""
    if alternative != "two-sided" and parse_version(sp_version) < parse_version(
        "1.6.0"
    ):  # pragma: no cover
        raise ValueError("alternative must be 'two-sided' with scipy prior to 1.6.0")

    if alternative == "less":
        prob = t.map_chunk(sp_distributions.t.cdf, args=(df,))
    elif alternative == "greater":
        prob = t.map_chunk(sp_distributions.t.sf, args=(df,))
    elif alternative == "two-sided":
        prob = mt_abs(t).map_chunk(sp_distributions.t.sf, args=(df,)) * 2
    else:
        raise ValueError("alternative must be 'less', 'greater' or 'two-sided'")
    if t.ndim == 0:
        t = t[()]
    return t, prob


Ttest_1sampResult = namedtuple("Ttest_1sampResult", ("statistic", "pvalue"))


[docs]@implement_scipy(sp_ttest_1samp)
def ttest_1samp(a, popmean, axis=0, nan_policy="propagate", alternative="two-sided"):
    if nan_policy != "propagate":
        raise NotImplementedError(
            "`nan_policy` other than 'propagate' have not been implemented."
        )
    n = a.shape[axis]
    df = n - 1

    d = a.mean(axis=axis) - popmean
    v = a.var(axis=axis, ddof=1)
    denom = mt_sqrt(v / float(n))

    with np.errstate(divide="ignore", invalid="ignore"):
        t = mt_divide(d, denom)
    t, prob = _ttest_finish(df, t, alternative)
    return ExecutableTuple(Ttest_1sampResult(t, prob))


Ttest_indResult = namedtuple("Ttest_indResult", ("statistic", "pvalue"))


[docs]@implement_scipy(sp_ttest_ind)
def ttest_ind(a, b, axis=0, equal_var=True, alternative="two-sided"):
    v1 = mt_var(a, axis, ddof=1)
    v2 = mt_var(b, axis, ddof=1)
    n1 = a.shape[axis]
    n2 = b.shape[axis]

    if equal_var:
        df, denom = _equal_var_ttest_denom(v1, n1, v2, n2)
    else:
        df, denom = _unequal_var_ttest_denom(v1, n1, v2, n2)

    res = _ttest_ind_from_stats(
        mt_mean(a, axis), mt_mean(b, axis), denom, df, alternative
    )

    return ExecutableTuple(Ttest_indResult(*res))


[docs]@implement_scipy(sp_ttest_ind_from_stats)
def ttest_ind_from_stats(
    mean1, std1, nobs1, mean2, std2, nobs2, equal_var=True, alternative="two-sided"
):
    if equal_var:
        df, denom = _equal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2)
    else:
        df, denom = _unequal_var_ttest_denom(std1**2, nobs1, std2**2, nobs2)

    res = _ttest_ind_from_stats(mean1, mean2, denom, df, alternative)
    return ExecutableTuple(Ttest_indResult(*res))


Ttest_relResult = namedtuple("Ttest_relResult", ("statistic", "pvalue"))


[docs]@implement_scipy(sp_ttest_rel)
def ttest_rel(a, b, axis=0, nan_policy="propagate", alternative="two-sided"):
    if nan_policy != "propagate":
        raise NotImplementedError(
            "`nan_policy` other than 'propagate' have not been implemented."
        )

    n = a.shape[axis]
    df = float(n - 1)

    d = (a - b).astype(np.float64)
    v = mt_var(d, axis, ddof=1)
    dm = mt_mean(d, axis)
    denom = mt_sqrt(v / float(n))

    with np.errstate(divide="ignore", invalid="ignore"):
        t = mt_divide(dm, denom)
    t, prob = _ttest_finish(df, t, alternative)

    return ExecutableTuple(Ttest_relResult(t, prob))