Source code for mars.tensor.random.standard_t

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 1999-2020 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

from ... import opcodes as OperandDef
from ...serialize import AnyField
from .core import TensorRandomOperandMixin, handle_array, TensorDistribution


class TensorStandardT(TensorDistribution, TensorRandomOperandMixin):
    __slots__ = '_df', '_size'
    _input_fields_ = ['_df']
    _op_type_ = OperandDef.RAND_STANDARD_T

    _df = AnyField('df')
    _func_name = 'standard_t'

    def __init__(self, size=None, state=None, dtype=None, gpu=None, **kw):
        dtype = np.dtype(dtype) if dtype is not None else dtype
        super().__init__(_size=size, _state=state, _dtype=dtype, _gpu=gpu, **kw)

    @property
    def df(self):
        return self._df

    def __call__(self, df, chunk_size=None):
        return self.new_tensor([df], None, raw_chunk_size=chunk_size)


[docs]def standard_t(random_state, df, size=None, chunk_size=None, gpu=None, dtype=None):
    r"""
    Draw samples from a standard Student's t distribution with `df` degrees
    of freedom.

    A special case of the hyperbolic distribution.  As `df` gets
    large, the result resembles that of the standard normal
    distribution (`standard_normal`).

    Parameters
    ----------
    df : float or array_like of floats
        Degrees of freedom, should be > 0.
    size : int or tuple of ints, optional
        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
        ``m * n * k`` samples are drawn.  If size is ``None`` (default),
        a single value is returned if ``df`` is a scalar.  Otherwise,
        ``mt.array(df).size`` samples are drawn.
    chunk_size : int or tuple of int or tuple of ints, optional
        Desired chunk size on each dimension
    gpu : bool, optional
        Allocate the tensor on GPU if True, False as default
    dtype : data-type, optional
      Data-type of the returned tensor.

    Returns
    -------
    out : Tensor or scalar
        Drawn samples from the parameterized standard Student's t distribution.

    Notes
    -----
    The probability density function for the t distribution is

    .. math:: P(x, df) = \frac{\Gamma(\frac{df+1}{2})}{\sqrt{\pi df}
              \Gamma(\frac{df}{2})}\Bigl( 1+\frac{x^2}{df} \Bigr)^{-(df+1)/2}

    The t test is based on an assumption that the data come from a
    Normal distribution. The t test provides a way to test whether
    the sample mean (that is the mean calculated from the data) is
    a good estimate of the true mean.

    The derivation of the t-distribution was first published in
    1908 by William Gosset while working for the Guinness Brewery
    in Dublin. Due to proprietary issues, he had to publish under
    a pseudonym, and so he used the name Student.

    References
    ----------
    .. [1] Dalgaard, Peter, "Introductory Statistics With R",
           Springer, 2002.
    .. [2] Wikipedia, "Student's t-distribution"
           http://en.wikipedia.org/wiki/Student's_t-distribution

    Examples
    --------
    From Dalgaard page 83 [1]_, suppose the daily energy intake for 11
    women in Kj is:

    >>> import mars.tensor as mt

    >>> intake = mt.array([5260., 5470, 5640, 6180, 6390, 6515, 6805, 7515, \
    ...                    7515, 8230, 8770])

    Does their energy intake deviate systematically from the recommended
    value of 7725 kJ?

    We have 10 degrees of freedom, so is the sample mean within 95% of the
    recommended value?

    >>> s = mt.random.standard_t(10, size=100000)
    >>> mt.mean(intake).execute()
    6753.636363636364
    >>> intake.std(ddof=1).execute()
    1142.1232221373727

    Calculate the t statistic, setting the ddof parameter to the unbiased
    value so the divisor in the standard deviation will be degrees of
    freedom, N-1.

    >>> t = (mt.mean(intake)-7725)/(intake.std(ddof=1)/mt.sqrt(len(intake)))
    >>> import matplotlib.pyplot as plt
    >>> h = plt.hist(s.execute(), bins=100, normed=True)

    For a one-sided t-test, how far out in the distribution does the t
    statistic appear?

    >>> (mt.sum(s<t) / float(len(s))).execute()
    0.0090699999999999999  #random

    So the p-value is about 0.009, which says the null hypothesis has a
    probability of about 99% of being true.
    """
    if dtype is None:
        dtype = np.random.RandomState().standard_t(
            handle_array(df), size=(0,)).dtype
    size = random_state._handle_size(size)
    op = TensorStandardT(size=size, state=random_state.to_numpy(), gpu=gpu, dtype=dtype)
    return op(df, chunk_size=chunk_size)