Source code for mars.tensor.random.choice

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 1999-2020 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from numbers import Integral

import numpy as np

from ... import opcodes as OperandDef
from ...config import options
from ...serialize import ValueType, AnyField, KeyField, BoolField, TupleField
from ...tiles import TilesError
from ...utils import check_chunks_unknown_shape, ceildiv, recursive_tile
from ..operands import TensorOperandMixin
from ..core import TENSOR_TYPE, TENSOR_CHUNK_TYPE, TensorOrder
from ..datasource import arange, array
from ..utils import decide_chunk_sizes, normalize_chunk_sizes, gen_random_seeds
from ..array_utils import as_same_device, device
from .core import TensorRandomOperand, RandomState


class TensorChoice(TensorRandomOperand, TensorOperandMixin):
    """
    Operand that draws random samples from a 1-D population.

    The population ``a`` is either an integer or a tensor; ``p`` holds the
    optional per-entry probabilities.  ``tile`` decides between a single-chunk
    execution and the chunked sampling strategies implemented below, and
    ``execute`` delegates the actual draw to ``RandomState.choice`` of the
    array module selected for the device.
    """
    _op_type_ = OperandDef.RAND_CHOICE

    # population to sample from: an integer or a tensor / tensor chunk
    _a = AnyField('a')
    # output shape
    _size = TupleField('size', ValueType.int64)
    # whether sampling is done with replacement
    _replace = BoolField('replace')
    # optional probabilities tensor / chunk
    _p = KeyField('p')

    def __init__(self, a=None, size=None, replace=None, p=None,
                 state=None, seed=None, dtype=None, gpu=None, **kw):
        super().__init__(_a=a, _size=size, _replace=replace, _p=p,
                         _state=state, _seed=seed, _dtype=dtype, _gpu=gpu, **kw)

    @property
    def a(self):
        """Population: an integer, a tensor or a tensor chunk."""
        return self._a

    @property
    def size(self):
        """Output shape of the operand."""
        return self._size

    @property
    def replace(self):
        """Whether sampling is done with replacement."""
        return self._replace

    @property
    def p(self):
        """Probabilities tensor / chunk, or None."""
        return self._p

    def _set_inputs(self, inputs):
        """Re-bind ``a`` and ``p`` to the operand's inputs after they are set."""
        super()._set_inputs(inputs)
        # `__call__` appends `a` before `p`, so a tensor `a` is always the
        # first input and a tensor `p` is always the last one
        if isinstance(self._a, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)):
            self._a = self._inputs[0]
        if isinstance(self._p, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)):
            self._p = self._inputs[-1]

    def __call__(self, a, p, chunk_size=None):
        """
        Create the output tensor of this operand.

        Only tensor arguments become inputs of the operand; the output takes
        its shape from ``size`` and records ``chunk_size`` as
        ``raw_chunk_size`` for ``tile``.
        """
        inputs = []
        if isinstance(a, TENSOR_TYPE):
            inputs.append(a)
        if isinstance(p, TENSOR_TYPE):
            inputs.append(p)
        return self.new_tensor(inputs, shape=self._size,
                               raw_chunk_size=chunk_size,
                               order=TensorOrder.C_ORDER)

    @classmethod
    def _tile_one_chunk(cls, op, a, p):
        """
        Tile into one chunk that covers the whole output.

        ``a`` and ``p`` are expected to be single-chunk tensors when they are
        tensors: only their first chunk is used.
        """
        out = op.outputs[0]
        chunk_op = op.copy().reset_key()
        chunk_op._seed = gen_random_seeds(1, op.state)[0]
        chunk_inputs = []
        if isinstance(a, TENSOR_TYPE):
            chunk_op._a = a.chunks[0]
            chunk_inputs.append(chunk_op.a)
        else:
            chunk_op._a = a
        if isinstance(p, TENSOR_TYPE):
            chunk_op._p = p.chunks[0]
            chunk_inputs.append(chunk_op.p)
        else:
            chunk_op._p = p
        chunk = chunk_op.new_chunk(chunk_inputs, shape=out.shape,
                                   index=(0,) * out.ndim,
                                   order=out.order)

        new_op = op.copy()
        return new_op.new_tensors(op.inputs, shape=out.shape,
                                  order=out.order, chunks=[chunk],
                                  nsplits=tuple((s,) for s in out.shape))

    @classmethod
    def _tile_sample_with_replacement(cls, op, a, nsplits):
        """
        Tile sampling with replacement.

        Random indices are generated with ``randint``; for an integer ``a``
        they are the result, otherwise they are used to fancy-index ``a``.
        """
        out_shape = tuple(sum(ns) for ns in nsplits)
        # np.prod returns the float 1.0 for an empty sequence (scalar output),
        # so coerce explicitly to keep sizes integral
        out_size = int(np.prod(out_shape))
        most_chunk_size = int(np.prod([max(ns) for ns in nsplits]))

        is_a_int = False
        if isinstance(a, Integral):
            is_a_int = True
            a_size = a
        else:
            a = array(a)
            a_size = a.size

        rs = RandomState.from_numpy(op.state)

        if is_a_int:
            # the indices is just the result
            ret = rs.randint(a_size, size=out_shape, chunk_size=nsplits)
        else:
            # gen indices first, need to be flattened
            indices = rs.randint(a_size, size=out_size, chunk_size=most_chunk_size)
            # get result via fancy indexing
            ret = a[indices]
            if len(out_shape) > 1:
                # reshape back if out's ndim > 1
                ret = ret.reshape(out_shape)
            ret = ret.rechunk(nsplits)

        return [recursive_tile(ret)]

    @classmethod
    def _tile_sample_without_replacement(cls, op, a, nsplits):
        """
        Tile sampling without replacement.

        Each chunk of ``a`` is sampled for ``m`` elements (``m`` being the
        total number of requested samples), the per-chunk samples are stacked
        into a ``(n_chunks, m)`` chunk, and for every output position a
        randomly drawn source chunk selects which of the stacked samples is
        kept.
        """
        from ..base import searchsorted
        from ..merge.stack import TensorStack
        from ..indexing.getitem import TensorIndex

        out = op.outputs[0]
        out_shape = tuple(sum(ns) for ns in nsplits)
        # to sample count; int() because np.prod of an empty shape is 1.0
        m = int(np.prod(out_shape))

        if isinstance(a, Integral):
            a_size = a
            a = arange(a)
        else:
            a = array(a)
            a_size = a.size
        a = a._inplace_tile()

        if any(cs < m for cs in a.nsplits[0]):
            # make sure all chunk > m
            n_chunk = min(max(a.size // (m + 1), 1), a.chunk_shape[0])
            chunk_size = ceildiv(a.size, n_chunk)
            chunk_sizes = normalize_chunk_sizes(a.size, chunk_size)[0]
            if chunk_sizes[-1] < m and len(chunk_sizes) > 1:
                # the last chunk may still less than m
                # merge it into previous one
                chunk_sizes[-2] += chunk_sizes[-1]
                chunk_sizes = chunk_sizes[:-1]
            a = a.rechunk({0: chunk_sizes})._inplace_tile()
            if len(chunk_sizes) == 1:
                # everything fits in one chunk, sample it directly
                return cls._tile_one_chunk(op, a, None)

        # for each chunk in a, do regular sampling
        sampled_chunks = []
        sample_seeds = gen_random_seeds(len(a.chunks), op.state)
        for seed, chunk in zip(sample_seeds, a.chunks):
            chunk_op = op.copy().reset_key()
            chunk_op._a = chunk
            chunk_op._size = (m,)
            chunk_op._seed = seed
            sampled_chunk = chunk_op.new_chunk([chunk], shape=(m,),
                                               order=out.order,
                                               index=chunk.index)
            sampled_chunks.append(sampled_chunk)

        if len(sampled_chunks) == 1:
            out_chunk = sampled_chunks[0]
        else:
            stacked_chunk = TensorStack(axis=0, dtype=sampled_chunks[0].dtype).new_chunk(
                sampled_chunks, shape=(len(a.chunks), m), order=TensorOrder.C_ORDER)

            # gen indices with length m from 0...a.size
            state = RandomState.from_numpy(op.state)
            indices = state.randint(a_size, size=(m,))
            # map every drawn position to the number of the chunk of `a`
            # that contains it
            cum_offsets = np.cumsum(a.nsplits[0])
            ind = recursive_tile(searchsorted(cum_offsets, indices, side='right'))
            ind_chunk = ind.chunks[0]

            # do fancy index to find result
            indexes = (ind_chunk, arange(m)._inplace_tile().chunks[0])
            out_chunk = TensorIndex(dtype=stacked_chunk.dtype, indexes=indexes).new_chunk(
                [stacked_chunk] + list(indexes), shape=(m,), order=TensorOrder.C_ORDER)

        ret = op.copy().new_tensor(op.inputs, shape=(m,), order=out.order,
                                   nsplits=((m,),), chunks=[out_chunk])
        if len(out_shape) > 0:
            ret = ret.reshape(out_shape)._inplace_tile()
        ret = ret.rechunk(nsplits)._inplace_tile()
        return [recursive_tile(ret)]

    @classmethod
    def tile(cls, op):
        """
        Tile the operand.

        A single chunk is used when ``p`` is given, or when every input and
        the output have exactly one chunk; otherwise tiling is dispatched on
        ``op.replace``.
        """
        check_chunks_unknown_shape(op.inputs, TilesError)

        out = op.outputs[0]
        chunk_size = out.extra_params.raw_chunk_size or options.chunk_size
        nsplits = decide_chunk_sizes(out.shape, chunk_size, out.dtype.itemsize)
        inputs = op.inputs

        a, p = op.a, op.p
        if p is not None:
            # we cannot handle p in a parallel fashion
            inputs = []
            if isinstance(a, TENSOR_TYPE):
                a = a.rechunk(a.shape)._inplace_tile()
                inputs.append(a)
            p = p.rechunk(p.shape)._inplace_tile()
            inputs.append(p)

            # ignore nsplits if p is specified; materialized as a tuple so
            # that it can be iterated more than once
            nsplits = tuple((s,) for s in out.shape)

        # all inputs and outputs has 1 chunk
        if all(len(inp.chunks) == 1 for inp in inputs) and \
                all(len(ns) == 1 for ns in nsplits):
            return cls._tile_one_chunk(op, a, p)

        if op.replace:
            return cls._tile_sample_with_replacement(op, a, nsplits)
        else:
            return cls._tile_sample_without_replacement(op, a, nsplits)

    @classmethod
    def execute(cls, ctx, op):
        """
        Execute one chunk: seed a ``RandomState`` with ``op.seed`` and store
        the result of its ``choice`` call in ``ctx`` under the output key.
        """
        inputs, device_id, xp = as_same_device(
            [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True)
        # chunk-typed `a` / `p` are replaced by their computed data: `a` is
        # the first input and `p` the last one (see `_set_inputs`)
        if isinstance(op.a, TENSOR_CHUNK_TYPE):
            a = inputs[0]
        else:
            a = op.a
        if isinstance(op.p, TENSOR_CHUNK_TYPE):
            p = inputs[-1]
        else:
            p = op.p

        with device(device_id):
            rs = xp.random.RandomState(op.seed)
            ctx[op.outputs[0].key] = rs.choice(
                a, size=op.size, replace=op.replace, p=p)


def choice(random_state, a, size=None, replace=True, p=None, chunk_size=None, gpu=None):
    """
    Generates a random sample from a given 1-D array

    Parameters
    -----------
    a : 1-D array-like or int
        If a tensor, a random sample is generated from its elements.
        If an int, the random sample is generated as if a were mt.arange(a)
    size : int or tuple of ints, optional
        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
        ``m * n * k`` samples are drawn.  Default is None, in which case a
        single value is returned.
    replace : boolean, optional
        Whether the sample is with or without replacement
    p : 1-D array-like, optional
        The probabilities associated with each entry in a.
        If not given the sample assumes a uniform distribution over all
        entries in a.
    chunk_size : int or tuple of int or tuple of ints, optional
        Desired chunk size on each dimension
    gpu : bool, optional
        Allocate the tensor on GPU if True, False as default

    Returns
    --------
    samples : single item or tensor
        The generated random samples

    Raises
    -------
    ValueError
        If a is an int and not greater than zero, if a or p are not
        1-dimensional, if a is an array-like of size 0, if p is not a
        vector of probabilities, if a and p have different lengths, or if
        replace is false and the sample size is greater than the population
        size

    See Also
    ---------
    randint, shuffle, permutation

    Examples
    ---------
    Generate a uniform random sample from mt.arange(5) of size 3:

    >>> import mars.tensor as mt

    >>> mt.random.choice(5, 3).execute()
    array([0, 3, 4])
    >>> #This is equivalent to mt.random.randint(0,5,3)

    Generate a non-uniform random sample from np.arange(5) of size 3:

    >>> mt.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0]).execute()
    array([3, 3, 0])

    Generate a uniform random sample from mt.arange(5) of size 3 without
    replacement:

    >>> mt.random.choice(5, 3, replace=False).execute()
    array([3,1,0])
    >>> #This is equivalent to np.random.permutation(np.arange(5))[:3]

    Generate a non-uniform random sample from mt.arange(5) of size
    3 without replacement:

    >>> mt.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0]).execute()
    array([2, 3, 0])

    Any of the above can be repeated with an arbitrary array-like
    instead of just integers. For instance:

    >>> aa_milne_arr = ['pooh', 'rabbit', 'piglet', 'Christopher']
    >>> np.random.choice(aa_milne_arr, 5, p=[0.5, 0.1, 0.1, 0.3])
    array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'],
          dtype='|S11')
    """

    if isinstance(a, Integral):
        if a <= 0:
            raise ValueError('a must be greater than 0')
        a_size = a
        # ask numpy which dtype it produces for an integer population
        dtype = np.random.choice(1, size=(), p=np.array([1]) if p is not None else p).dtype
    else:
        a = array(a)
        if a.ndim != 1:
            raise ValueError('a must be one dimensional')
        a_size = a.size
        dtype = a.dtype

    if p is not None:
        if not isinstance(p, TENSOR_TYPE):
            # raw probabilities can be validated eagerly
            p = np.asarray(p)
            if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                raise ValueError('probabilities do not sum to 1')
            # keep p in a single chunk
            p = array(p, chunk_size=p.size)
        if p.ndim != 1:
            raise ValueError('p must be one dimensional')

    # total number of requested samples
    if size is None:
        size = ()
        length = 1
    else:
        try:
            tuple(size)
            length = np.prod(size)
        except TypeError:
            # size is a single number rather than an iterable
            length = size
    # use truthiness rather than `is False`: falsy values such as 0 or
    # np.False_ also mean sampling without replacement downstream
    if not replace and length > a_size:
        raise ValueError("Cannot take a larger sample than population when 'replace=False'")

    # random_state normalizes the size and supplies the underlying numpy state
    size = random_state._handle_size(size)
    op = TensorChoice(a=a, p=p, state=random_state.to_numpy(),
                      replace=replace, size=size, dtype=dtype, gpu=gpu)
    return op(a, p, chunk_size=chunk_size)