#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 1999-2020 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from numbers import Integral
import numpy as np
from ... import opcodes as OperandDef
from ...config import options
from ...serialize import ValueType, AnyField, KeyField, BoolField, TupleField
from ...tiles import TilesError
from ...utils import check_chunks_unknown_shape, ceildiv, recursive_tile
from ..operands import TensorOperandMixin
from ..core import TENSOR_TYPE, TENSOR_CHUNK_TYPE, TensorOrder
from ..datasource import arange, array
from ..utils import decide_chunk_sizes, normalize_chunk_sizes, gen_random_seeds
from ..array_utils import as_same_device, device
from .core import TensorRandomOperand, RandomState
class TensorChoice(TensorRandomOperand, TensorOperandMixin):
    """
    Operand that draws random samples from ``a``, optionally weighted by ``p``.

    ``a`` may be an integer, an array-like or a tensor; ``p`` is either
    ``None`` or a tensor. When they are tensors they are registered as inputs
    of the operand, ``a`` first and ``p`` last.
    """
    _op_type_ = OperandDef.RAND_CHOICE

    # population to sample from: int, array-like, tensor or chunk
    _a = AnyField('a')
    # requested output shape
    _size = TupleField('size', ValueType.int64)
    # whether sampling is done with replacement
    _replace = BoolField('replace')
    # probabilities associated with the entries of ``a`` (tensor/chunk or None)
    _p = KeyField('p')

    def __init__(self, a=None, size=None, replace=None, p=None,
                 state=None, seed=None, dtype=None, gpu=None, **kw):
        super().__init__(_a=a, _size=size, _replace=replace, _p=p,
                         _state=state, _seed=seed, _dtype=dtype, _gpu=gpu, **kw)

    @property
    def a(self):
        return self._a

    @property
    def size(self):
        return self._size

    @property
    def replace(self):
        return self._replace

    @property
    def p(self):
        return self._p

    def _set_inputs(self, inputs):
        """Re-bind ``_a`` / ``_p`` to the (possibly replaced) input objects."""
        super()._set_inputs(inputs)
        # ``a`` is always appended before ``p`` (see ``__call__``), hence the
        # first input is ``a`` and the last one is ``p``.
        if isinstance(self._a, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)):
            self._a = self._inputs[0]
        if isinstance(self._p, (TENSOR_TYPE, TENSOR_CHUNK_TYPE)):
            self._p = self._inputs[-1]

    def __call__(self, a, p, chunk_size=None):
        """Create the output tensor, using ``a`` and ``p`` as inputs when they are tensors."""
        inputs = []
        if isinstance(a, TENSOR_TYPE):
            inputs.append(a)
        if isinstance(p, TENSOR_TYPE):
            inputs.append(p)
        return self.new_tensor(inputs, shape=self._size,
                               raw_chunk_size=chunk_size,
                               order=TensorOrder.C_ORDER)

    @classmethod
    def _tile_one_chunk(cls, op, a, p):
        """
        Tile into a single output chunk computed by one chunk-level operand.

        For a tensor ``a`` or ``p`` only ``chunks[0]`` is used, so callers are
        expected to pass tensors that hold exactly one chunk.
        """
        out = op.outputs[0]
        chunk_op = op.copy().reset_key()
        chunk_op._seed = gen_random_seeds(1, op.state)[0]

        chunk_inputs = []
        if isinstance(a, TENSOR_TYPE):
            chunk_op._a = a.chunks[0]
            chunk_inputs.append(chunk_op.a)
        else:
            chunk_op._a = a
        if isinstance(p, TENSOR_TYPE):
            chunk_op._p = p.chunks[0]
            chunk_inputs.append(chunk_op.p)
        else:
            chunk_op._p = p

        chunk = chunk_op.new_chunk(chunk_inputs, shape=out.shape,
                                   index=(0,) * out.ndim,
                                   order=out.order)
        new_op = op.copy()
        return new_op.new_tensors(op.inputs, shape=out.shape,
                                  order=out.order, chunks=[chunk],
                                  nsplits=tuple((s,) for s in out.shape))

    @classmethod
    def _tile_sample_with_replacement(cls, op, a, nsplits):
        """
        Tile sampling with replacement by generating random indices.

        For an integer ``a`` the random integers are the result themselves;
        otherwise flat random indices are generated and used to fancy-index
        ``a``, and the result is reshaped/rechunked to the requested layout.
        """
        out_shape = tuple(sum(ns) for ns in nsplits)
        # ``np.prod`` of an empty sequence is the float 1.0, so coerce to int
        # to keep sizes integral for scalar outputs.
        out_size = int(np.prod(out_shape))
        most_chunk_size = int(np.prod([max(ns) for ns in nsplits]))

        is_a_int = False
        if isinstance(a, Integral):
            is_a_int = True
            a_size = a
        else:
            a = array(a)
            a_size = a.size

        rs = RandomState.from_numpy(op.state)
        if is_a_int:
            # the indices is just the result
            ret = rs.randint(a_size, size=out_shape, chunk_size=nsplits)
        else:
            # gen indices first, need to be flattened
            indices = rs.randint(a_size, size=out_size, chunk_size=most_chunk_size)
            # get result via fancy indexing
            ret = a[indices]
            if len(out_shape) > 1:
                # reshape back if out's ndim > 1
                ret = ret.reshape(out_shape)
            ret = ret.rechunk(nsplits)

        return [recursive_tile(ret)]

    @classmethod
    def _tile_sample_without_replacement(cls, op, a, nsplits):
        """
        Tile sampling without replacement.

        ``m`` samples are drawn from every chunk of ``a`` and stacked into a
        ``(n_chunks, m)`` chunk. For each of the ``m`` output positions a
        random index in ``[0, a_size)`` is mapped, via ``searchsorted`` on the
        cumulative chunk sizes, to the chunk whose sample is taken at that
        position.
        """
        from ..base import searchsorted
        from ..merge.stack import TensorStack
        from ..indexing.getitem import TensorIndex

        out = op.outputs[0]
        out_shape = tuple(sum(ns) for ns in nsplits)
        # to sample count
        # (int() because ``np.prod`` of an empty shape is the float 1.0)
        m = int(np.prod(out_shape))

        if isinstance(a, Integral):
            a_size = a
            a = arange(a)
        else:
            a = array(a)
            a_size = a.size

        a = a._inplace_tile()
        if any(cs < m for cs in a.nsplits[0]):
            # make sure all chunk > m
            n_chunk = min(max(a.size // (m + 1), 1), a.chunk_shape[0])
            chunk_size = ceildiv(a.size, n_chunk)
            # copy into a list: the merge below mutates it in place, which
            # would fail if the helper hands back an immutable sequence
            chunk_sizes = list(normalize_chunk_sizes(a.size, chunk_size)[0])
            if chunk_sizes[-1] < m and len(chunk_sizes) > 1:
                # the last chunk may still less than m
                # merge it into previous one
                chunk_sizes[-2] += chunk_sizes[-1]
                chunk_sizes = chunk_sizes[:-1]
            a = a.rechunk({0: chunk_sizes})._inplace_tile()
            if len(chunk_sizes) == 1:
                return cls._tile_one_chunk(op, a, None)

        # for each chunk in a, do regular sampling
        sampled_chunks = []
        sample_seeds = gen_random_seeds(len(a.chunks), op.state)
        for seed, chunk in zip(sample_seeds, a.chunks):
            chunk_op = op.copy().reset_key()
            chunk_op._a = chunk
            chunk_op._size = (m,)
            chunk_op._seed = seed
            sampled_chunk = chunk_op.new_chunk([chunk], shape=(m,),
                                               order=out.order,
                                               index=chunk.index)
            sampled_chunks.append(sampled_chunk)

        if len(sampled_chunks) == 1:
            out_chunk = sampled_chunks[0]
        else:
            stacked_chunk = TensorStack(axis=0, dtype=sampled_chunks[0].dtype).new_chunk(
                sampled_chunks, shape=(len(a.chunks), m), order=TensorOrder.C_ORDER)

            # gen indices with length m from 0...a.size
            state = RandomState.from_numpy(op.state)
            indices = state.randint(a_size, size=(m,))
            cum_offsets = np.cumsum(a.nsplits[0])
            ind = recursive_tile(searchsorted(cum_offsets, indices, side='right'))
            ind_chunk = ind.chunks[0]

            # do fancy index to find result
            indexes = (ind_chunk, arange(m)._inplace_tile().chunks[0])
            out_chunk = TensorIndex(dtype=stacked_chunk.dtype, indexes=indexes).new_chunk(
                [stacked_chunk] + list(indexes), shape=(m,), order=TensorOrder.C_ORDER)

        ret = op.copy().new_tensor(op.inputs, shape=(m,), order=out.order,
                                   nsplits=((m,),), chunks=[out_chunk])
        if len(out_shape) > 0:
            ret = ret.reshape(out_shape)._inplace_tile()
            ret = ret.rechunk(nsplits)._inplace_tile()

        return [recursive_tile(ret)]

    @classmethod
    def tile(cls, op):
        """
        Tile the operand, dispatching on ``p`` and ``replace``.

        When ``p`` is given, ``a`` (if a tensor) and ``p`` are rechunked into
        one chunk each and the output is a single chunk. Otherwise sampling is
        tiled with or without replacement according to ``op.replace``.
        """
        check_chunks_unknown_shape(op.inputs, TilesError)

        out = op.outputs[0]
        chunk_size = out.extra_params.raw_chunk_size or options.chunk_size
        nsplits = decide_chunk_sizes(out.shape, chunk_size, out.dtype.itemsize)
        inputs = op.inputs

        a, p = op.a, op.p
        if p is not None:
            # we cannot handle p in a parallel fashion
            inputs = []
            if isinstance(a, TENSOR_TYPE):
                a = a.rechunk(a.shape)._inplace_tile()
                inputs.append(a)
            p = p.rechunk(p.shape)._inplace_tile()
            inputs.append(p)
            # ignore nsplits if p is specified
            # (a tuple, not a generator, so it survives the check below)
            nsplits = tuple((s,) for s in out.shape)

        # all inputs and outputs has 1 chunk
        if all(len(inp.chunks) == 1 for inp in inputs) and \
                all(len(ns) == 1 for ns in nsplits):
            return cls._tile_one_chunk(op, a, p)

        if op.replace:
            return cls._tile_sample_with_replacement(op, a, nsplits)
        else:
            return cls._tile_sample_without_replacement(op, a, nsplits)

    @classmethod
    def execute(cls, ctx, op):
        """Run ``RandomState(op.seed).choice`` on the chunk data and store the result in ``ctx``."""
        inputs, device_id, xp = as_same_device(
            [ctx[inp.key] for inp in op.inputs], device=op.device, ret_extra=True)

        # ``a`` is the first input and ``p`` the last one when they are chunks;
        # otherwise the raw values stored on the operand are used.
        if isinstance(op.a, TENSOR_CHUNK_TYPE):
            a = inputs[0]
        else:
            a = op.a
        if isinstance(op.p, TENSOR_CHUNK_TYPE):
            p = inputs[-1]
        else:
            p = op.p

        with device(device_id):
            rs = xp.random.RandomState(op.seed)
            ctx[op.outputs[0].key] = rs.choice(
                a, size=op.size, replace=op.replace, p=p)
def choice(random_state, a, size=None, replace=True, p=None, chunk_size=None, gpu=None):
    """
    Generates a random sample from a given 1-D array

    Parameters
    -----------
    random_state : object
        Source of randomness; its ``_handle_size`` and ``to_numpy`` methods
        are used to normalize ``size`` and to seed the operand.
    a : 1-D array-like or int
        If a tensor, a random sample is generated from its elements.
        If an int, the random sample is generated as if a were mt.arange(a)
    size : int or tuple of ints, optional
        Output shape.  If the given shape is, e.g., ``(m, n, k)``, then
        ``m * n * k`` samples are drawn.  Default is None, in which case a
        single value is returned.
    replace : boolean, optional
        Whether the sample is with or without replacement
    p : 1-D array-like, optional
        The probabilities associated with each entry in a.
        If not given the sample assumes a uniform distribution over all
        entries in a.
    chunk_size : int or tuple of int or tuple of ints, optional
        Desired chunk size on each dimension
    gpu : bool, optional
        Allocate the tensor on GPU if True, False as default

    Returns
    --------
    samples : single item or tensor
        The generated random samples

    Raises
    -------
    ValueError
        If a is an int and not greater than zero, if a or p are not
        1-dimensional, if a is an array-like of size 0, if p is not a vector
        of probabilities, if a and p have different lengths, or if
        sampling without replacement and the sample size is greater than the
        population size. Some of these conditions are only detected when the
        tensor is executed.

    See Also
    ---------
    randint, shuffle, permutation

    Examples
    ---------
    Generate a uniform random sample from mt.arange(5) of size 3:

    >>> import mars.tensor as mt

    >>> mt.random.choice(5, 3).execute()
    array([0, 3, 4])
    >>> #This is equivalent to mt.random.randint(0,5,3)

    Generate a non-uniform random sample from mt.arange(5) of size 3:

    >>> mt.random.choice(5, 3, p=[0.1, 0, 0.3, 0.6, 0]).execute()
    array([3, 3, 0])

    Generate a uniform random sample from mt.arange(5) of size 3 without
    replacement:

    >>> mt.random.choice(5, 3, replace=False).execute()
    array([3,1,0])
    >>> #This is equivalent to mt.random.permutation(mt.arange(5))[:3]

    Generate a non-uniform random sample from mt.arange(5) of size
    3 without replacement:

    >>> mt.random.choice(5, 3, replace=False, p=[0.1, 0, 0.3, 0.6, 0]).execute()
    array([2, 3, 0])

    Any of the above can be repeated with an arbitrary array-like
    instead of just integers. For instance:

    >>> aa_milne_arr = ['pooh', 'rabbit', 'piglet', 'Christopher']
    >>> mt.random.choice(aa_milne_arr, 5, p=[0.5, 0.1, 0.1, 0.3]).execute()
    array(['pooh', 'pooh', 'pooh', 'Christopher', 'piglet'],
          dtype='|S11')
    """

    if isinstance(a, Integral):
        if a <= 0:
            raise ValueError('a must be greater than 0')
        a_size = a
        # ask numpy which dtype it yields for an integer population
        dtype = np.random.choice(1, size=(), p=np.array([1]) if p is not None else p).dtype
    else:
        a = array(a)
        if a.ndim != 1:
            raise ValueError('a must be one dimensional')
        a_size = a.size
        dtype = a.dtype

    if p is not None:
        if not isinstance(p, TENSOR_TYPE):
            # concrete probabilities can be validated eagerly
            p = np.asarray(p)
            if not np.isclose(p.sum(), 1, rtol=1e-7, atol=0):
                raise ValueError('probabilities do not sum to 1')
            # keep p in a single chunk
            p = array(p, chunk_size=p.size)
        if p.ndim != 1:
            raise ValueError('p must be one dimensional')

    if size is None:
        size = ()
        length = 1
    else:
        try:
            tuple(size)
            length = np.prod(size)
        except TypeError:
            # a scalar size is the sample count itself
            length = size

    # Truthiness rather than ``is False`` so that falsy values which are not
    # the ``False`` singleton (e.g. ``np.bool_(False)`` or ``0``) are validated too.
    if not replace and length > a_size:
        raise ValueError("Cannot take a larger sample than population when 'replace=False'")

    size = random_state._handle_size(size)
    op = TensorChoice(a=a, p=p, state=random_state.to_numpy(),
                      replace=replace, size=size, dtype=dtype, gpu=gpu)
    return op(a, p, chunk_size=chunk_size)