Source code for mars.learn.utils.multiclass

# Copyright 1999-2020 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from collections.abc import Sequence

import numpy as np
try:
    from scipy.sparse.base import spmatrix
except ImportError:  # pragma: no cover
    spmatrix = None

from ... import opcodes as OperandDef
from ... import tensor as mt
from ...core import Base, Entity
from ...serialize import KeyField, BoolField, TupleField, DataTypeField, AnyField, ListField
from ...tensor.core import TensorOrder
from ...tiles import TilesError
from ...utils import recursive_tile
from ..operands import LearnOperand, LearnOperandMixin, OutputType
from ..utils import assert_all_finite


class IsMultilabel(LearnOperand, LearnOperandMixin):
    """Operand that decides whether a target ``y`` is in multilabel format.

    The tileable-level operand keeps ``y`` and, optionally, the tensor of
    unique values of ``y``.  The chunk-level operand created by :meth:`tile`
    keeps only the chunk of unique values plus a flag recording whether
    ``y`` is sparse.  The output is a 0-d boolean tensor.
    """
    _op_type_ = OperandDef.IS_MULTILABEL

    _y = AnyField('y')
    _unique_y = KeyField('unique_y')
    # for chunk
    _is_y_sparse = BoolField('is_y_sparse')

    def __init__(self, y=None, unique_y=None, is_y_sparse=None, **kw):
        super().__init__(_y=y, _unique_y=unique_y,
                         _is_y_sparse=is_y_sparse, **kw)
        self.output_types = [OutputType.tensor]

    @property
    def y(self):
        """The target values this operand inspects."""
        return self._y

    @property
    def unique_y(self):
        """Unique values of ``y`` (``None`` when they were not supplied)."""
        return self._unique_y

    @property
    def is_y_sparse(self):
        """Chunk-level flag: whether the original ``y`` was sparse."""
        return self._is_y_sparse

    def _set_inputs(self, inputs):
        """Re-bind ``_y`` / ``_unique_y`` to the operand's registered inputs."""
        super()._set_inputs(inputs)
        bound = self._inputs
        # ``y`` is only an input when it is a Mars object; it then comes first.
        if isinstance(self._y, (Base, Entity)):
            self._y = bound[0]
        # The unique values, when present, are always the last input.
        if self._unique_y is not None:
            self._unique_y = bound[-1]

    def __call__(self, y, y_unique=None):
        """Create the 0-d boolean tileable for ``y`` (and its unique values)."""
        inputs = []
        if isinstance(y, (Base, Entity)):
            inputs.append(y)
        if y_unique is not None:
            inputs.append(y_unique)
        return self.new_tileable(inputs, shape=(), dtype=np.dtype(bool),
                                 order=TensorOrder.C_ORDER)

    @classmethod
    def tile(cls, op):
        """Tile the operand into a single chunk.

        Anything that is not a 2-d object with more than one column is
        answered immediately with a constant ``False`` tensor; otherwise a
        single chunk operand working on the unique values is created.
        """
        y = op.y
        out = op.outputs[0]

        has_several_columns = (hasattr(y, 'shape') and y.ndim == 2
                               and y.shape[1] > 1)
        if not has_several_columns:
            return [mt.array(False)._inplace_tile()]

        unique_y = op.unique_y
        # The unique values are expected to have been aggregated to one chunk.
        assert len(unique_y.chunks) == 1
        uniq_chunk = unique_y.chunks[0]
        chunk_op = IsMultilabel(unique_y=uniq_chunk,
                                is_y_sparse=y.issparse())
        # NOTE(review): index is (0,) although the chunk shape is () --
        # kept exactly as in the original; confirm this is intended.
        out_chunk = chunk_op.new_chunk([uniq_chunk], dtype=out.dtype,
                                       order=out.order, index=(0,),
                                       shape=())

        new_op = op.copy()
        params = out.params
        params['nsplits'] = ()
        params['chunks'] = [out_chunk]
        return new_op.new_tileables(op.inputs, kws=[params])

    @classmethod
    def execute(cls, ctx, op):
        """Compute the multilabel verdict from the unique values of ``y``."""
        uniques = ctx[op.unique_y.key]

        if op.is_y_sparse:
            # sparse: at most one distinct value
            few_enough = uniques.size in (0, 1)
        else:
            # dense: at most two distinct values
            few_enough = len(uniques) < 3

        # Values must additionally be bool/int/uint or integral floats.
        ctx[op.outputs[0].key] = few_enough and (
            uniques.dtype.kind in 'biu' or  # bool, int, uint
            _is_integral_float(uniques))


def _is_integral_float(y):
    return y.dtype.kind == 'f' and np.all(y.astype(int) == y)


def is_multilabel(y):
    """Check if ``y`` is in a multilabel format.

    The check is lazy: the returned object has to be executed (see the
    examples) to obtain the boolean.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    out : bool,
        Return ``True``, if ``y`` is in a multilabel format, else ``False``.

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1]).execute()
    False
    >>> is_multilabel([[1], [0, 2], []]).execute()
    False
    >>> is_multilabel(mt.array([[1, 0], [0, 0]])).execute()
    True
    >>> is_multilabel(mt.array([[1], [0], [0]])).execute()
    False
    >>> is_multilabel(mt.array([[1, 0, 0]])).execute()
    True
    """
    if not isinstance(y, (Base, Entity)):
        # Plain Python / NumPy input: normalise to an ndarray first.
        # NOTE(review): for ragged nested sequences recent NumPy releases
        # raise ValueError here instead of building an object array -- confirm
        # against the NumPy versions this project supports.
        if hasattr(y, '__array__') or isinstance(y, Sequence):
            y = np.asarray(y)
        if hasattr(y, 'shape'):
            yt = y = mt.asarray(y)
        else:
            # Not array-like: nothing to feed to the operand as an input.
            yt = None
    else:
        yt = y = mt.tensor(y)

    # Unique values are only computed for non-object dtypes.
    if hasattr(y, 'dtype') and y.dtype != np.object_:
        unique_y = mt.unique(y, aggregate_size=1)
    else:
        unique_y = None

    op = IsMultilabel(y=y, unique_y=unique_y)
    return op(yt, unique_y)
class TypeOfTarget(LearnOperand, LearnOperandMixin):
    """Operand that infers the target-type label of ``y``.

    The tileable-level operand holds ``y``.  :meth:`tile` builds one chunk
    operand whose inputs are the helper chunks (multilabel flag, first value,
    float check, finiteness check, unique values) that :meth:`_execute`
    needs, together with the shape and dtype of ``y``.
    """
    __slots__ = ('_unique_y_chunk', '_check_all_finite_chunk')

    _op_type_ = OperandDef.TYPE_OF_TARGET

    _y = AnyField('y')
    # for chunks
    _is_multilabel = KeyField('is_multilabel')
    _first_value = KeyField('first_value')
    _check_float = KeyField('check_float')
    _assert_all_finite = KeyField('assert_all_finite')
    _unique_y = KeyField('unique_y')
    _y_shape = TupleField('y_shape')
    _y_dtype = DataTypeField('y_dtype')
    _checked_targets = ListField('checked_targets')

    def __init__(self, y=None, is_multilabel=None, first_value=None,
                 check_float=None, assert_all_finite=None,
                 unique_y=None, y_shape=None, y_dtype=None,
                 checked_targets=None, **kw):
        super().__init__(_y=y, _is_multilabel=is_multilabel,
                         _first_value=first_value, _check_float=check_float,
                         _assert_all_finite=assert_all_finite,
                         _unique_y=unique_y, _y_shape=y_shape,
                         _y_dtype=y_dtype, _checked_targets=checked_targets,
                         **kw)
        self.output_types = [OutputType.tensor]

    @property
    def y(self):
        """The target values whose type is being inferred."""
        return self._y

    @property
    def is_multilabel(self):
        """Chunk holding the multilabel verdict for ``y``."""
        return self._is_multilabel

    @property
    def first_value(self):
        """Chunk holding the first element of ``y`` (object dtype only)."""
        return self._first_value

    @property
    def check_float(self):
        """Chunk telling whether a float ``y`` has non-integral values."""
        return self._check_float

    @property
    def assert_all_finite(self):
        """Chunk of the finiteness check on a float ``y``."""
        return self._assert_all_finite

    @property
    def unique_y(self):
        """Chunk holding the unique values of ``y`` (``None`` if empty)."""
        return self._unique_y

    @property
    def y_shape(self):
        """Shape of ``y`` recorded at tile time."""
        return self._y_shape

    @property
    def y_dtype(self):
        """Dtype of ``y`` recorded at tile time."""
        return self._y_dtype

    @property
    def checked_targets(self):
        """Allowed target types; ``None`` or empty disables the check."""
        return self._checked_targets

    def _set_inputs(self, inputs):
        """Re-bind every Mars-object field to the registered inputs, in order."""
        super()._set_inputs(inputs)
        remaining = iter(self._inputs)
        # Must follow the order in which ``tile`` appends the chunk inputs.
        tracked = ('_y', '_is_multilabel', '_first_value',
                   '_check_float', '_assert_all_finite',
                   '_unique_y')
        for name in tracked:
            if isinstance(getattr(self, name), (Base, Entity)):
                setattr(self, name, next(remaining))

    def __call__(self, y):
        """Create the 0-d object-dtype tileable describing ``y``'s type."""
        inputs = []
        if isinstance(y, (Base, Entity)):
            inputs.append(y)
        return self.new_tileable(inputs, shape=(), order=TensorOrder.C_ORDER,
                                 dtype=np.dtype(object))

    @classmethod
    def tile(cls, op):
        """Build the single chunk operand and the helper chunks it consumes."""

        def first_chunk(tileable):
            # Tile a helper expression and take its leading chunk.
            return recursive_tile(tileable).chunks[0]

        out = op.outputs[0]
        y = op.y

        multilabel_chunk = first_chunk(is_multilabel(y))
        chunk_inputs = [multilabel_chunk]

        if not isinstance(y, (Base, Entity)):
            if hasattr(y, '__array__'):
                y = np.asarray(y)
            y = mt.asarray(y)
        if np.isnan(y.size):  # pragma: no cover
            raise TilesError('y has unknown shape')

        chunk_op = TypeOfTarget(is_multilabel=multilabel_chunk,
                                y_shape=y.shape, y_dtype=y.dtype)

        # Object arrays: the first element decides between labels and objects.
        if y.ndim <= 2 and y.size > 0 and y.dtype == object:
            head_chunk = first_chunk(y[(0,) * y.ndim])
            chunk_inputs.append(head_chunk)
            chunk_op._first_value = head_chunk

        # Float arrays: detect non-integral values and verify finiteness.
        if y.dtype.kind == 'f':
            non_integral_chunk = first_chunk(mt.any(y != y.astype(int)))
            chunk_inputs.append(non_integral_chunk)
            chunk_op._check_float = non_integral_chunk

            finite_chunk = first_chunk(assert_all_finite(y))
            chunk_inputs.append(finite_chunk)
            chunk_op._assert_all_finite = finite_chunk

        # Unique values are only gathered for non-empty input.
        if y.size > 0:
            uniques_chunk = first_chunk(mt.unique(y, aggregate_size=1))
            chunk_inputs.append(uniques_chunk)
            chunk_op._unique_y = uniques_chunk

        out_chunk = chunk_op.new_chunk(chunk_inputs, dtype=out.dtype,
                                       shape=out.shape, order=out.order,
                                       index=())

        params = out.params
        params['nsplits'] = ()
        params['chunks'] = [out_chunk]
        new_op = op.copy()
        return new_op.new_tileables(op.inputs, kws=[params])

    @classmethod
    def _execute(cls, ctx, op):
        """Return the target-type string from the already computed chunks."""
        shape = op.y_shape
        ndim = len(shape)
        dtype = op.y_dtype

        if ctx[op.is_multilabel.key]:
            return 'multilabel-indicator'

        if ndim > 2:
            return 'unknown'  # [[[1, 2]]]
        if (dtype == object and shape[0] and
                not isinstance(ctx[op.first_value.key], str)):
            return 'unknown'  # [obj_1] and not ["label_1"]

        if ndim == 2 and shape[1] == 0:
            return 'unknown'  # [[]]

        # '-multioutput' for [[1, 2], [1, 2]]; "" for [1, 2, 3] or
        # [[1], [2], [3]]
        suffix = '-multioutput' if ndim == 2 and shape[1] > 1 else ""

        # check float and contains non-integer float values
        if dtype.kind == 'f' and ctx[op.check_float.key]:
            # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
            assert ctx[op.assert_all_finite.key]
            return 'continuous' + suffix

        # No unique-values chunk means y.size == 0.
        unique_count = (len(ctx[op.unique_y.key])
                        if op.unique_y is not None else 0)

        if (unique_count > 2) or (ndim >= 2 and shape[1] > 1):
            # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
            return 'multiclass' + suffix
        return 'binary'  # [1, 2] or [["a"], ["b"]]

    @classmethod
    def execute(cls, ctx, op):
        """Store the inferred type, rejecting it if it is not an allowed one."""
        target = cls._execute(ctx, op)
        allowed = op.checked_targets
        if allowed is not None and len(allowed) > 0 and target not in allowed:
            raise ValueError(f'Unknown label type: {target}')
        ctx[op.outputs[0].key] = target
def type_of_target(y):
    """Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : array-like

    Returns
    -------
    target_type : string
        One of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d tensor of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d tensor that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, a tensor
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          tensor, sequence of sequences, or a tensor of non-sequence objects.

    Raises
    ------
    ValueError
        If ``y`` is a string or is not array-like, or if its class is named
        'SparseSeries' or 'SparseArray'.

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn.utils.multiclass import type_of_target
    >>> type_of_target([0.1, 0.6]).execute()
    'continuous'
    >>> type_of_target([1, -1, -1, 1]).execute()
    'binary'
    >>> type_of_target(['a', 'b', 'a']).execute()
    'binary'
    >>> type_of_target([1.0, 2.0]).execute()
    'binary'
    >>> type_of_target([1, 0, 2]).execute()
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0]).execute()
    'multiclass'
    >>> type_of_target(['a', 'b', 'c']).execute()
    'multiclass'
    >>> type_of_target(mt.array([[1, 2], [3, 1]])).execute()
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]]).execute()
    'multiclass-multioutput'
    >>> type_of_target(mt.array([[1.5, 2.0], [3.0, 1.6]])).execute()
    'continuous-multioutput'
    >>> type_of_target(mt.array([[0, 1], [1, 1]])).execute()
    'multilabel-indicator'
    """
    # scipy is optional: only accept sparse matrices when it was importable.
    valid_types = (Sequence, spmatrix) if spmatrix is not None else (Sequence,)
    valid = ((isinstance(y, valid_types) or
              hasattr(y, '__array__') or
              hasattr(y, '__mars_tensor__'))
             and not isinstance(y, str))

    if not valid:
        raise ValueError('Expected array-like (array or non-string sequence), '
                         f'got {y}')

    # Matched by class name, so no pandas import is needed for the check.
    sparse_pandas = (y.__class__.__name__ in ['SparseSeries', 'SparseArray'])
    if sparse_pandas:  # pragma: no cover
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if isinstance(y, (Base, Entity)):
        y = mt.tensor(y)

    op = TypeOfTarget(y=y)
    return op(y)
def check_classification_targets(y):
    """Ensure that target y is of a non-regression type.

    Only the following target types (as defined in type_of_target) are
    allowed:
        'binary', 'multiclass', 'multiclass-multioutput',
        'multilabel-indicator', 'multilabel-sequences'

    The allowed types are attached to the operand behind the returned
    object, whose ``execute`` raises ``ValueError`` for any other type.

    Parameters
    ----------
    y : array-like
    """
    classification_types = ['binary', 'multiclass', 'multiclass-multioutput',
                            'multilabel-indicator', 'multilabel-sequences']
    target_type = type_of_target(y)
    target_type.op._checked_targets = classification_types
    return target_type