# Source code for mars.learn.utils.multiclass

# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
from collections.abc import Sequence
from typing import List

import numpy as np
from scipy.sparse import spmatrix
from sklearn.utils.multiclass import (
    is_multilabel as sklearn_is_multilabel,
    type_of_target as sklearn_type_of_target,
)

from ... import opcodes as OperandDef
from ... import tensor as mt
from ...core import ENTITY_TYPE, TILEABLE_TYPE, recursive_tile
from ...core.context import get_context
from ...serialization.serializables import AnyField, ListField
from ...tensor.core import TensorOrder, TENSOR_TYPE
from ...typing import TileableType
from ...utils import has_unknown_shape
from ..operands import LearnOperand, LearnOperandMixin, OutputType
from ..utils import assert_all_finite
from .validation import check_array


def _unique_multiclass(y):
    if hasattr(y, "__array__") or hasattr(y, "__mars_tensor__"):
        return mt.unique(mt.asarray(y))
    else:
        return set(y)


def _unique_indicator(y):
    """
    Return the labels of an indicator matrix as ``arange(n_columns)``.

    ``y`` is first passed through ``check_array`` (sparse input accepted);
    the size of its second dimension gives the number of labels.
    """
    checked = check_array(y, accept_sparse=True)
    n_labels = checked.shape[1]
    return mt.arange(n_labels)


# Dispatch table: target type name -> function extracting the unique labels.
# "binary" and "multiclass" share the same extractor.
_FN_UNIQUE_LABELS = dict(
    (
        ("binary", _unique_multiclass),
        ("multiclass", _unique_multiclass),
        ("multilabel-indicator", _unique_indicator),
    )
)


class UniqueLabels(LearnOperand, LearnOperandMixin):
    """
    Operand that gathers the sorted unique labels found across several targets.

    The work happens in ``tile``: it determines the target type of every
    ``y``, validates that the targets are compatible with each other, merges
    their labels and builds a tensor from the sorted label set.
    """

    _op_type_ = OperandDef.UNIQUE_LABELS

    # the targets whose labels are extracted (set by ``unique_labels``)
    ys = ListField("ys")

    def __call__(self, ys: List[TileableType]):
        """
        Create the output tileable for the given targets.

        Parameters
        ----------
        ys : list
            Targets. Only the items that are ``TILEABLE_TYPE`` instances are
            registered as inputs of the operand.

        Returns
        -------
        out
            A 1-d tensor tileable with shape ``(nan,)`` whose dtype is taken
            from the first target.
        """
        self._output_types = [OutputType.tensor]
        inputs = [y for y in ys if isinstance(y, TILEABLE_TYPE)]
        return self.new_tileable(
            inputs,
            # the number of unique labels is not known here, hence nan
            shape=(np.nan,),
            dtype=mt.tensor(ys[0]).dtype,
            order=TensorOrder.C_ORDER,
        )

    @classmethod
    def tile(cls, op: "UniqueLabels"):
        """
        Tile the operand by computing the unique labels of ``op.ys``.

        This is a generator: lists of chunks are yielded and their results are
        afterwards read back via ``ctx.get_chunks_result``.
        NOTE(review): presumably the tiling framework executes the yielded
        chunks before resuming this generator -- confirm.

        Raises
        ------
        ValueError
            If the targets have more than one type (``binary`` together with
            ``multiclass`` is tolerated), if indicator matrices have different
            numbers of columns, if the label type has no entry in
            ``_FN_UNIQUE_LABELS``, or if string and non-string labels are
            mixed.
        """
        ys = op.ys
        ctx = get_context()

        # one ``type_of_target`` tensor per input
        target_types = yield from recursive_tile([type_of_target(x) for x in ys])
        # yield chunks of target_types for execution
        chunks = list(itertools.chain(*(t.chunks for t in target_types)))
        yield chunks

        # fetch the computed type names and collapse them into a set
        ys_types = set(
            [it.item() for it in ctx.get_chunks_result([c.key for c in chunks])]
        )
        # binary mixed with multiclass is treated as multiclass
        if ys_types == {"binary", "multiclass"}:
            ys_types = {"multiclass"}

        if len(ys_types) > 1:
            raise ValueError("Mix type of y not allowed, got types %s" % ys_types)

        # exactly one type is left at this point
        label_type = ys_types.pop()

        # Check consistency for the indicator format
        if label_type == "multilabel-indicator":
            check_arrays = []
            chunks = []
            for y in ys:
                arr = yield from recursive_tile(check_array(y, accept_sparse=True))
                check_arrays.append(arr)
                chunks.extend(arr.chunks)
            yield check_arrays + chunks
            # every indicator matrix must have the same number of columns
            if len(set(arr.shape[1] for arr in check_arrays)) > 1:
                raise ValueError(
                    "Multi-label binary indicator input with "
                    "different numbers of labels"
                )

        # Get the unique set of labels
        _unique_labels = _FN_UNIQUE_LABELS.get(label_type, None)
        if not _unique_labels:
            raise ValueError("Unknown label type: %s" % repr(ys))

        labels = [_unique_labels(y) for y in ys]
        # chunks of the labels that still have to be computed
        labels_chunks = []
        # labels that are already available as plain Python values
        ys_labels = set()
        for label in labels:
            if isinstance(label, ENTITY_TYPE):
                label = yield from recursive_tile(label)
                labels_chunks.extend(label.chunks)
            else:
                ys_labels.update(label)
        yield labels_chunks
        # merge the computed per-chunk labels into the final label set
        ys_labels.update(
            itertools.chain.from_iterable(
                ctx.get_chunks_result([c.key for c in labels_chunks])
            )
        )

        # Check that we don't mix string type with number type
        if len(set(isinstance(label, str) for label in ys_labels)) > 1:
            raise ValueError("Mix of label input types (string and number)")

        # the result is a tensor built from the sorted label set
        return (yield from recursive_tile(mt.array(sorted(ys_labels))))


def unique_labels(*ys):
    """
    Extract an ordered tensor of unique labels from one or more targets.

    The following combinations are rejected:
        - multilabel targets mixed with multiclass (single label) targets
        - a label indicator matrix mixed with anything else
          (because there are no explicit labels)
        - label indicator matrices of different sizes
        - string labels mixed with integer labels

    "multiclass-multioutput" input is currently not supported either.

    Parameters
    ----------
    *ys : array-likes

    Returns
    -------
    out : tensor of shape (n_unique_labels,)
        An ordered tensor of unique labels; call ``.execute()`` to obtain
        the values.

    Raises
    ------
    ValueError
        If no target is passed.

    Examples
    --------
    >>> from mars.learn.utils.multiclass import unique_labels
    >>> unique_labels([3, 5, 5, 5, 7, 7]).execute()
    array([3, 5, 7])
    >>> unique_labels([1, 2, 3, 4], [2, 2, 3, 4]).execute()
    array([1, 2, 3, 4])
    >>> unique_labels([1, 2, 10], [5, 11]).execute()
    array([ 1,  2,  5, 10, 11])
    """
    if len(ys) == 0:
        raise ValueError("No argument has been passed.")

    # the same list object is stored on the operand and passed to its call
    targets = [*ys]
    return UniqueLabels(ys=targets)(targets)


class IsMultilabel(LearnOperand, LearnOperandMixin):
    """
    Operand producing a scalar boolean tensor that tells whether ``y`` is in
    a multilabel (indicator) format.
    """

    _op_type_ = OperandDef.IS_MULTILABEL

    # the target to inspect; either an entity or a non-entity value
    y = AnyField("y")

    def __call__(self, y):
        """
        Create the scalar boolean output tileable.

        ``y`` is registered as an input only when it is an ``ENTITY_TYPE``
        instance; otherwise the operand has no inputs.
        """
        self._output_types = [OutputType.tensor]
        inputs = [y] if isinstance(y, ENTITY_TYPE) else []
        return self.new_tileable(
            inputs, shape=(), dtype=np.dtype(bool), order=TensorOrder.C_ORDER
        )

    def _set_inputs(self, inputs):
        """Keep ``self.y`` pointing at the first input when inputs exist."""
        super()._set_inputs(inputs)
        if self._inputs:
            self.y = self._inputs[0]

    @classmethod
    def _tile(cls, op: "IsMultilabel"):
        """
        Generator computing the plain boolean answer for ``op.y``.

        The answer is delivered as the generator's return value; ``tile``
        obtains it with ``yield from``.
        """
        y = op.y

        # non-entity input is delegated to scikit-learn
        if not isinstance(y, ENTITY_TYPE):
            return sklearn_is_multilabel(y)

        ctx = get_context()

        # NOTE(review): presumably a bare ``yield`` lets the framework resolve
        # the unknown shape before continuing -- confirm
        if has_unknown_shape(y):  # pragma: no cover
            yield

        # only 2-d input with more than one column can be multilabel
        if not (hasattr(y, "shape") and y.ndim == 2 and y.shape[1] > 1):
            return False

        labels = yield from recursive_tile(mt.unique(y))
        yield labels.chunks + [labels]

        # at most two distinct values are allowed
        if len(labels) < 3:
            # bool, signed and unsigned integer dtypes qualify directly
            if y.dtype.kind in "biu":
                return True
            # float dtype qualifies only when every value is integral
            if y.dtype.kind == "f":
                is_integral_float = yield from recursive_tile(
                    mt.all(mt.equal(y.astype(int), y))
                )
                yield is_integral_float.chunks
                is_integral_float = ctx.get_chunks_result(
                    [is_integral_float.chunks[0].key]
                )[0]
                if is_integral_float:
                    return True

        return False

    @classmethod
    def tile(cls, op: "IsMultilabel"):
        """Run ``_tile`` and wrap its boolean result into a tiled tensor."""
        result = yield from cls._tile(op)
        return (yield from recursive_tile(mt.array(result)))


def is_multilabel(y):
    """
    Check if ``y`` is in a multilabel format.

    Parameters
    ----------
    y : numpy array of shape [n_samples]
        Target values.

    Returns
    -------
    out : tensor
        Scalar boolean tensor; it executes to ``True`` if ``y`` is in a
        multilabel format, else ``False``.

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn.utils.multiclass import is_multilabel
    >>> is_multilabel([0, 1, 0, 1]).execute()
    False
    >>> is_multilabel([[1], [0, 2], []]).execute()
    False
    >>> is_multilabel(mt.array([[1, 0], [0, 0]])).execute()
    True
    >>> is_multilabel(mt.array([[1], [0], [0]])).execute()
    False
    >>> is_multilabel(mt.array([[1, 0, 0]])).execute()
    True
    """
    if not isinstance(y, ENTITY_TYPE):
        # array-likes and sequences are materialized as ndarray; the operand
        # then has no tileable input, so ``None`` is passed to its call
        if hasattr(y, "__array__") or isinstance(y, Sequence):
            y = np.asarray(y)
        yt = None
    else:
        yt = y = mt.tensor(y)

    op = IsMultilabel(y=y)
    return op(yt)
class TypeOfTarget(LearnOperand, LearnOperandMixin):
    """
    Operand producing a scalar object tensor that holds the target type name
    of ``y`` (for example ``"binary"`` or ``"multiclass"``).
    """

    _op_type_ = OperandDef.TYPE_OF_TARGET

    # the target to classify; either an entity or a non-entity value
    y = AnyField("y")

    def __call__(self, y: TileableType):
        """
        Create the scalar output tileable (dtype ``object``).

        ``y`` is registered as an input only when it is an ``ENTITY_TYPE``
        instance; otherwise the operand has no inputs.
        """
        self._output_types = [OutputType.tensor]
        inputs = [y] if isinstance(y, ENTITY_TYPE) else []
        return self.new_tileable(
            inputs, shape=(), order=TensorOrder.C_ORDER, dtype=np.dtype(object)
        )

    def _set_inputs(self, inputs):
        """Keep ``self.y`` pointing at the first input when inputs exist."""
        super()._set_inputs(inputs)
        if self._inputs:
            self.y = self._inputs[0]

    @classmethod
    def _tile(cls, op: "TypeOfTarget"):
        """
        Generator computing the target type name of ``op.y`` as a string.

        The name is delivered as the generator's return value; ``tile``
        obtains it with ``yield from``. Chunk lists are yielded and their
        results are read back via ``ctx.get_chunks_result``.
        NOTE(review): presumably the tiling framework executes the yielded
        chunks before resuming this generator -- confirm.
        """
        y = op.y

        # y is ndarray
        if not isinstance(y, ENTITY_TYPE):
            return sklearn_type_of_target(y)
        else:
            # make sure y executed
            yield

        ctx = get_context()

        multilabel = yield from recursive_tile(is_multilabel(y))
        yield multilabel.chunks
        multilabel = ctx.get_chunks_result([multilabel.chunks[0].key])[0]
        if multilabel:
            return "multilabel-indicator"

        # Invalid inputs
        if y.ndim > 2:
            return "unknown"
        if y.dtype == object and len(y):
            # [[[1, 2]]] or [obj_1] and not ["label_1"]
            first_val = ctx.get_chunks_result([y.chunks[0].key])[0].flat[0]
            if not isinstance(first_val, str):
                return "unknown"

        if y.ndim == 2 and y.shape[1] == 0:
            return "unknown"  # [[]]

        if y.ndim == 2 and y.shape[1] > 1:
            suffix = "-multioutput"  # [[1, 2], [1, 2]]
        else:
            suffix = ""  # [1, 2, 3] or [[1], [2], [3]]

        if y.dtype.kind == "f":
            # check float and contains non-integer float values
            contain_float_values = yield from recursive_tile(mt.any(y != y.astype(int)))
            yield contain_float_values.chunks
            contain_float_values = ctx.get_chunks_result(
                [contain_float_values.chunks[0].key]
            )[0]
            # [.1, .2, 3] or [[.1, .2, 3]] or [[1., .2]] and not [1., 2., 3.]
            if contain_float_values:
                # NOTE(review): the tiled ``assert_all_finite`` result is not
                # yielded here -- confirm the check is actually executed
                yield from recursive_tile(assert_all_finite(y))
                return "continuous" + suffix

        unique_y = yield from recursive_tile(mt.unique(y))
        yield unique_y.chunks + [unique_y]
        if (len(unique_y) > 2) or (y.ndim >= 2 and len(y[0]) > 1):
            return "multiclass" + suffix  # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
        else:
            return "binary"  # [1, 2] or [["a"], ["b"]]

    @classmethod
    def tile(cls, op: "TypeOfTarget"):
        """Run ``_tile`` and wrap its string result into a tiled tensor."""
        result = yield from cls._tile(op)
        return (yield from recursive_tile(mt.array(result)))
def type_of_target(y):
    """
    Determine the type of data indicated by the target.

    Note that this type is the most specific type that can be inferred.
    For example:

        * ``binary`` is more specific but compatible with ``multiclass``.
        * ``multiclass`` of integers is more specific but compatible with
          ``continuous``.
        * ``multilabel-indicator`` is more specific but compatible with
          ``multiclass-multioutput``.

    Parameters
    ----------
    y : array-like

    Returns
    -------
    target_type : string
        A scalar tensor that executes to one of:

        * 'continuous': `y` is an array-like of floats that are not all
          integers, and is 1d or a column vector.
        * 'continuous-multioutput': `y` is a 2d tensor of floats that are
          not all integers, and both dimensions are of size > 1.
        * 'binary': `y` contains <= 2 discrete values and is 1d or a column
          vector.
        * 'multiclass': `y` contains more than two discrete values, is not a
          sequence of sequences, and is 1d or a column vector.
        * 'multiclass-multioutput': `y` is a 2d tensor that contains more
          than two discrete values, is not a sequence of sequences, and both
          dimensions are of size > 1.
        * 'multilabel-indicator': `y` is a label indicator matrix, a tensor
          of two dimensions with at least two columns, and at most 2 unique
          values.
        * 'unknown': `y` is array-like but none of the above, such as a 3d
          tensor, sequence of sequences, or a tensor of non-sequence objects.

    Raises
    ------
    ValueError
        If ``y`` is not array-like (or is a string), or if its class is named
        ``SparseSeries`` or ``SparseArray``.

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn.utils.multiclass import type_of_target
    >>> type_of_target([0.1, 0.6]).execute()
    'continuous'
    >>> type_of_target([1, -1, -1, 1]).execute()
    'binary'
    >>> type_of_target(['a', 'b', 'a']).execute()
    'binary'
    >>> type_of_target([1.0, 2.0]).execute()
    'binary'
    >>> type_of_target([1, 0, 2]).execute()
    'multiclass'
    >>> type_of_target([1.0, 0.0, 3.0]).execute()
    'multiclass'
    >>> type_of_target(['a', 'b', 'c']).execute()
    'multiclass'
    >>> type_of_target(mt.array([[1, 2], [3, 1]])).execute()
    'multiclass-multioutput'
    >>> type_of_target([[1, 2]]).execute()
    'multiclass-multioutput'
    >>> type_of_target(mt.array([[1.5, 2.0], [3.0, 1.6]])).execute()
    'continuous-multioutput'
    >>> type_of_target(mt.array([[0, 1], [1, 1]])).execute()
    'multilabel-indicator'
    """
    if isinstance(y, TENSOR_TYPE):
        y = mt.tensor(y)

    # accepted inputs: sequences, scipy sparse matrices and anything that
    # exposes ``__array__`` / ``__mars_tensor__`` -- but never a plain string
    valid_types = (Sequence, spmatrix) if spmatrix is not None else (Sequence,)
    valid = (
        isinstance(y, valid_types)
        or hasattr(y, "__array__")
        or hasattr(y, "__mars_tensor__")
    ) and not isinstance(y, str)

    if not valid:
        raise ValueError(f"Expected array-like (array or non-string sequence), got {y}")

    # the class is matched by name only
    sparse_pandas = y.__class__.__name__ in ["SparseSeries", "SparseArray"]
    if sparse_pandas:  # pragma: no cover
        raise ValueError("y cannot be class 'SparseSeries' or 'SparseArray'")

    if isinstance(y, ENTITY_TYPE):
        y = mt.tensor(y)

    op = TypeOfTarget(y=y)
    return op(y)
def check_classification_targets(y):
    """
    Ensure that target y is of a non-regression type.

    Only the following target types (as defined in type_of_target) are allowed:
        'binary', 'multiclass', 'multiclass-multioutput',
        'multilabel-indicator', 'multilabel-sequences'

    Parameters
    ----------
    y : array-like

    Returns
    -------
    y_type
        The result of ``type_of_target(y)`` with the validation mapped over
        its chunks; ``check`` raises ``ValueError`` for any other target type.
    """
    y_type = type_of_target(y)

    def check(t):
        # reject every target type that is not a classification type
        if t not in [
            "binary",
            "multiclass",
            "multiclass-multioutput",
            "multilabel-indicator",
            "multilabel-sequences",
        ]:
            raise ValueError("Unknown label type: %r" % t)
        return t

    y_type = y_type.map_chunk(check, dtype=y_type.dtype)
    return y_type