Source code for mars.learn.metrics._classification

# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings

import numpy as np
from sklearn.exceptions import UndefinedMetricWarning

from ... import execute, fetch
from ... import opcodes as OperandDef
from ... import tensor as mt
from ...core import ENTITY_TYPE, recursive_tile
from ...core.context import get_context
from ...serialization.serializables import AnyField, BoolField, KeyField
from ...tensor.core import TensorOrder
from ..operands import LearnOperand, LearnOperandMixin, OutputType
from ..preprocessing import LabelBinarizer, LabelEncoder
from ..utils import check_array, check_consistent_length, column_or_1d
from ..utils.multiclass import unique_labels
from ..utils.sparsefuncs import count_nonzero
from ._check_targets import _check_targets


class AccuracyScore(LearnOperand, LearnOperandMixin):
    _op_type_ = OperandDef.ACCURACY_SCORE

    _y_true = AnyField("y_true")
    _y_pred = AnyField("y_pred")
    _normalize = BoolField("normalize")
    _sample_weight = AnyField("sample_weight")
    _type_true = KeyField("type_true")

    def __init__(
        self,
        y_true=None,
        y_pred=None,
        normalize=None,
        sample_weight=None,
        type_true=None,
        **kw
    ):
        super().__init__(
            _y_true=y_true,
            _y_pred=y_pred,
            _normalize=normalize,
            _sample_weight=sample_weight,
            _type_true=type_true,
            **kw
        )
        self.output_types = [OutputType.tensor]

    @property
    def y_true(self):
        return self._y_true

    @property
    def y_pred(self):
        return self._y_pred

    @property
    def normalize(self):
        return self._normalize

    @property
    def sample_weight(self):
        return self._sample_weight

    @property
    def type_true(self):
        return self._type_true

    def _set_inputs(self, inputs):
        super()._set_inputs(inputs)
        inputs_iter = iter(self._inputs)
        if self._y_true is not None:
            self._y_true = next(inputs_iter)
        if self._y_pred is not None:
            self._y_pred = next(inputs_iter)
        if self._type_true is not None:
            self._type_true = next(inputs_iter)
        if isinstance(self._sample_weight, ENTITY_TYPE):
            self._sample_weight = next(inputs_iter)

    def __call__(self, y_true, y_pred):
        type_true, y_true, y_pred = _check_targets(y_true, y_pred)
        self._type_true = type_true
        inputs = [y_true, y_pred, type_true]
        if isinstance(self._sample_weight, ENTITY_TYPE):
            inputs.append(self._sample_weight)

        dtype = np.dtype(float) if self._normalize else np.result_type(y_true, y_pred)
        return self.new_tileable(
            inputs, dtype=dtype, shape=(), order=TensorOrder.C_ORDER
        )

    @classmethod
    def tile(cls, op):
        # make sure type_true is executed first
        chunks = [op.type_true.chunks[0]]
        yield chunks

        ctx = get_context()
        type_true = ctx.get_chunks_result([chunks[0].key])[0]

        y_true, y_pred = op.y_true, op.y_pred
        if type_true.item().startswith("multilabel"):
            differing_labels = mt.count_nonzero(y_true - y_pred, axis=1)
            score = mt.equal(differing_labels, 0)
        else:
            score = mt.equal(y_true, y_pred)

        result = _weighted_sum(score, op.sample_weight, op.normalize)
        return [(yield from recursive_tile(result))]


def _weighted_sum(sample_score, sample_weight, normalize=False):
    if normalize:
        return mt.average(sample_score, weights=sample_weight)
    elif sample_weight is not None:
        return mt.dot(sample_score, sample_weight)
    else:
        return sample_score.sum()
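

# Illustrative sketch (not part of the original module): the three branches of
# `_weighted_sum` mirror plain NumPy semantics -- a (weighted) mean when
# ``normalize`` is True, a weighted count via a dot product when only
# ``sample_weight`` is given, and a plain count otherwise.  The demo function
# below is hypothetical and exists purely to illustrate those semantics.
def _demo_weighted_sum_semantics():
    import numpy as np

    score = np.array([1, 0, 1, 1], dtype=float)   # per-sample correctness
    weight = np.array([1, 2, 1, 1], dtype=float)

    assert np.average(score, weights=weight) == 3 / 5  # normalize=True
    assert np.dot(score, weight) == 3                  # normalize=False, weighted
    assert score.sum() == 3                            # normalize=False, unweighted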


def accuracy_score(
    y_true, y_pred, normalize=True, sample_weight=None, session=None, run_kwargs=None
):
    """Accuracy classification score.

    In multilabel classification, this function computes subset accuracy:
    the set of labels predicted for a sample must *exactly* match the
    corresponding set of labels in y_true.

    Read more in the :ref:`User Guide <accuracy_score>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator tensor / sparse tensor
        Ground truth (correct) labels.

    y_pred : 1d array-like, or label indicator tensor / sparse tensor
        Predicted labels, as returned by a classifier.

    normalize : bool, optional (default=True)
        If ``False``, return the number of correctly classified samples.
        Otherwise, return the fraction of correctly classified samples.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    score : float
        If ``normalize == True``, return the fraction of correctly
        classified samples (float), else returns the number of correctly
        classified samples (int).

        The best performance is 1 with ``normalize == True`` and the number
        of samples with ``normalize == False``.

    See also
    --------
    jaccard_score, hamming_loss, zero_one_loss

    Notes
    -----
    In binary and multiclass classification, this function is equal
    to the ``jaccard_score`` function.

    Examples
    --------
    >>> from mars.learn.metrics import accuracy_score
    >>> y_pred = [0, 2, 1, 3]
    >>> y_true = [0, 1, 2, 3]
    >>> accuracy_score(y_true, y_pred).execute()
    0.5
    >>> accuracy_score(y_true, y_pred, normalize=False).execute()
    2

    In the multilabel case with binary label indicators:

    >>> import mars.tensor as mt
    >>> accuracy_score(mt.array([[0, 1], [1, 1]]), mt.ones((2, 2))).execute()
    0.5
    """
    # Compute accuracy for each possible representation
    op = AccuracyScore(
        y_true=y_true, y_pred=y_pred, normalize=normalize, sample_weight=sample_weight
    )
    score = op(y_true, y_pred)
    return score.execute(session=session, **(run_kwargs or dict()))
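

# Illustrative sketch (not part of the original module): in the multilabel
# branch of AccuracyScore.tile above, a sample only counts as correct when
# *every* label matches, i.e. when its row of ``y_true - y_pred`` has no
# nonzero entries.  The hypothetical helper below reproduces the subset
# accuracy of 0.5 from the docstring example with plain NumPy.
def _demo_subset_accuracy():
    import numpy as np

    y_true = np.array([[0, 1], [1, 1]])
    y_pred = np.ones((2, 2))
    differing = np.count_nonzero(y_true - y_pred, axis=1)  # [1, 0]
    return (differing == 0).mean()                         # 0.5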


def log_loss(
    y_true, y_pred, *, eps=1e-15, normalize=True, sample_weight=None, labels=None
):
    r"""Log loss, aka logistic loss or cross-entropy loss.

    This is the loss function used in (multinomial) logistic regression
    and extensions of it such as neural networks, defined as the negative
    log-likelihood of a logistic model that returns ``y_pred`` probabilities
    for its training data ``y_true``. The log loss is only defined for two
    or more labels. For a single sample with true label :math:`y \in \{0,1\}`
    and a probability estimate :math:`p = \operatorname{Pr}(y = 1)`, the log
    loss is:

    .. math::
        L_{\log}(y, p) = -(y \log (p) + (1 - y) \log (1 - p))

    Read more in the :ref:`User Guide <log_loss>`.

    Parameters
    ----------
    y_true : array-like or label indicator matrix
        Ground truth (correct) labels for n_samples samples.

    y_pred : array-like of float, shape = (n_samples, n_classes) or (n_samples,)
        Predicted probabilities, as returned by a classifier's
        predict_proba method. If ``y_pred.shape = (n_samples,)``
        the probabilities provided are assumed to be that of the
        positive class. The labels in ``y_pred`` are assumed to be
        ordered alphabetically, as done by
        :class:`preprocessing.LabelBinarizer`.

    eps : float, default=1e-15
        Log loss is undefined for p=0 or p=1, so probabilities are
        clipped to max(eps, min(1 - eps, p)).

    normalize : bool, default=True
        If true, return the mean loss per sample.
        Otherwise, return the sum of the per-sample losses.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    labels : array-like, default=None
        If not provided, labels will be inferred from y_true. If ``labels``
        is ``None`` and ``y_pred`` has shape (n_samples,) the labels are
        assumed to be binary and are inferred from ``y_true``.

    Returns
    -------
    loss : float

    Notes
    -----
    The logarithm used is the natural logarithm (base-e).

    Examples
    --------
    >>> from mars.learn.metrics import log_loss
    >>> log_loss(["spam", "ham", "ham", "spam"],
    ...          [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])
    0.21616...

    References
    ----------
    C.M. Bishop (2006). Pattern Recognition and Machine Learning. Springer,
    p. 209.
    """
    y_pred = check_array(y_pred, ensure_2d=False)
    check_consistent_length(y_pred, y_true, sample_weight)

    lb = LabelBinarizer()

    if labels is not None:
        lb.fit(labels)
    else:
        lb.fit(y_true)

    if len(lb.classes_) == 1:
        if labels is None:
            raise ValueError(
                "y_true contains only one label ({0}). Please "
                "provide the true labels explicitly through the "
                "labels argument.".format(lb.classes_[0].fetch())
            )
        else:
            raise ValueError(
                "The labels array needs to contain at least two "
                "labels for log_loss, "
                "got {0}.".format(lb.classes_.fetch())
            )

    transformed_labels = lb.transform(y_true)

    if transformed_labels.shape[1] == 1:
        transformed_labels = mt.append(
            1 - transformed_labels, transformed_labels, axis=1
        )

    # Clipping
    y_pred = mt.clip(y_pred, eps, 1 - eps)

    # If y_pred is of single dimension, assume y_true to be binary
    # and then check.
    if y_pred.ndim == 1:  # pragma: no cover
        y_pred = y_pred[:, mt.newaxis]
    if y_pred.shape[1] == 1:  # pragma: no cover
        y_pred = mt.append(1 - y_pred, y_pred, axis=1)

    # Check if dimensions are consistent.
    transformed_labels = check_array(transformed_labels)
    if len(lb.classes_) != y_pred.shape[1]:
        if labels is None:
            raise ValueError(
                "y_true and y_pred contain different number of "
                "classes {0}, {1}. Please provide the true "
                "labels explicitly through the labels argument. "
                "Classes found in "
                "y_true: {2}".format(
                    transformed_labels.shape[1], y_pred.shape[1], lb.classes_.fetch()
                )
            )
        else:
            raise ValueError(
                "The number of classes in labels is different "
                "from that in y_pred. Classes found in "
                "labels: {0}".format(lb.classes_.fetch())
            )

    # Renormalize
    y_pred /= y_pred.sum(axis=1)[:, mt.newaxis]
    loss = -(transformed_labels * mt.log(y_pred)).sum(axis=1)

    return _weighted_sum(loss, sample_weight, normalize).execute()


def multilabel_confusion_matrix(
    y_true,
    y_pred,
    *,
    sample_weight=None,
    labels=None,
    samplewise=False,
    session=None,
    run_kwargs=None
):
    """
    Compute a confusion matrix for each class or sample.

    Compute class-wise (default) or sample-wise (samplewise=True) multilabel
    confusion matrix to evaluate the accuracy of a classification, and output
    confusion matrices for each class or sample.

    In multilabel confusion matrix :math:`MCM`, the count of true negatives
    is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`,
    true positives is :math:`MCM_{:,1,1}` and false positives is
    :math:`MCM_{:,0,1}`.

    Multiclass data will be treated as if binarized under a one-vs-rest
    transformation. Returned confusion matrices will be in the order of
    sorted unique labels in the union of (y_true, y_pred).

    Read more in the :ref:`User Guide <multilabel_confusion_matrix>`.

    Parameters
    ----------
    y_true : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \
            (n_samples,)
        Ground truth (correct) target values.

    y_pred : {array-like, sparse matrix} of shape (n_samples, n_outputs) or \
            (n_samples,)
        Estimated targets as returned by a classifier.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    labels : array-like of shape (n_classes,), default=None
        A list of classes or column indices to select some (or to force
        inclusion of classes absent from the data).

    samplewise : bool, default=False
        In the multilabel case, this calculates a confusion matrix per sample.

    Returns
    -------
    multi_confusion : ndarray of shape (n_outputs, 2, 2)
        A 2x2 confusion matrix corresponding to each output in the input.
        When calculating class-wise multi_confusion (default), then
        n_outputs = n_labels; when calculating sample-wise multi_confusion
        (samplewise=True), n_outputs = n_samples. If ``labels`` is defined,
        the results will be returned in the order specified in ``labels``,
        otherwise the results will be returned in sorted order by default.

    See Also
    --------
    confusion_matrix : Compute confusion matrix to evaluate the accuracy of a
        classifier.

    Notes
    -----
    The `multilabel_confusion_matrix` calculates class-wise or sample-wise
    multilabel confusion matrices, and in multiclass tasks, labels are
    binarized under a one-vs-rest way; while
    :func:`~sklearn.metrics.confusion_matrix` calculates one confusion matrix
    for confusion between every two classes.

    Examples
    --------
    Multiclass case:

    >>> import mars.tensor as mt
    >>> from mars.learn.metrics import multilabel_confusion_matrix
    >>> y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
    >>> y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
    >>> multilabel_confusion_matrix(y_true, y_pred,
    ...                             labels=["ant", "bird", "cat"])
    array([[[3, 1],
            [0, 2]],
    <BLANKLINE>
           [[5, 0],
            [1, 0]],
    <BLANKLINE>
           [[2, 1],
            [1, 2]]])

    Multilabel-indicator case not implemented yet.
    """
    exec_kw = dict(session=session, **(run_kwargs or dict()))
    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    execute(y_type, y_true, y_pred, **exec_kw)
    y_type = y_type.fetch()
    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)
    check_consistent_length(y_true, y_pred, sample_weight, **exec_kw)

    if y_type not in ("binary", "multiclass", "multilabel-indicator"):
        raise ValueError("%s is not supported" % y_type)

    present_labels = unique_labels(y_true, y_pred)
    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        labels = mt.tensor(labels)
        n_labels = labels.shape[0]
        # todo simplify this when mt.setdiff1d is implemented
        labels = labels.rechunk(((np.nan,),)).map_chunk(
            lambda l, pl: np.hstack([l, np.setdiff1d(pl, l, assume_unique=True)]),
            args=(present_labels,),
            dtype=labels.dtype,
            shape=(np.nan,),
        )

    if y_true.ndim == 1:
        if samplewise:
            raise ValueError(
                "Samplewise metrics are not available outside of "
                "multilabel classification."
            )

        le = LabelEncoder()
        le.fit(labels, execute=False)
        y_true = le.transform(y_true, execute=False)
        y_pred = le.transform(y_pred, execute=False)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]
        execute(labels, y_true, y_pred, tp_bins, **exec_kw)

        if sample_weight is not None:
            tp_bins_weights = mt.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if tp_bins.shape[0]:
            tp_sum = mt.bincount(
                tp_bins, weights=tp_bins_weights, minlength=labels.shape[0]
            )
        else:
            # Pathological case
            true_sum = pred_sum = tp_sum = mt.zeros(labels.shape[0])
        if y_pred.shape[0]:
            pred_sum = mt.bincount(
                y_pred, weights=sample_weight, minlength=labels.shape[0]
            )
        if y_true.shape[0]:
            true_sum = mt.bincount(
                y_true, weights=sample_weight, minlength=labels.shape[0]
            )

        # Retain only selected labels
        indices = mt.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]
    else:
        sum_axis = 1 if samplewise else 0

        def _check_labels(labels, present_labels):
            # All labels are index integers for multilabel.
            # Select labels:
            if not np.array_equal(labels, present_labels):
                if np.max(labels) > np.max(present_labels):
                    raise ValueError(
                        "All labels must be in [0, n labels) for "
                        "multilabel targets. "
                        "Got %d > %d" % (np.max(labels), np.max(present_labels))
                    )
                if np.min(labels) < 0:
                    raise ValueError(
                        "All labels must be in [0, n labels) for "
                        "multilabel targets. "
                        "Got %d < 0" % np.min(labels)
                    )
            return labels

        labels = labels.map_chunk(
            _check_labels,
            args=(present_labels,),
            dtype=labels.dtype,
            shape=labels.shape,
        )

        if n_labels is not None:
            y_true = y_true[:, labels[:n_labels]]
            y_pred = y_pred[:, labels[:n_labels]]

        # calculate weighted counts
        true_and_pred = mt.multiply(y_true, y_pred)
        tp_sum = count_nonzero(
            true_and_pred, axis=sum_axis, sample_weight=sample_weight
        )
        pred_sum = count_nonzero(y_pred, axis=sum_axis, sample_weight=sample_weight)
        true_sum = count_nonzero(y_true, axis=sum_axis, sample_weight=sample_weight)

    fp = pred_sum - tp_sum
    fn = true_sum - tp_sum
    tp = tp_sum

    # we need to obtain correct shape of y_true for further computation
    executables = (fp, fn, tp, y_true)
    execute(*executables, **exec_kw)

    if sample_weight is not None and samplewise:
        sample_weight = mt.asarray(sample_weight)
        tp = mt.asarray(tp)
        fp = mt.asarray(fp)
        fn = mt.asarray(fn)
        tn = sample_weight * y_true.shape[1] - tp - fp - fn
    elif sample_weight is not None:
        tn = sum(sample_weight) - tp - fp - fn
    elif samplewise:
        tn = y_true.shape[1] - tp - fp - fn
    else:
        tn = y_true.shape[0] - tp - fp - fn

    ret = mt.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2)
    return ret.execute(**exec_kw)
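

# Illustrative sketch (not part of the original module): for multiclass input,
# each class is binarized one-vs-rest and counted into a 2x2 matrix laid out
# as [[tn, fp], [fn, tp]].  The hypothetical helper below reproduces the
# docstring example above with plain NumPy.
def _demo_one_vs_rest_counts():
    import numpy as np

    y_true = np.array(["cat", "ant", "cat", "cat", "ant", "bird"])
    y_pred = np.array(["ant", "ant", "cat", "cat", "ant", "cat"])

    matrices = []
    for label in ["ant", "bird", "cat"]:
        t, p = y_true == label, y_pred == label
        tp = np.sum(t & p)
        fp = np.sum(~t & p)
        fn = np.sum(t & ~p)
        tn = np.sum(~t & ~p)
        matrices.append([[tn, fp], [fn, tp]])
    # array([[[3, 1], [0, 2]], [[5, 0], [1, 0]], [[2, 1], [1, 2]]])
    return np.array(matrices)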


def _check_zero_division(zero_division):  # pragma: no cover
    if isinstance(zero_division, str) and zero_division == "warn":
        return
    elif isinstance(zero_division, (int, float)) and zero_division in [0, 1]:
        return
    raise ValueError(
        "Got zero_division={0}." ' Must be one of ["warn", 0, 1]'.format(zero_division)
    )


def _warn_prf(average, modifier, msg_start, result_size):  # pragma: no cover
    axis0, axis1 = "sample", "label"
    if average == "samples":
        axis0, axis1 = axis1, axis0
    msg = (
        "{0} ill-defined and being set to 0.0 {{0}} "
        "no {1} {2}s. Use `zero_division` parameter to control"
        " this behavior.".format(msg_start, modifier, axis0)
    )
    if result_size == 1:
        msg = msg.format("due to")
    else:
        msg = msg.format("in {0}s with".format(axis1))
    warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)


def _prf_divide(
    numerator, denominator, metric, modifier, average, warn_for, zero_division="warn"
):  # pragma: no cover
    """Performs division and handles divide-by-zero.

    On zero-division, sets the corresponding result elements equal to
    0 or 1 (according to ``zero_division``). Plus, if
    ``zero_division != "warn"`` raises a warning.

    The metric, modifier and average arguments are used only for determining
    an appropriate warning.
    """
    mask = denominator == 0.0
    denominator = denominator.copy()
    denominator[mask] = 1  # avoid infs/nans
    result = numerator / denominator

    # if ``zero_division=1``, set those with denominator == 0 equal to 1
    result[mask] = 0.0 if zero_division in ["warn", 0] else 1.0

    # the user will be removing warnings if zero_division is set to something
    # different than its default value. If we are computing only f-score
    # the warning will be raised only if precision and recall are ill-defined
    if zero_division != "warn" or metric not in warn_for:
        return result

    # build appropriate warning
    # E.g. "Precision and F-score are ill-defined and being set to 0.0 in
    # labels with no predicted samples. Use ``zero_division`` parameter to
    # control this behavior."
    if metric in warn_for and "f-score" in warn_for:
        msg_start = "{0} and F-score are".format(metric.title())
    elif metric in warn_for:
        msg_start = "{0} is".format(metric.title())
    elif "f-score" in warn_for:
        msg_start = "F-score is"
    else:
        return result

    _warn_prf(average, modifier, msg_start, len(result))

    return result


def _check_set_wise_labels(
    y_true, y_pred, average, labels, pos_label, session=None, run_kwargs=None
):  # pragma: no cover
    """Validation associated with set-wise metrics.

    Returns identified labels.
    """
    exec_kwargs = dict(session=session, **(run_kwargs or dict()))

    average_options = (None, "micro", "macro", "weighted", "samples")
    if average not in average_options and average != "binary":
        raise ValueError("average has to be one of " + str(average_options))

    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    present_labels = unique_labels(y_true, y_pred)
    execute(y_type, y_true, y_pred, **exec_kwargs)
    y_type = y_type.fetch(**exec_kwargs)
    if average == "binary":
        if y_type == "binary":
            t_pos_in_labels = mt.any(mt.isin(present_labels, pos_label))
            execute(t_pos_in_labels, present_labels, **exec_kwargs)
            pos_in_labels = t_pos_in_labels.fetch(**exec_kwargs)
            if pos_in_labels:
                if present_labels.shape[0] >= 2:
                    raise ValueError(
                        "pos_label=%r is not a valid label: "
                        "%r" % (pos_label, present_labels)
                    )
            labels = [pos_label]
        else:
            average_options = list(average_options)
            if y_type == "multiclass":
                average_options.remove("samples")
            raise ValueError(
                "Target is %s but average='binary'. Please "
                "choose another average setting, one of %r."
                % (y_type, average_options)
            )
    elif pos_label not in (None, 1):
        warnings.warn(
            "Note that pos_label (set to %r) is ignored when "
            "average != 'binary' (got %r). You may use "
            "labels=[pos_label] to specify a single positive class."
            % (pos_label, average),
            UserWarning,
        )
    return labels
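

# Illustrative sketch (not part of the original module): `_prf_divide` above
# is an element-wise division in which positions with a zero denominator are
# forced to 0.0 (for zero_division in {"warn", 0}) or 1.0 (for zero_division=1)
# instead of producing inf/nan.  A minimal NumPy rendering of that rule, with
# a hypothetical name:
def _demo_prf_divide_rule(numerator, denominator, zero_division="warn"):
    import numpy as np

    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float).copy()
    mask = denominator == 0.0
    denominator[mask] = 1.0  # avoid inf/nan
    result = numerator / denominator
    result[mask] = 0.0 if zero_division in ("warn", 0) else 1.0
    return result
# e.g. _demo_prf_divide_rule([1, 0], [2, 0]) -> array([0.5, 0. ])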


def precision_recall_fscore_support(
    y_true,
    y_pred,
    *,
    beta=1.0,
    labels=None,
    pos_label=1,
    average=None,
    warn_for=("precision", "recall", "f-score"),
    sample_weight=None,
    zero_division="warn",
    session=None,
    run_kwargs=None
):
    """Compute precision, recall, F-measure and support for each class.

    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the classifier not to label as positive a sample
    that is negative.

    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the classifier to find all the positive samples.

    The F-beta score can be interpreted as a weighted harmonic mean of the
    precision and recall, where an F-beta score reaches its best value at 1
    and worst score at 0.

    The F-beta score weights recall more than precision by a factor of
    ``beta``. ``beta == 1.0`` means recall and precision are equally important.

    The support is the number of occurrences of each class in ``y_true``.

    If ``pos_label is None`` and in binary classification, this function
    returns the average precision, recall and F-measure if ``average`` is one
    of ``'micro'``, ``'macro'``, ``'weighted'`` or ``'samples'``.

    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    beta : float, 1.0 by default
        The strength of recall versus precision in the F-score.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, 1 by default
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass or multilabel, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : string, [None (default), 'binary', 'micro', 'macro', 'samples', \
                       'weighted']
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    warn_for : tuple or set, for internal use
        This determines which warnings will be made in the case that this
        function is being used to return only one of its metrics.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    zero_division : "warn", 0 or 1, default="warn"
        Sets the value to return when there is a zero division:
           - recall: when there are no positive labels
           - precision: when there are no positive predictions
           - f-score: both

        If set to "warn", this acts as 0, but warnings are also raised.

    Returns
    -------
    precision : float (if average is not None) or array of float, shape =\
        [n_unique_labels]

    recall : float (if average is not None) or array of float, shape =\
        [n_unique_labels]

    fbeta_score : float (if average is not None) or array of float, shape =\
        [n_unique_labels]

    support : None (if average is not None) or array of int, shape =\
        [n_unique_labels]
        The number of occurrences of each label in ``y_true``.

    References
    ----------
    .. [1] `Wikipedia entry for the Precision and recall
           <https://en.wikipedia.org/wiki/Precision_and_recall>`_

    .. [2] `Wikipedia entry for the F1-score
           <https://en.wikipedia.org/wiki/F1_score>`_

    .. [3] `Discriminative Methods for Multi-labeled Classification Advances
           in Knowledge Discovery and Data Mining (2004), pp. 22-30 by Shantanu
           Godbole, Sunita Sarawagi
           <http://www.godbole.net/shantanu/pubs/multilabelsvm-pakdd04.pdf>`_

    Examples
    --------
    >>> import numpy as np
    >>> from mars.learn.metrics import precision_recall_fscore_support
    >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])
    >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])
    >>> precision_recall_fscore_support(y_true, y_pred, average='macro')
    (0.22..., 0.33..., 0.26..., None)
    >>> precision_recall_fscore_support(y_true, y_pred, average='micro')
    (0.33..., 0.33..., 0.33..., None)
    >>> precision_recall_fscore_support(y_true, y_pred, average='weighted')
    (0.22..., 0.33..., 0.26..., None)

    It is possible to compute per-label precisions, recalls, F1-scores and
    supports instead of averaging:

    >>> precision_recall_fscore_support(y_true, y_pred, average=None,
    ...                                 labels=['pig', 'dog', 'cat'])
    (array([0.        , 0.        , 0.66...]),
     array([0., 0., 1.]), array([0. , 0. , 0.8]),
     array([2, 2, 2]))

    Notes
    -----
    When ``true positive + false positive == 0``, precision is undefined;
    When ``true positive + false negative == 0``, recall is undefined.
    In such cases, by default the metric will be set to 0, as will f-score,
    and ``UndefinedMetricWarning`` will be raised. This behavior can be
    modified with ``zero_division``.
    """
    exec_kw = dict(session=session, **(run_kwargs or dict()))
    _check_zero_division(zero_division)
    if beta < 0:
        raise ValueError("beta should be >=0 in the F-beta score")
    labels = _check_set_wise_labels(
        y_true,
        y_pred,
        average,
        labels,
        pos_label,
        session=session,
        run_kwargs=run_kwargs,
    )

    # Calculate tp_sum, pred_sum, true_sum ###
    samplewise = average == "samples"
    MCM = multilabel_confusion_matrix(
        y_true,
        y_pred,
        sample_weight=sample_weight,
        labels=labels,
        samplewise=samplewise,
        session=session,
        run_kwargs=run_kwargs,
    )
    tp_sum = MCM[:, 1, 1]
    pred_sum = tp_sum + MCM[:, 0, 1]
    true_sum = tp_sum + MCM[:, 1, 0]

    if average == "micro":
        tp_sum = mt.array([tp_sum.sum()])
        pred_sum = mt.array([pred_sum.sum()])
        true_sum = mt.array([true_sum.sum()])
    execute(true_sum, **exec_kw)

    # Finally, we have all our sufficient statistics. Divide! #
    beta2 = beta ** 2

    # Divide, and on zero-division, set scores and/or warn according to
    # zero_division:
    precision = _prf_divide(
        tp_sum, pred_sum, "precision", "predicted", average, warn_for, zero_division
    )
    recall = _prf_divide(
        tp_sum, true_sum, "recall", "true", average, warn_for, zero_division
    )

    # warn for f-score only if zero_division is warn, it is in warn_for
    # and BOTH prec and rec are ill-defined
    if zero_division == "warn" and ("f-score",) == warn_for:
        any_pred_sum_zero = (
            (pred_sum[true_sum == 0] == 0).any().execute(**exec_kw).fetch(**exec_kw)
        )
        if any_pred_sum_zero:
            _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))

    # if tp == 0 F will be 1 only if all predictions are zero, all labels are
    # zero, and zero_division=1. In all other case, 0
    if np.isposinf(beta):
        f_score = recall
    else:
        denom = beta2 * precision + recall

        denom[denom == 0.0] = 1  # avoid division by 0
        f_score = (1 + beta2) * precision * recall / denom

    # Average the results
    if average == "weighted":
        weights = true_sum
        sum_weights, sum_pred_sum = fetch(
            execute(weights.sum(), pred_sum.sum(), **exec_kw), **exec_kw
        )
        if sum_weights == 0:
            zero_division_value = 0.0 if zero_division in ["warn", 0] else 1.0
            # precision is zero_division if there are no positive predictions
            # recall is zero_division if there are no positive labels
            # fscore is zero_division if all labels AND predictions are
            # negative
            return (
                mt.scalar(zero_division_value if sum_pred_sum == 0 else 0),
                mt.scalar(zero_division_value),
                mt.scalar(zero_division_value if sum_pred_sum == 0 else 0),
                None,
            )
    elif average == "samples":
        weights = sample_weight
    else:
        weights = None

    if average is not None:
        assert average != "binary" or len(precision) == 1
        precision = mt.average(precision, weights=weights)
        recall = mt.average(recall, weights=weights)
        f_score = mt.average(f_score, weights=weights)
        true_sum = None  # return no support

    return precision, recall, f_score, true_sum
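

# Illustrative sketch (not part of the original module): once precision and
# recall are known, the F-beta combination above reduces to the closed form
# F_beta = (1 + beta^2) * P * R / (beta^2 * P + R), with the same
# "treat 0/0 as 0" convention.  The hypothetical helper below applies it
# element-wise with plain NumPy.
def _demo_fbeta_from_pr(precision, recall, beta=1.0):
    import numpy as np

    precision = np.asarray(precision, dtype=float)
    recall = np.asarray(recall, dtype=float)
    beta2 = beta ** 2
    denom = beta2 * precision + recall
    denom[denom == 0.0] = 1.0  # avoid division by zero
    return (1 + beta2) * precision * recall / denom
# e.g. _demo_fbeta_from_pr([2 / 3, 0.0, 0.0], [1.0, 0.0, 0.0])
# -> array([0.8, 0. , 0. ])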


def precision_score(
    y_true,
    y_pred,
    *,
    labels=None,
    pos_label=1,
    average="binary",
    sample_weight=None,
    zero_division="warn"
):
    """Compute the precision

    The precision is the ratio ``tp / (tp + fp)`` where ``tp`` is the number of
    true positives and ``fp`` the number of false positives. The precision is
    intuitively the ability of the classifier not to label as positive a sample
    that is negative.

    The best value is 1 and the worst value is 0.

    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, 1 by default
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass or multilabel, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
                       'weighted']
        This parameter is required for multiclass/multilabel targets.
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    zero_division : "warn", 0 or 1, default="warn"
        Sets the value to return when there is a zero division. If set to
        "warn", this acts as 0, but warnings are also raised.

    Returns
    -------
    precision : float (if average is not None) or array of float, shape =\
        [n_unique_labels]
        Precision of the positive class in binary classification or weighted
        average of the precision of each class for the multiclass task.

    See also
    --------
    precision_recall_fscore_support, multilabel_confusion_matrix

    Examples
    --------
    >>> from mars.learn.metrics import precision_score
    >>> y_true = [0, 1, 2, 0, 1, 2]
    >>> y_pred = [0, 2, 1, 0, 0, 1]
    >>> precision_score(y_true, y_pred, average='macro')
    0.22...
    >>> precision_score(y_true, y_pred, average='micro')
    0.33...
    >>> precision_score(y_true, y_pred, average='weighted')
    0.22...
    >>> precision_score(y_true, y_pred, average=None)
    array([0.66..., 0.        , 0.        ])
    >>> y_pred = [0, 0, 0, 0, 0, 0]
    >>> precision_score(y_true, y_pred, average=None)
    array([0.33..., 0.        , 0.        ])
    >>> precision_score(y_true, y_pred, average=None, zero_division=1)
    array([0.33..., 1.        , 1.        ])

    Notes
    -----
    When ``true positive + false positive == 0``, precision returns 0 and
    raises ``UndefinedMetricWarning``. This behavior can be modified with
    ``zero_division``.
    """
    p, _, _, _ = precision_recall_fscore_support(
        y_true,
        y_pred,
        labels=labels,
        pos_label=pos_label,
        average=average,
        warn_for=("precision",),
        sample_weight=sample_weight,
        zero_division=zero_division,
    )
    return p


def recall_score(
    y_true,
    y_pred,
    *,
    labels=None,
    pos_label=1,
    average="binary",
    sample_weight=None,
    zero_division="warn"
):
    """Compute the recall

    The recall is the ratio ``tp / (tp + fn)`` where ``tp`` is the number of
    true positives and ``fn`` the number of false negatives. The recall is
    intuitively the ability of the classifier to find all the positive samples.

    The best value is 1 and the worst value is 0.

    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, 1 by default
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass or multilabel, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
                       'weighted']
        This parameter is required for multiclass/multilabel targets.
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    zero_division : "warn", 0 or 1, default="warn"
        Sets the value to return when there is a zero division. If set to
        "warn", this acts as 0, but warnings are also raised.

    Returns
    -------
    recall : float (if average is not None) or array of float, shape =\
        [n_unique_labels]
        Recall of the positive class in binary classification or weighted
        average of the recall of each class for the multiclass task.

    See also
    --------
    precision_recall_fscore_support, balanced_accuracy_score,
    multilabel_confusion_matrix

    Examples
    --------
    >>> from mars.learn.metrics import recall_score
    >>> y_true = [0, 1, 2, 0, 1, 2]
    >>> y_pred = [0, 2, 1, 0, 0, 1]
    >>> recall_score(y_true, y_pred, average='macro')
    0.33...
    >>> recall_score(y_true, y_pred, average='micro')
    0.33...
    >>> recall_score(y_true, y_pred, average='weighted')
    0.33...
    >>> recall_score(y_true, y_pred, average=None)
    array([1., 0., 0.])
    >>> y_true = [0, 0, 0, 0, 0, 0]
    >>> recall_score(y_true, y_pred, average=None)
    array([0.5, 0. , 0. ])
    >>> recall_score(y_true, y_pred, average=None, zero_division=1)
    array([0.5, 1. , 1. ])

    Notes
    -----
    When ``true positive + false negative == 0``, recall returns 0 and raises
    ``UndefinedMetricWarning``. This behavior can be modified with
    ``zero_division``.
    """
    _, r, _, _ = precision_recall_fscore_support(
        y_true,
        y_pred,
        labels=labels,
        pos_label=pos_label,
        average=average,
        warn_for=("recall",),
        sample_weight=sample_weight,
        zero_division=zero_division,
    )
    return r


def f1_score(
    y_true,
    y_pred,
    *,
    labels=None,
    pos_label=1,
    average="binary",
    sample_weight=None,
    zero_division="warn"
):
    """Compute the F1 score, also known as balanced F-score or F-measure

    The F1 score can be interpreted as a weighted average of the precision and
    recall, where an F1 score reaches its best value at 1 and worst score at 0.
    The relative contribution of precision and recall to the F1 score is
    equal. The formula for the F1 score is::

        F1 = 2 * (precision * recall) / (precision + recall)

    In the multi-class and multi-label case, this is the average of
    the F1 score of each class with weighting depending on the ``average``
    parameter.

    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, 1 by default
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass or multilabel, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
                       'weighted']
        This parameter is required for multiclass/multilabel targets.
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    zero_division : "warn", 0 or 1, default="warn"
        Sets the value to return when there is a zero division, i.e. when all
        predictions and labels are negative. If set to "warn", this acts as 0,
        but warnings are also raised.

    Returns
    -------
    f1_score : float or array of float, shape = [n_unique_labels]
        F1 score of the positive class in binary classification or weighted
        average of the F1 scores of each class for the multiclass task.

    See also
    --------
    fbeta_score, precision_recall_fscore_support, jaccard_score,
    multilabel_confusion_matrix

    References
    ----------
    .. [1] `Wikipedia entry for the F1-score
           <https://en.wikipedia.org/wiki/F1_score>`_

    Examples
    --------
    >>> from mars.learn.metrics import f1_score
    >>> y_true = [0, 1, 2, 0, 1, 2]
    >>> y_pred = [0, 2, 1, 0, 0, 1]
    >>> f1_score(y_true, y_pred, average='macro')
    0.26...
    >>> f1_score(y_true, y_pred, average='micro')
    0.33...
    >>> f1_score(y_true, y_pred, average='weighted')
    0.26...
    >>> f1_score(y_true, y_pred, average=None)
    array([0.8, 0. , 0. ])
    >>> y_true = [0, 0, 0, 0, 0, 0]
    >>> y_pred = [0, 0, 0, 0, 0, 0]
    >>> f1_score(y_true, y_pred, zero_division=1)
    1.0...

    Notes
    -----
    When ``true positive + false positive == 0``, precision is undefined;
    When ``true positive + false negative == 0``, recall is undefined.
    In such cases, by default the metric will be set to 0, as will f-score,
    and ``UndefinedMetricWarning`` will be raised. This behavior can be
    modified with ``zero_division``.
    """
    return fbeta_score(
        y_true,
        y_pred,
        beta=1,
        labels=labels,
        pos_label=pos_label,
        average=average,
        sample_weight=sample_weight,
        zero_division=zero_division,
    )


def fbeta_score(
    y_true,
    y_pred,
    *,
    beta,
    labels=None,
    pos_label=1,
    average="binary",
    sample_weight=None,
    zero_division="warn"
):
    """Compute the F-beta score

    The F-beta score is the weighted harmonic mean of precision and recall,
    reaching its optimal value at 1 and its worst value at 0.

    The `beta` parameter determines the weight of recall in the combined
    score. ``beta < 1`` lends more weight to precision, while ``beta > 1``
    favors recall (``beta -> 0`` considers only precision, ``beta -> +inf``
    only recall).

    Read more in the :ref:`User Guide <precision_recall_f_measure_metrics>`.

    Parameters
    ----------
    y_true : 1d array-like, or label indicator array / sparse matrix
        Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
        Estimated targets as returned by a classifier.

    beta : float
        Determines the weight of recall in the combined score.

    labels : list, optional
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, 1 by default
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass or multilabel, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : string, [None, 'binary' (default), 'micro', 'macro', 'samples', \
                       'weighted']
        This parameter is required for multiclass/multilabel targets.
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    zero_division : "warn", 0 or 1, default="warn"
        Sets the value to return when there is a zero division, i.e. when all
        predictions and labels are negative. If set to "warn", this acts as 0,
        but warnings are also raised.

    Returns
    -------
    fbeta_score : float (if average is not None) or array of float, shape =\
        [n_unique_labels]
        F-beta score of the positive class in binary classification or
        weighted average of the F-beta score of each class for the multiclass
        task.

    See also
    --------
    precision_recall_fscore_support, multilabel_confusion_matrix

    References
    ----------
    .. [1] R. Baeza-Yates and B. Ribeiro-Neto (2011).
           Modern Information Retrieval. Addison Wesley, pp. 327-328.

    .. [2] `Wikipedia entry for the F1-score
           <https://en.wikipedia.org/wiki/F1_score>`_

    Examples
    --------
    >>> from mars.learn.metrics import fbeta_score
    >>> y_true = [0, 1, 2, 0, 1, 2]
    >>> y_pred = [0, 2, 1, 0, 0, 1]
    >>> fbeta_score(y_true, y_pred, average='macro', beta=0.5)
    0.23...
    >>> fbeta_score(y_true, y_pred, average='micro', beta=0.5)
    0.33...
    >>> fbeta_score(y_true, y_pred, average='weighted', beta=0.5)
    0.23...
    >>> fbeta_score(y_true, y_pred, average=None, beta=0.5)
    array([0.71..., 0.        , 0.        ])

    Notes
    -----
    When ``true positive + false positive == 0`` or
    ``true positive + false negative == 0``, f-score returns 0 and raises
    ``UndefinedMetricWarning``. This behavior can be modified with
    ``zero_division``.
    """
    _, _, f, _ = precision_recall_fscore_support(
        y_true,
        y_pred,
        beta=beta,
        labels=labels,
        pos_label=pos_label,
        average=average,
        warn_for=("f-score",),
        sample_weight=sample_weight,
        zero_division=zero_division,
    )
    return f