Source code for mars.learn.metrics._ranking

# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import warnings
from functools import partial

import numpy as np

from ... import execute as _execute, fetch as _fetch
from ... import tensor as mt
from ...utils import cache_tileables
from ..preprocessing import label_binarize
from ..utils._encode import _encode, _unique
from ..utils.checks import assert_all_finite
from ..utils.core import sort_by
from ..utils.multiclass import type_of_target
from ..utils.validation import check_array, check_consistent_length, column_or_1d
from ._base import _average_binary_score, _average_multiclass_ovo_score


def auc(x, y, session=None, run_kwargs=None):
    """Compute Area Under the Curve (AUC) using the trapezoidal rule

    This is a general function, given points on a curve.  For computing the
    area under the ROC-curve, see :func:`roc_auc_score`.  For an alternative
    way to summarize a precision-recall curve, see
    :func:`average_precision_score`.

    Parameters
    ----------
    x : tensor, shape = [n]
        x coordinates. These must be either monotonic increasing or monotonic
        decreasing.
    y : tensor, shape = [n]
        y coordinates.

    Returns
    -------
    auc : tensor, with float value

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn import metrics
    >>> y = mt.array([1, 1, 2, 2])
    >>> pred = mt.array([0.1, 0.4, 0.35, 0.8])
    >>> fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
    >>> metrics.auc(fpr, tpr)
    0.75

    See also
    --------
    roc_auc_score : Compute the area under the ROC curve
    average_precision_score : Compute average precision from prediction scores
    precision_recall_curve : Compute precision-recall pairs for different
        probability thresholds
    """
    check_consistent_length(x, y)
    x = column_or_1d(x)
    y = column_or_1d(y)

    if x.shape[0] < 2:
        raise ValueError(
            "At least 2 points are needed to compute"
            f" area under curve, but x.shape = {x.shape}"
        )

    direction = 1
    dx = mt.diff(x)
    any_dx_lt_0 = mt.any(dx < 0)
    all_dx_le_0 = mt.all(dx <= 0)
    mt.ExecutableTuple([x, any_dx_lt_0, all_dx_le_0]).execute(
        session=session, **(run_kwargs or dict())
    )
    if any_dx_lt_0.fetch(session=session):
        if all_dx_le_0.fetch(session=session):
            direction = -1
        else:
            x_data = x.fetch(session=session)
            raise ValueError(f"x is neither increasing nor decreasing : {x_data}.")

    area = direction * mt.trapz(y, x)
    return area.execute(session=session, **(run_kwargs or dict()))
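
# Illustrative sketch: a minimal NumPy-only analogue of the `auc` logic above,
# showing how the direction flip handles monotonically decreasing x before
# applying the trapezoidal rule. `_np_auc` is a hypothetical helper for
# exposition only, not part of the mars API.
def _np_auc(x, y):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    dx = np.diff(x)
    direction = 1
    if np.any(dx < 0):
        if np.all(dx <= 0):
            direction = -1  # x is decreasing: flip the sign of the area
        else:
            raise ValueError("x is neither increasing nor decreasing")
    return direction * np.trapz(y, x)


# e.g. _np_auc([0.0, 0.5, 1.0], [0.0, 1.0, 1.0]) == 0.75
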
def _binary_clf_curve(
    y_true, y_score, pos_label=None, sample_weight=None, session=None, run_kwargs=None
):
    """Calculate true and false positives per binary classification threshold.

    Parameters
    ----------
    y_true : tensor, shape = [n_samples]
        True targets of binary classification

    y_score : tensor, shape = [n_samples]
        Estimated probabilities or decision function

    pos_label : int or str, default=None
        The label of the positive class

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    fps : tensor, shape = [n_thresholds]
        A count of false positives, at index i being the number of negative
        samples assigned a score >= thresholds[i]. The total number of
        negative samples is equal to fps[-1] (thus true negatives are given by
        fps[-1] - fps).

    tps : tensor, shape = [n_thresholds <= len(mt.unique(y_score))]
        An increasing count of true positives, at index i being the number
        of positive samples assigned a score >= thresholds[i]. The total
        number of positive samples is equal to tps[-1] (thus false negatives
        are given by tps[-1] - tps).

    thresholds : tensor, shape = [n_thresholds]
        Decreasing score values.
    """
    y_type = type_of_target(y_true).to_numpy(session=session, **(run_kwargs or dict()))
    if not (y_type == "binary" or (y_type == "multiclass" and pos_label is not None)):
        raise ValueError(f"{y_type} format is not supported")

    check_consistent_length(
        y_true, y_score, sample_weight, session=session, **(run_kwargs or dict())
    )
    y_true = column_or_1d(y_true)
    y_score = column_or_1d(y_score)
    y_true = assert_all_finite(y_true, check_only=False)
    y_score = assert_all_finite(y_score, check_only=False)

    cache_tileables(y_true, y_score)

    if sample_weight is not None:
        sample_weight = column_or_1d(sample_weight)

    # ensure binary classification if pos_label is not specified
    # classes.dtype.kind in ('O', 'U', 'S') is required to avoid
    # triggering a FutureWarning by calling np.array_equal(a, b)
    # when elements in the two arrays are not comparable.
    classes = mt.unique(y_true, aggregate_size=1).to_numpy(
        session=session, **(run_kwargs or dict())
    )
    if pos_label is None and (
        classes.dtype.kind in ("O", "U", "S")
        or not (
            np.array_equal(classes, [0, 1])
            or np.array_equal(classes, [-1, 1])
            or np.array_equal(classes, [0])
            or np.array_equal(classes, [-1])
            or np.array_equal(classes, [1])
        )
    ):
        classes_repr = ", ".join(repr(c) for c in classes)
        raise ValueError(
            f"y_true takes value in {{{classes_repr}}} and "
            "pos_label is not specified: either make y_true "
            "take value in {0, 1} or {-1, 1} or "
            "pass pos_label explicitly."
        )
    elif pos_label is None:
        pos_label = 1.0

    # make y_true a boolean vector
    y_true = y_true == pos_label

    # sort scores and corresponding truth values
    # original implementation adopted from sklearn:
    # """
    # desc_score_indices = mt.argsort(y_score, kind="mergesort")[::-1]
    # y_score = y_score[desc_score_indices]
    # y_true = y_true[desc_score_indices]
    # if sample_weight is not None:
    #     weight = sample_weight[desc_score_indices]
    # else:
    #     weight = 1.0
    # """
    # since fancy indexing is a heavy operation, we try to use DataFrame to sort
    to_sort = [y_score, y_true]
    if sample_weight is not None:
        to_sort.append(sample_weight)
    to_sort = sort_by(to_sort, y_score, ascending=False)
    y_score, y_true = to_sort[:2]
    if sample_weight is not None:
        weight = to_sort[-1]
    else:
        weight = 1.0

    # y_score typically has many tied values. Here we extract
    # the indices associated with the distinct values. We also
    # concatenate a value for the end of the curve.
    distinct_value_indices = mt.where(mt.diff(y_score))[0]
    threshold_idxs = mt.r_[distinct_value_indices, y_true.size - 1]

    # accumulate the true positives with decreasing threshold
    # raw tps from sklearn implementation
    # we try to perform only one fancy index
    # tps = (y_true * weight).cumsum()[threshold_idxs]
    temp_tps = (y_true * weight).cumsum()
    if sample_weight is not None:
        # express fps as a cumsum to ensure fps is increasing even in
        # the presence of floating point errors
        # fps = ((1 - y_true) * weight).cumsum()[threshold_idxs]
        temp_fps = ((1 - y_true) * weight).cumsum()
        tps, fps, thresholds = mt.stack([temp_tps, temp_fps, y_score])[
            :, threshold_idxs
        ]
    else:
        tps, thresholds = mt.stack([temp_tps, y_score])[:, threshold_idxs]
        fps = 1 + threshold_idxs - tps
    return _execute([fps, tps, thresholds], session=session, **(run_kwargs or dict()))


def _binary_roc_auc_score(
    y_true, y_score, sample_weight=None, max_fpr=None, session=None, run_kwargs=None
):
    """Binary roc auc score."""
    from numpy import interp

    if len(mt.unique(y_true).execute()) != 2:
        raise ValueError(
            "Only one class present in y_true. ROC AUC score "
            "is not defined in that case."
        )

    fpr, tpr, _ = roc_curve(
        y_true,
        y_score,
        sample_weight=sample_weight,
        session=session,
        run_kwargs=run_kwargs,
    )
    fpr, tpr = mt.ExecutableTuple([fpr, tpr]).fetch(session=session)
    if max_fpr is None or max_fpr == 1:
        return auc(fpr, tpr, session=session, run_kwargs=run_kwargs).fetch(
            session=session
        )
    if max_fpr <= 0 or max_fpr > 1:
        raise ValueError(f"Expected max_fpr in range (0, 1], got: {max_fpr}")

    # Add a single point at max_fpr by linear interpolation
    stop = (
        mt.searchsorted(fpr, max_fpr, "right")
        .execute(session=session, **(run_kwargs or dict()))
        .fetch(session=session)
    )
    x_interp = [fpr[stop - 1], fpr[stop]]
    y_interp = [tpr[stop - 1], tpr[stop]]
    tpr = list(tpr[:stop])
    tpr.append(interp(max_fpr, x_interp, y_interp))
    fpr = list(fpr[:stop])
    fpr.append(max_fpr)
    partial_auc = auc(fpr, tpr, session=session, run_kwargs=run_kwargs)

    # McClish correction: standardize result to be 0.5 if non-discriminant
    # and 1 if maximal
    min_area = 0.5 * max_fpr**2
    max_area = max_fpr
    return 0.5 * (
        1 + (partial_auc.fetch(session=session) - min_area) / (max_area - min_area)
    )
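
# Illustrative sketch: a plain-NumPy analogue of the core idea in
# `_binary_clf_curve` for the unweighted case — sort scores descending,
# cumulatively count positives, and keep only indices where the score changes.
# `_np_clf_curve` is a hypothetical helper for exposition, not part of mars.
def _np_clf_curve(y_true, y_score):
    order = np.argsort(y_score, kind="mergesort")[::-1]
    y_true = np.asarray(y_true)[order]
    y_score = np.asarray(y_score)[order]
    # indices where the (descending) score value changes, plus the last index
    distinct = np.where(np.diff(y_score))[0]
    threshold_idxs = np.r_[distinct, y_true.size - 1]
    tps = np.cumsum(y_true)[threshold_idxs]
    # everything at or above the threshold that is not a true positive
    fps = 1 + threshold_idxs - tps
    return fps, tps, y_score[threshold_idxs]


# e.g. with y_true=[0, 0, 1, 1] and y_score=[0.1, 0.4, 0.35, 0.8], thresholds
# 0.8, 0.4, 0.35, 0.1 give tps=[1, 1, 2, 2] and fps=[0, 1, 1, 2].
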
def roc_auc_score(
    y_true,
    y_score,
    *,
    average="macro",
    sample_weight=None,
    max_fpr=None,
    multi_class="raise",
    labels=None,
    session=None,
    run_kwargs=None,
):
    """
    Compute Area Under the Receiver Operating Characteristic Curve (ROC AUC)
    from prediction scores.

    Note: this implementation can be used with binary, multiclass and
    multilabel classification, but some restrictions apply (see Parameters).

    Read more in the :ref:`User Guide <roc_metrics>`.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,) or (n_samples, n_classes)
        True labels or binary label indicators. The binary and multiclass cases
        expect labels with shape (n_samples,) while the multilabel case expects
        binary label indicators with shape (n_samples, n_classes).

    y_score : array-like of shape (n_samples,) or (n_samples, n_classes)
        Target scores.

        * In the binary case, it corresponds to an array of shape
          `(n_samples,)`. Both probability estimates and non-thresholded
          decision values can be provided. The probability estimates correspond
          to the **probability of the class with the greater label**,
          i.e. `estimator.classes_[1]` and thus
          `estimator.predict_proba(X, y)[:, 1]`. The decision values
          correspond to the output of `estimator.decision_function(X, y)`.
          See more information in the :ref:`User guide <roc_auc_binary>`;
        * In the multiclass case, it corresponds to an array of shape
          `(n_samples, n_classes)` of probability estimates provided by the
          `predict_proba` method. The probability estimates **must**
          sum to 1 across the possible classes. In addition, the order of the
          class scores must correspond to the order of ``labels``,
          if provided, or else to the numerical or lexicographical order of
          the labels in ``y_true``. See more information in the
          :ref:`User guide <roc_auc_multiclass>`;
        * In the multilabel case, it corresponds to an array of shape
          `(n_samples, n_classes)`. Probability estimates are provided by the
          `predict_proba` method and the non-thresholded decision values by
          the `decision_function` method. The probability estimates correspond
          to the **probability of the class with the greater label for each
          output** of the classifier. See more information in the
          :ref:`User guide <roc_auc_multilabel>`.

    average : {'micro', 'macro', 'samples', 'weighted'} or None, \
            default='macro'
        If ``None``, the scores for each class are returned. Otherwise,
        this determines the type of averaging performed on the data.
        Note: multiclass ROC AUC currently only handles the 'macro' and
        'weighted' averages.

        ``'micro'``:
            Calculate metrics globally by considering each element of the
            label indicator matrix as a label.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label).
        ``'samples'``:
            Calculate metrics for each instance, and find their average.

        Will be ignored when ``y_true`` is binary.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    max_fpr : float > 0 and <= 1, default=None
        If not ``None``, the standardized partial AUC [2]_ over the range
        [0, max_fpr] is returned. For the multiclass case, ``max_fpr``
        should be either equal to ``None`` or ``1.0`` as AUC ROC partial
        computation currently is not supported for multiclass.

    multi_class : {'raise', 'ovr', 'ovo'}, default='raise'
        Only used for multiclass targets. Determines the type of configuration
        to use. The default value raises an error, so either
        ``'ovr'`` or ``'ovo'`` must be passed explicitly.

        ``'ovr'``:
            Stands for One-vs-rest. Computes the AUC of each class
            against the rest [3]_ [4]_. This
            treats the multiclass case in the same way as the multilabel case.
            Sensitive to class imbalance even when ``average == 'macro'``,
            because class imbalance affects the composition of each of the
            'rest' groupings.
        ``'ovo'``:
            Stands for One-vs-one. Computes the average AUC of all
            possible pairwise combinations of classes [5]_.
            Insensitive to class imbalance when
            ``average == 'macro'``.

    labels : array-like of shape (n_classes,), default=None
        Only used for multiclass targets. List of labels that index the
        classes in ``y_score``. If ``None``, the numerical or lexicographical
        order of the labels in ``y_true`` is used.

    Returns
    -------
    auc : float

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] `Analyzing a portion of the ROC curve. McClish, 1989
            <https://www.ncbi.nlm.nih.gov/pubmed/2668680>`_

    .. [3] Provost, F., Domingos, P. (2000). Well-trained PETs: Improving
           probability estimation trees (Section 6.2), CeDER Working Paper
           #IS-00-04, Stern School of Business, New York University.

    .. [4] `Fawcett, T. (2006). An introduction to ROC analysis. Pattern
            Recognition Letters, 27(8), 861-874.
            <https://www.sciencedirect.com/science/article/pii/S016786550500303X>`_

    .. [5] `Hand, D.J., Till, R.J. (2001). A Simple Generalisation of the Area
            Under the ROC Curve for Multiple Class Classification Problems.
            Machine Learning, 45(2), 171-186.
            <http://link.springer.com/article/10.1023/A:1010920819831>`_

    See Also
    --------
    average_precision_score : Area under the precision-recall curve.
    roc_curve : Compute Receiver operating characteristic (ROC) curve.
    RocCurveDisplay.from_estimator : Plot Receiver Operating Characteristic
        (ROC) curve given an estimator and some data.
    RocCurveDisplay.from_predictions : Plot Receiver Operating Characteristic
        (ROC) curve given the true and predicted values.

    Examples
    --------
    Binary case:

    >>> from sklearn.datasets import load_breast_cancer
    >>> from sklearn.linear_model import LogisticRegression
    >>> from mars.learn.metrics import roc_auc_score
    >>> X, y = load_breast_cancer(return_X_y=True)
    >>> clf = LogisticRegression(solver="liblinear", random_state=0).fit(X, y)
    >>> roc_auc_score(y, clf.predict_proba(X)[:, 1])
    0.99...
    >>> roc_auc_score(y, clf.decision_function(X))
    0.99...

    Multiclass case:

    >>> from sklearn.datasets import load_iris
    >>> X, y = load_iris(return_X_y=True)
    >>> clf = LogisticRegression(solver="liblinear").fit(X, y)
    >>> roc_auc_score(y, clf.predict_proba(X), multi_class='ovr')
    0.99...

    Multilabel case:

    >>> import numpy as np
    >>> from sklearn.datasets import make_multilabel_classification
    >>> from sklearn.multioutput import MultiOutputClassifier
    >>> X, y = make_multilabel_classification(random_state=0)
    >>> clf = MultiOutputClassifier(clf).fit(X, y)
    >>> # get a list of n_output containing probability arrays of shape
    >>> # (n_samples, n_classes)
    >>> y_pred = clf.predict_proba(X)
    >>> # extract the positive columns for each output
    >>> y_pred = np.transpose([pred[:, 1] for pred in y_pred])
    >>> roc_auc_score(y, y_pred, average=None)
    array([0.82..., 0.86..., 0.94..., 0.85..., 0.94...])
    >>> from sklearn.linear_model import RidgeClassifierCV
    >>> clf = RidgeClassifierCV().fit(X, y)
    >>> roc_auc_score(y, clf.decision_function(X), average=None)
    array([0.81..., 0.84..., 0.93..., 0.87..., 0.94...])
    """
    cache_tileables(y_true, y_score)

    y_type = type_of_target(y_true)
    y_true = check_array(y_true, ensure_2d=False, dtype=None)
    y_score = check_array(y_score, ensure_2d=False)
    _execute([y_type, y_true, y_score], session=session, **(run_kwargs or dict()))
    y_type = y_type.fetch(session=session)

    def execute(*args):
        result = [None] * len(args)
        to_execute = dict()
        for i, arg in enumerate(args):
            if hasattr(arg, "op"):
                to_execute[i] = arg
            else:
                result[i] = arg
        if to_execute:
            _execute(*to_execute.values(), session=session, **(run_kwargs or dict()))
            for i, e in to_execute.items():
                if e.isscalar():
                    e = e.fetch(session=session)
                result[i] = e
        return result[0] if len(result) == 1 else result

    if y_type == "multiclass" or (
        y_type == "binary" and y_score.ndim == 2 and y_score.shape[1] > 2
    ):
        # do not support partial ROC computation for multiclass
        if max_fpr is not None and max_fpr != 1.0:
            raise ValueError(
                "Partial AUC computation not available in "
                "multiclass setting, 'max_fpr' must be"
                " set to `None`, received `max_fpr={0}` "
                "instead".format(max_fpr)
            )
        if multi_class == "raise":
            raise ValueError("multi_class must be in ('ovo', 'ovr')")
        return execute(
            _multiclass_roc_auc_score(
                y_true, y_score, labels, multi_class, average, sample_weight
            )
        )
    elif y_type == "binary":
        labels = mt.unique(y_true).execute(session=session, **(run_kwargs or dict()))
        y_true = label_binarize(y_true, classes=labels, execute=False)[:, 0]
        cache_tileables(y_true)
        return execute(
            _average_binary_score(
                partial(_binary_roc_auc_score, max_fpr=max_fpr),
                y_true,
                y_score,
                average,
                sample_weight=sample_weight,
            )
        )
    else:  # multilabel-indicator
        return execute(
            _average_binary_score(
                partial(_binary_roc_auc_score, max_fpr=max_fpr),
                y_true,
                y_score,
                average,
                sample_weight=sample_weight,
            )
        )
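
# Illustrative usage sketch with mars tensors, assuming a default local
# session; in the binary case `roc_auc_score` dispatches to
# `_average_binary_score` with `_binary_roc_auc_score` as shown above.
# >>> import mars.tensor as mt
# >>> from mars.learn.metrics import roc_auc_score
# >>> y = mt.array([0, 0, 1, 1])
# >>> scores = mt.array([0.1, 0.4, 0.35, 0.8])
# >>> roc_auc_score(y, scores)  # expected 0.75 for this toy example
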
def _multiclass_roc_auc_score(
    y_true,
    y_score,
    labels,
    multi_class,
    average,
    sample_weight,
    session=None,
    run_kwargs=None,
):
    # validation of the input y_score
    if not mt.allclose(1, y_score.sum(axis=1)).to_numpy(
        session=session, **(run_kwargs or dict())
    ):  # pragma: no cover
        raise ValueError(
            "Target scores need to be probabilities for multiclass "
            "roc_auc, i.e. they should sum up to 1.0 over classes"
        )

    # validation for multiclass parameter specifications
    average_options = ("macro", "weighted")
    if average not in average_options:
        raise ValueError(
            "average must be one of {0} for multiclass problems".format(
                average_options
            )
        )

    multiclass_options = ("ovo", "ovr")
    if multi_class not in multiclass_options:
        raise ValueError(
            "multi_class='{0}' is not supported "
            "for multiclass ROC AUC, multi_class must be "
            "in {1}".format(multi_class, multiclass_options)
        )

    if labels is not None:
        labels = column_or_1d(labels).to_numpy(
            session=session, **(run_kwargs or dict())
        )
        classes = _unique(labels).to_numpy(session=session, **(run_kwargs or dict()))
        if len(classes) != len(labels):
            raise ValueError("Parameter 'labels' must be unique")
        if not np.array_equal(classes, labels):
            raise ValueError("Parameter 'labels' must be ordered")
        if len(classes) != y_score.shape[1]:
            raise ValueError(
                "Number of given labels, {0}, not equal to the number "
                "of columns in 'y_score', {1}".format(len(classes), y_score.shape[1])
            )
        if len(
            mt.setdiff1d(y_true, classes).execute(
                session=session, **(run_kwargs or dict())
            )
        ):
            raise ValueError("'y_true' contains labels not in parameter 'labels'")
    else:
        classes = _unique(y_true).execute(session=session, **(run_kwargs or dict()))
        if len(classes) != y_score.shape[1]:
            raise ValueError(
                "Number of classes in y_true not equal to the number of "
                "columns in 'y_score'"
            )

    if multi_class == "ovo":
        if sample_weight is not None:
            raise ValueError(
                "sample_weight is not supported "
                "for multiclass one-vs-one ROC AUC, "
                "'sample_weight' must be None in this case."
            )
        y_true_encoded = _encode(y_true, uniques=classes)
        # Hand & Till (2001) implementation (ovo)
        return _average_multiclass_ovo_score(
            _binary_roc_auc_score,
            y_true_encoded,
            y_score,
            average=average,
            session=session,
            run_kwargs=run_kwargs,
        )
    else:
        # ovr is same as multi-label
        y_true_multilabel = label_binarize(y_true, classes=classes, execute=False)
        return _average_binary_score(
            _binary_roc_auc_score,
            y_true_multilabel,
            y_score,
            average,
            sample_weight=sample_weight,
            session=session,
            run_kwargs=run_kwargs,
        )
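
# Illustrative sketch: the 'ovr' branch above reduces multiclass AUC to the
# multilabel case — binarize y_true into one column per class and average the
# per-class binary AUCs. A plain scikit-learn/NumPy analogue of the 'macro'
# average, using a hypothetical helper name for exposition:
def _np_ovr_macro_auc(y_true, y_score, classes):
    from sklearn.metrics import roc_auc_score as sk_roc_auc_score

    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    # one column of scores per class; unweighted mean of per-class binary AUCs
    return np.mean(
        [sk_roc_auc_score(y_true == c, y_score[:, i]) for i, c in enumerate(classes)]
    )
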
def roc_curve(
    y_true,
    y_score,
    pos_label=None,
    sample_weight=None,
    drop_intermediate=True,
    session=None,
    run_kwargs=None,
):
    """Compute Receiver operating characteristic (ROC)

    Note: this implementation is restricted to the binary classification task.

    Read more in the :ref:`User Guide <roc_metrics>`.

    Parameters
    ----------
    y_true : tensor, shape = [n_samples]
        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
        pos_label should be explicitly given.

    y_score : tensor, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    pos_label : int or str, default=None
        The label of the positive class.
        When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
        ``pos_label`` is set to 1, otherwise an error will be raised.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    drop_intermediate : boolean, optional (default=True)
        Whether to drop some suboptimal thresholds which would not appear
        on a plotted ROC curve. This is useful in order to create lighter
        ROC curves.

        .. versionadded:: 0.17
           parameter *drop_intermediate*.

    Returns
    -------
    fpr : tensor, shape = [>2]
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= thresholds[i].

    tpr : tensor, shape = [>2]
        Increasing true positive rates such that element i is the true
        positive rate of predictions with score >= thresholds[i].

    thresholds : tensor, shape = [n_thresholds]
        Decreasing thresholds on the decision function used to compute
        fpr and tpr. `thresholds[0]` represents no instances being predicted
        and is arbitrarily set to `max(y_score) + 1`.

    See also
    --------
    roc_auc_score : Compute the area under the ROC curve

    Notes
    -----
    Since the thresholds are sorted from low to high values, they
    are reversed upon returning them to ensure they correspond to both ``fpr``
    and ``tpr``, which are sorted in reversed order during their calculation.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.

    Examples
    --------
    >>> import mars.tensor as mt
    >>> from mars.learn import metrics
    >>> y = mt.array([1, 1, 2, 2])
    >>> scores = mt.array([0.1, 0.4, 0.35, 0.8])
    >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
    >>> fpr
    array([0. , 0. , 0.5, 0.5, 1. ])
    >>> tpr
    array([0. , 0.5, 0.5, 1. , 1. ])
    >>> thresholds
    array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ])
    """
    from sklearn.exceptions import UndefinedMetricWarning

    cache_tileables(y_true, y_score)

    fps, tps, thresholds = _binary_clf_curve(
        y_true,
        y_score,
        pos_label=pos_label,
        sample_weight=sample_weight,
        session=session,
        run_kwargs=run_kwargs,
    )

    # Attempt to drop thresholds corresponding to points in between and
    # collinear with other points. These are always suboptimal and do not
    # appear on a plotted ROC curve (and thus do not affect the AUC).
    # Here mt.diff(_, 2) is used as a "second derivative" to tell if there
    # is a corner at the point. Both fps and tps must be tested to handle
    # thresholds with multiple data points (which are combined in
    # _binary_clf_curve). This keeps all cases where the point should be kept,
    # but does not drop more complicated cases like fps = [1, 3, 7],
    # tps = [1, 2, 4]; there is no harm in keeping too many thresholds.
    if drop_intermediate and len(fps) > 2:
        optimal_idxs = mt.where(
            mt.r_[True, mt.logical_or(mt.diff(fps, 2), mt.diff(tps, 2)), True]
        )[0]
        # original implementation of sklearn:
        # """
        # fps = fps[optimal_idxs]
        # tps = tps[optimal_idxs]
        # thresholds = thresholds[optimal_idxs]
        # """
        # however, it's really a heavy operation to perform fancy index,
        # thus we put them together
        stacked = mt.stack([fps, tps, thresholds])
        fps, tps, thresholds = stacked[:, optimal_idxs]

    # Add an extra threshold position
    # to make sure that the curve starts at (0, 0)
    tps = mt.r_[0, tps]
    fps = mt.r_[0, fps]
    thresholds = mt.r_[thresholds[0] + 1, thresholds]

    last_fps = fps[-1]
    last_tps = tps[-1]
    _execute(
        [tps, fps, last_fps, last_tps, thresholds],
        session=session,
        **(run_kwargs or dict()),
    )
    last_fps, last_tps = _fetch([last_fps, last_tps], session=session)

    if last_fps <= 0:
        warnings.warn(
            "No negative samples in y_true, "
            "false positive value should be meaningless",
            UndefinedMetricWarning,
        )
        fpr = mt.repeat(mt.nan, fps.shape)
    else:
        fpr = fps / last_fps

    if last_tps <= 0:
        warnings.warn(
            "No positive samples in y_true, "
            "true positive value should be meaningless",
            UndefinedMetricWarning,
        )
        tpr = mt.repeat(mt.nan, tps.shape)
    else:
        tpr = tps / last_tps

    ret = mt.ExecutableTuple([fpr, tpr, thresholds]).execute(
        session=session, **(run_kwargs or dict())
    )
    return ret
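
# Illustrative sketch: the drop_intermediate logic above keeps only the
# "corner" points of the ROC polyline — where the second difference of both
# fps and tps is zero, the point is collinear with its neighbours and can be
# dropped. A NumPy-only analogue with a hypothetical helper name:
def _np_optimal_idxs(fps, tps):
    fps, tps = np.asarray(fps), np.asarray(tps)
    # endpoints are always kept; interior points survive if either second
    # difference is non-zero (i.e. the curve bends there)
    return np.where(
        np.r_[True, np.logical_or(np.diff(fps, 2), np.diff(tps, 2)), True]
    )[0]


# e.g. _np_optimal_idxs([0, 1, 2, 3], [0, 1, 1, 1]) -> array([0, 1, 3]):
# the point at index 2 lies on the straight segment between 1 and 3.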