Source code for mars.learn.preprocessing._data

# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
from sklearn.base import TransformerMixin
from sklearn.utils.validation import check_is_fitted, FLOAT_DTYPES

from ... import tensor as mt
from ...tensor.core import TENSOR_TYPE
from ..base import BaseEstimator
from ..utils.validation import check_array

def _handle_zeros_in_scale(scale, copy=True):
"""Makes sure that whenever scale is zero, we handle it correctly.

This happens in most scalers when we have constant features.
"""

# if we are fitting on 1D arrays, scale might be a scalar
if np.isscalar(scale):  # pragma: no cover
if scale == 0.0:
scale = 1.0
return scale
elif hasattr(scale, "ndim") and scale.ndim == 0:  # pragma: no cover
# scalar that is tensor
return mt.where(scale == 0.0, 1.0, scale)
elif isinstance(scale, (np.ndarray, TENSOR_TYPE)):
if copy:
# New array to avoid side-effects
scale = scale.copy()
scale[scale == 0.0] = 1.0
return scale

[docs]class MinMaxScaler(TransformerMixin, BaseEstimator):
"""Transform features by scaling each feature to a given range.

This estimator scales and translates each feature individually such
that it is in the given range on the training set, e.g. between
zero and one.

The transformation is given by::

X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_scaled = X_std * (max - min) + min

where min, max = feature_range.

This transformation is often used as an alternative to zero mean,
unit variance scaling.

Read more in the :ref:`User Guide <preprocessing_scaler>`.

Parameters
----------
feature_range : tuple (min, max), default=(0, 1)
Desired range of transformed data.

copy : bool, default=True
Set to False to perform inplace row normalization and avoid a
copy (if the input is already a numpy array).

clip: bool, default=False
Set to True to clip transformed values of held-out data to
provided `feature range`.

Attributes
----------
min_ : Tensor of shape (n_features,)
Per feature adjustment for minimum. Equivalent to
``min - X.min(axis=0) * self.scale_``

scale_ : Tensor of shape (n_features,)
Per feature relative scaling of the data. Equivalent to
``(max - min) / (X.max(axis=0) - X.min(axis=0))``

data_min_ : ndarray of shape (n_features,)
Per feature minimum seen in the data

data_max_ : ndarray of shape (n_features,)
Per feature maximum seen in the data

data_range_ : ndarray of shape (n_features,)
Per feature range ``(data_max_ - data_min_)`` seen in the data

n_samples_seen_ : int
The number of samples processed by the estimator.
It will be reset on new calls to fit, but increments across
``partial_fit`` calls.

Examples
--------
>>> from mars.learn.preprocessing import MinMaxScaler
>>> data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
>>> scaler = MinMaxScaler()
>>> print(scaler.fit(data))
MinMaxScaler()
>>> print(scaler.data_max_)
[ 1. 18.]
>>> print(scaler.transform(data))
[[0.   0.  ]
[0.25 0.25]
[0.5  0.5 ]
[1.   1.  ]]
>>> print(scaler.transform([[2, 2]]))
[[1.5 0. ]]

--------
minmax_scale : Equivalent function without the estimator API.

Notes
-----
NaNs are treated as missing values: disregarded in fit, and maintained in
transform.

For a comparison of the different scalers, transformers, and normalizers,
see :ref:`examples/preprocessing/plot_all_scaling.py
<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
"""

[docs]    def __init__(self, feature_range=(0, 1), copy=True, clip=False):
self.feature_range = feature_range
self.copy = copy
self.clip = clip

def _reset(self):  # pragma: no cover
"""Reset internal data-dependent state of the scaler, if necessary.

__init__ parameters are not touched.
"""

# Checking one attribute is enough, because they are all set together
# in partial_fit
if hasattr(self, "scale_"):
del self.scale_
del self.min_
del self.n_samples_seen_
del self.data_min_
del self.data_max_
del self.data_range_

def fit(self, X, y=None, session=None, run_kwargs=None):
"""Compute the minimum and maximum to be used for later scaling.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The data used to compute the per-feature minimum and maximum
used for later scaling along the features axis.

y : None
Ignored.

Returns
-------
self : object
Fitted scaler.
"""

# Reset internal state before fitting
self._reset()
return self.partial_fit(X, y, session=session, run_kwargs=run_kwargs)

def partial_fit(self, X, y=None, session=None, run_kwargs=None):
"""Online computation of min and max on X for later scaling.

All of X is processed as a single batch. This is intended for cases
when :meth:`fit` is not feasible due to very large number of
`n_samples` or because X is read from a continuous stream.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The data used to compute the mean and standard deviation
used for later scaling along the features axis.

y : None
Ignored.

Returns
-------
self : object
Fitted scaler.
"""
feature_range = self.feature_range
if feature_range[0] >= feature_range[1]:
raise ValueError(
"Minimum of desired feature range must be smaller"
" than maximum. Got %s." % str(feature_range)
)

if mt.tensor(X).issparse():  # pragma: no cover
raise TypeError(
"MinMaxScaler does not support sparse input. "
)

first_pass = not hasattr(self, "n_samples_seen_")
X = self._validate_data(
X,
reset=first_pass,
estimator=self,
dtype=FLOAT_DTYPES,
force_all_finite="allow-nan",
)

if np.isnan(X.shape[0]):  # pragma: no cover
X.execute(session=session, **(run_kwargs or dict()))

data_min = mt.nanmin(X, axis=0)
data_max = mt.nanmax(X, axis=0)

if first_pass:
self.n_samples_seen_ = X.shape[0]
else:
data_min = mt.minimum(
self.data_min_, data_min
)  # pylint: disable=access-member-before-definition
data_max = mt.maximum(
self.data_max_, data_max
)  # pylint: disable=access-member-before-definition
self.n_samples_seen_ += X.shape[0]

data_range = data_max - data_min
self.scale_ = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale(
data_range
)
self.min_ = feature_range[0] - data_min * self.scale_
self.data_min_ = data_min
self.data_max_ = data_max
self.data_range_ = data_range
mt.ExecutableTuple(
[self.scale_, self.min_, self.data_min_, self.data_max_, self.data_range_]
).execute(session=session, **(run_kwargs or dict()))
return self

def transform(self, X, session=None, run_kwargs=None):
"""Scale features of X according to feature_range.

Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data that will be transformed.

Returns
-------
Xt : ndarray of shape (n_samples, n_features)
Transformed data.
"""
check_is_fitted(self)

X = self._validate_data(
X,
copy=self.copy,
dtype=FLOAT_DTYPES,
force_all_finite="allow-nan",
reset=False,
)

X *= self.scale_
X += self.min_
if self.clip:
X = mt.clip(X, self.feature_range[0], self.feature_range[1])
return X.execute(session=session, **(run_kwargs or dict()))

def inverse_transform(self, X, session=None, run_kwargs=None):
"""Undo the scaling of X according to feature_range.

Parameters
----------
X : array-like of shape (n_samples, n_features)
Input data that will be transformed. It cannot be sparse.

Returns
-------
Xt : ndarray of shape (n_samples, n_features)
Transformed data.
"""
check_is_fitted(self)

X = check_array(
X, copy=self.copy, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
)

X -= self.min_
X /= self.scale_
return X.execute(session=session, **(run_kwargs or dict()))

def _more_tags(self):  # pylint: disable=no-self-use
return {"allow_nan": True}

[docs]def minmax_scale(
X, feature_range=(0, 1), *, axis=0, copy=True, session=None, run_kwargs=None
):
"""Transform features by scaling each feature to a given range.

This estimator scales and translates each feature individually such
that it is in the given range on the training set, i.e. between
zero and one.

The transformation is given by (when ``axis=0``)::

X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_scaled = X_std * (max - min) + min

where min, max = feature_range.

The transformation is calculated as (when ``axis=0``)::

X_scaled = scale * X + min - X.min(axis=0) * scale
where scale = (max - min) / (X.max(axis=0) - X.min(axis=0))

This transformation is often used as an alternative to zero mean,
unit variance scaling.

Read more in the :ref:`User Guide <preprocessing_scaler>`.

*minmax_scale* function interface
to :class:`~sklearn.preprocessing.MinMaxScaler`.

Parameters
----------
X : array-like of shape (n_samples, n_features)
The data.

feature_range : tuple (min, max), default=(0, 1)
Desired range of transformed data.

axis : int, default=0
Axis used to scale along. If 0, independently scale each feature,
otherwise (if 1) scale each sample.

copy : bool, default=True
Set to False to perform inplace scaling and avoid a copy (if the input

Returns
-------
X_tr : ndarray of shape (n_samples, n_features)
The transformed data.

.. warning:: Risk of data leak

Do not use :func:`~sklearn.preprocessing.minmax_scale` unless you know
what you are doing. A common mistake is to apply it to the entire data
*before* splitting into training and test sets. This will bias the
model evaluation because information would have leaked from the test
set to the training set.
In general, we recommend using
:class:`~sklearn.preprocessing.MinMaxScaler` within a
:ref:`Pipeline <pipeline>` in order to prevent most risks of data
leaking: `pipe = make_pipeline(MinMaxScaler(), LogisticRegression())`.

--------
MinMaxScaler : Performs scaling to a given range using the Transformer
API (e.g. as part of a preprocessing
:class:`~sklearn.pipeline.Pipeline`).

Notes
-----
For a comparison of the different scalers, transformers, and normalizers,
see :ref:`examples/preprocessing/plot_all_scaling.py
<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
"""  # noqa
# Unlike the scaler object, this function allows 1d input.
# If copy is required, it will be done inside the scaler object.
X = check_array(
X, copy=False, ensure_2d=False, dtype=FLOAT_DTYPES, force_all_finite="allow-nan"
)
original_ndim = X.ndim

if original_ndim == 1:
X = X.reshape(X.shape[0], 1)

s = MinMaxScaler(feature_range=feature_range, copy=copy)
if axis == 0:
X = s.fit_transform(X)
else:
X = s.fit_transform(X.T).T

if original_ndim == 1:
X = X.ravel()

return X.execute(session=session, **(run_kwargs or dict()))
```