# Source code for mars.tensor.core

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Copyright 1999-2021 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from collections.abc import Iterable
from enum import Enum
from operator import attrgetter
from typing import Any, Dict

import numpy as np

from ..core import (
    HasShapeTileable,
    ChunkData,
    Chunk,
    HasShapeTileableData,
    OutputType,
    register_output_types,
    _ExecuteAndFetchMixin,
    is_build_mode,
)
from ..core.entity.utils import refresh_tileable_shape
from ..serialization.serializables import (
    Serializable,
    FieldTypes,
    DataTypeField,
    ListField,
    TupleField,
    StringField,
    AnyField,
    ReferenceField,
)
from ..utils import on_serialize_shape, on_deserialize_shape
from .utils import get_chunk_slices, fetch_corner_data

logger = logging.getLogger(__name__)


class TensorOrder(Enum):
    """Order of a tensor; each member's value is its one-letter code."""

    C_ORDER = "C"  # C order
    F_ORDER = "F"  # Fortran order


class TensorChunkData(ChunkData):
    """Data entity of one tensor chunk, carrying its shape, dtype and order."""

    __slots__ = ()
    _no_copy_attrs_ = ChunkData._no_copy_attrs_ | {"dtype"}
    type_name = "Tensor"

    # required fields
    _shape = TupleField(
        "shape",
        FieldTypes.int64,
        on_serialize=on_serialize_shape,
        on_deserialize=on_deserialize_shape,
    )
    _order = ReferenceField("order", TensorOrder)
    # optional fields
    _dtype = DataTypeField("dtype")

    def __init__(self, op=None, index=None, shape=None, dtype=None, order=None, **kw):
        """
        Build the chunk data.

        ``order`` may be a ``TensorOrder`` member or the *name* of one
        (e.g. ``"C_ORDER"``).  When no order is given but an op is, the
        order is derived from the inputs: Fortran order only if there is at
        least one input and every input is Fortran-ordered, C order otherwise.
        """
        if isinstance(order, str):
            # strings are looked up as member names, not member values
            order = getattr(TensorOrder, order)
        super().__init__(
            _op=op, _index=index, _shape=shape, _dtype=dtype, _order=order, **kw
        )
        if self.order is None and self.op is not None:
            all_fortran = len(self.inputs) != 0 and all(
                hasattr(inp, "order") and inp.order == TensorOrder.F_ORDER
                for inp in self.inputs
            )
            self._order = TensorOrder.F_ORDER if all_fortran else TensorOrder.C_ORDER

    @property
    def params(self) -> Dict[str, Any]:
        """Properties that are enough to rebuild an equivalent chunk."""
        return dict(
            shape=self.shape,
            dtype=self.dtype,
            order=self.order,
            index=self.index,
        )

    @params.setter
    def params(self, new_params: Dict[str, Any]):
        """
        Update shape, dtype and order from ``new_params``.

        ``None`` values are skipped, ``"index"`` is accepted but ignored, and
        any other key raises ``TypeError``.
        """
        params = new_params.copy()
        params.pop("index", None)  # index not needed to update
        for name in ("shape", "dtype", "order"):
            value = params.pop(name, None)
            if value is not None:
                setattr(self, f"_{name}", value)
        if params:  # pragma: no cover
            raise TypeError(f"Unknown params: {list(params)}")

    @classmethod
    def get_params_from_data(cls, data: np.ndarray) -> Dict[str, Any]:
        """Derive the shape, dtype and order params from concrete array data."""
        from .array_utils import is_cupy

        if not is_cupy(data):
            data = np.asarray(data)
        if data.flags["C_CONTIGUOUS"]:
            layout = TensorOrder.C_ORDER
        else:
            # anything that is not C-contiguous is reported as Fortran order
            layout = TensorOrder.F_ORDER
        return {"shape": data.shape, "dtype": data.dtype, "order": layout}

    def __len__(self):
        """Return the first dimension; an empty shape has no length."""
        try:
            first_dim = self.shape[0]
        except IndexError:
            if not is_build_mode():
                raise TypeError("len() of unsized object")
            # build mode tolerates empty shapes and reports a length of 0
            return 0
        return first_dim

    @property
    def shape(self):
        """Shape of the chunk, or ``None`` when it has not been set."""
        return getattr(self, "_shape", None)

    @property
    def ndim(self):
        """Number of dimensions, i.e. the length of the shape."""
        return len(self.shape)

    @property
    def size(self):
        """Number of elements as a plain Python scalar."""
        element_count = np.prod(self.shape)
        return element_count.item()

    @property
    def dtype(self):
        """The chunk's own dtype, falling back to the dtype of its op."""
        own_dtype = getattr(self, "_dtype", None)
        return own_dtype or self.op.dtype

    @property
    def order(self):
        """Order of the chunk, or ``None`` when it has not been set."""
        return getattr(self, "_order", None)

    @property
    def nbytes(self):
        """Element count multiplied by the dtype's item size."""
        element_count = np.prod(self.shape)
        return element_count * self.dtype.itemsize


class TensorChunk(Chunk):
    """Chunk wrapper whose underlying data must be a ``TensorChunkData``."""

    __slots__ = ()
    _allow_data_type_ = (TensorChunkData,)
    type_name = "Tensor"

    def __len__(self):
        """Delegate ``len()`` to the wrapped chunk data."""
        chunk_data = self._data
        return len(chunk_data)


class TensorData(HasShapeTileableData, _ExecuteAndFetchMixin):
    """Data entity behind a ``Tensor``: shape, dtype, order and chunks."""

    __slots__ = ()
    type_name = "Tensor"

    # required fields
    _order = StringField(
        "order", on_serialize=attrgetter("value"), on_deserialize=TensorOrder
    )
    # optional fields
    _dtype = DataTypeField("dtype")
    _chunks = ListField(
        "chunks",
        FieldTypes.reference(TensorChunkData),
        # chunks are stored as their data and re-wrapped into TensorChunk on load
        on_serialize=lambda x: x if x is None else [it.data for it in x],
        on_deserialize=lambda x: x if x is None else [TensorChunk(it) for it in x],
    )

    def __init__(
        self,
        op=None,
        shape=None,
        dtype=None,
        order=None,
        nsplits=None,
        chunks=None,
        **kw,
    ):
        """
        Build the tensor data.

        ``order`` may be a ``TensorOrder`` member or the *name* of one
        (e.g. ``"C_ORDER"``).  When no order is given but an op is, the
        order is derived from the inputs: Fortran order only if there is at
        least one input and every input is Fortran-ordered, C order otherwise.
        """
        if isinstance(order, str):
            # strings are looked up as member names, not member values
            order = getattr(TensorOrder, order)
        super().__init__(
            _op=op,
            _shape=shape,
            _dtype=dtype,
            _order=order,
            _nsplits=nsplits,
            _chunks=chunks,
            **kw,
        )
        if self.order is None and self.op is not None:
            all_fortran = len(self.inputs) != 0 and all(
                hasattr(inp, "order") and inp.order == TensorOrder.F_ORDER
                for inp in self.inputs
            )
            self._order = TensorOrder.F_ORDER if all_fortran else TensorOrder.C_ORDER

    def _to_str(self, representation=False):
        """
        Render the tensor as text.

        In build mode, or before any execution, a short description of the op
        and shape is returned.  Otherwise the corner data is fetched from the
        most recent executed session and rendered with ``repr`` or ``str``.
        """
        if is_build_mode() or len(self._executed_sessions) == 0:
            # in build mode, or not executed, just return representation
            if representation:
                return f"Tensor <op={type(self._op).__name__}, shape={self._shape}, key={self._key}>"
            return f"Tensor(op={type(self._op).__name__}, shape={self._shape})"

        limit = np.get_printoptions()["threshold"]
        corner = fetch_corner_data(self, session=self._executed_sessions[-1])
        # keep the configured threshold for small tensors; otherwise use
        # corner.size - 1 so that "..." is guaranteed to show up in the output
        if not self.size <= limit:
            limit = corner.size - 1
        with np.printoptions(threshold=limit):
            text = repr(corner) if representation else str(corner)
        return text

    def __str__(self):
        return self._to_str(representation=False)

    def __repr__(self):
        return self._to_str(representation=True)

    @property
    def params(self):
        """Properties that are enough to rebuild an equivalent tileable."""
        return dict(shape=self.shape, dtype=self.dtype, order=self.order)

    @params.setter
    def params(self, new_params: Dict[str, Any]):
        """
        Update shape, dtype and order from ``new_params``.

        ``None`` values are skipped; any other key raises ``TypeError``.
        """
        params = new_params.copy()
        for name in ("shape", "dtype", "order"):
            value = params.pop(name, None)
            if value is not None:
                setattr(self, f"_{name}", value)
        if params:  # pragma: no cover
            raise TypeError(f"Unknown params: {list(params)}")

    def refresh_params(self):
        """Refresh the shape and, if still unset, take the dtype of the first chunk."""
        refresh_tileable_shape(self)
        if self._dtype is not None:
            return
        self._dtype = self.chunks[0].dtype

    @property
    def flags(self):
        """Contiguity flags; tensors with at most one dimension report both."""
        if self.ndim <= 1:
            return {"C_CONTIGUOUS": True, "F_CONTIGUOUS": True}
        return {
            "C_CONTIGUOUS": self.order == TensorOrder.C_ORDER,
            "F_CONTIGUOUS": self.order == TensorOrder.F_ORDER,
        }

    @property
    def real(self):
        """Result of applying ``arithmetic.real`` to this tensor."""
        from .arithmetic import real as real_of

        return real_of(self)

    @property
    def imag(self):
        """Result of applying ``arithmetic.imag`` to this tensor."""
        from .arithmetic import imag as imag_of

        return imag_of(self)

    @property
    def dtype(self):
        """The tensor's own dtype, falling back to the dtype of its op."""
        own_dtype = getattr(self, "_dtype", None)
        return own_dtype or self.op.dtype

    @property
    def order(self):
        """Order of the tensor, or ``None`` when it has not been set."""
        return getattr(self, "_order", None)

    @property
    def nbytes(self):
        """Element count multiplied by the dtype's item size."""
        element_count = np.prod(self.shape)
        return element_count * self.dtype.itemsize

    def get_chunk_slices(self, idx):
        """Return the slices of the chunk at ``idx`` according to ``nsplits``."""
        return get_chunk_slices(self.nsplits, idx)

    def is_scalar(self):
        """Tell whether the tensor has zero dimensions."""
        return self.ndim == 0

    isscalar = is_scalar

    def tosparse(self, missing=None):
        """Return ``self`` if already sparse, else convert via ``fromdense``."""
        if self.issparse():
            return self

        from .datasource import fromdense

        return fromdense(self, missing=missing)

    def todense(self, fill_value=None):
        """Return ``self`` if already dense, else convert via ``fromsparse``."""
        if not self.issparse():
            return self

        from .datasource import fromsparse

        return fromsparse(self, fill_value=fill_value)

    def transpose(self, *axes):
        """Transpose with axes given either as one iterable or as separate ints."""
        from .base import transpose as transpose_tensor

        single_iterable = len(axes) == 1 and isinstance(axes[0], Iterable)
        if single_iterable:
            axes = axes[0]

        return transpose_tensor(self, axes)

    @property
    def T(self):
        """Shortcut for ``self.transpose()``."""
        return self.transpose()

    def reshape(self, shape, *shapes, **kw):
        """
        Reshape the tensor.

        The new shape may be passed as one iterable or as separate ints.
        ``order`` (default ``"C"``) is the only keyword accepted; any other
        keyword raises ``TypeError``.
        """
        from .reshape import reshape as reshape_tensor

        order = kw.pop("order", "C")
        if kw:
            raise TypeError(
                f"'{next(iter(kw))}' is an invalid keyword argument for this function"
            )

        leading = tuple(shape) if isinstance(shape, Iterable) else (shape,)
        return reshape_tensor(self, leading + shapes, order=order)

    def totiledb(self, uri, ctx=None, key=None, timestamp=None):
        """Delegate to ``datastore.totiledb`` with this tensor as the source."""
        from .datastore import totiledb as store_to_tiledb

        return store_to_tiledb(uri, self, ctx=ctx, key=key, timestamp=timestamp)

    @staticmethod
    def from_dataframe(in_df):
        """Delegate to ``datasource.from_dataframe``."""
        from .datasource import from_dataframe as tensor_from_dataframe

        return tensor_from_dataframe(in_df)

    def to_dataframe(self, *args, **kwargs):
        """Delegate to ``dataframe_from_tensor``, forwarding all arguments."""
        from ..dataframe.datasource.from_tensor import dataframe_from_tensor

        return dataframe_from_tensor(self, *args, **kwargs)

    @property
    def flat(self):
        """A ``flatiter`` over this tensor."""
        return flatiter(self)

    def to_numpy(self, session=None, **kw):
        """Execute the tensor and fetch the result."""
        return self._execute_and_fetch(session=session, **kw)


class Tensor(HasShapeTileable):
    """User-facing tensor that wraps a ``TensorData`` and delegates to it."""

    __slots__ = ()
    _allow_data_type_ = (TensorData,)
    type_name = "Tensor"

    def __len__(self):
        return len(self._data)

    @property
    def shape(self):
        """Shape of the underlying tensor data."""
        return self._data.shape

    @shape.setter
    def shape(self, new_shape):
        # assigning a shape swaps the data for the reshaped tensor's data
        self._data = self._data.reshape(new_shape).data

    def _update_shape(self, new_shape):
        self._data._update_shape(new_shape)

    @property
    def real(self):
        return self.data.real

    @real.setter
    def real(self, new_real):
        from .arithmetic.setreal import set_real

        self._data = set_real(self._data, new_real).data

    @property
    def imag(self):
        return self.data.imag

    @imag.setter
    def imag(self, new_imag):
        from .arithmetic.setimag import set_imag

        self._data = set_imag(self._data, new_imag).data

    def __array__(self, dtype=None):
        # converting to a NumPy array goes through to_numpy()
        return np.asarray(self.to_numpy(), dtype=dtype)

    def __array_function__(self, func, types, args, kwargs):
        """
        Dispatch a NumPy function to the same-named function under ``mars.tensor``.

        Returns ``NotImplemented`` when the matching submodule or function
        does not exist, or when the lookup resolves to ``func`` itself.
        """
        from .. import tensor as module

        # walk the submodules of func's module path, skipping the top package
        for submodule in func.__module__.split(".")[1:]:
            try:
                module = getattr(module, submodule)
            except AttributeError:
                return NotImplemented
        if not hasattr(module, func.__name__):
            return NotImplemented
        mars_func = getattr(module, func.__name__)
        if mars_func is func:
            # avoid Numpy func
            return NotImplemented
        return mars_func(*args, **kwargs)

    def view(self):
        return self._view()

    @property
    def ndim(self):
        """
        Number of array dimensions.

        Examples
        --------
        >>> import mars.tensor as mt
        >>> x = mt.array([1, 2, 3])
        >>> x.ndim
        1
        >>> y = mt.zeros((2, 3, 4))
        >>> y.ndim
        3
        """
        return super().ndim

    def transpose(self, *axes):
        """
        Returns a view of the tensor with axes transposed.

        For a 1-D tensor, this has no effect. (To change between column and
        row vectors, first cast the 1-D tensor into a matrix object.)
        For a 2-D tensor, this is the usual matrix transpose.
        For an n-D tensor, if axes are given, their order indicates how the
        axes are permuted (see Examples). If axes are not provided and
        ``a.shape = (i[0], i[1], ... i[n-2], i[n-1])``, then
        ``a.transpose().shape = (i[n-1], i[n-2], ... i[1], i[0])``.

        Parameters
        ----------
        axes : None, tuple of ints, or `n` ints

         * None or no argument: reverses the order of the axes.

         * tuple of ints: `i` in the `j`-th place in the tuple means `a`'s
           `i`-th axis becomes `a.transpose()`'s `j`-th axis.

         * `n` ints: same as an n-tuple of the same ints (this form is
           intended simply as a "convenience" alternative to the tuple form)

        Returns
        -------
        out : Tensor
            View of `a`, with axes suitably permuted.

        See Also
        --------
        Tensor.T : Tensor property returning the tensor transposed.

        Examples
        --------
        >>> import mars.tensor as mt

        >>> a = mt.array([[1, 2], [3, 4]])
        >>> a.execute()
        array([[1, 2],
               [3, 4]])
        >>> a.transpose().execute()
        array([[1, 3],
               [2, 4]])
        >>> a.transpose((1, 0)).execute()
        array([[1, 3],
               [2, 4]])
        >>> a.transpose(1, 0).execute()
        array([[1, 3],
               [2, 4]])
        """
        return self._data.transpose(*axes)

    @property
    def T(self):
        """
        Same as self.transpose(); for a tensor with self.ndim < 2 the
        values are unchanged (see Examples).

        Examples
        --------
        >>> import mars.tensor as mt

        >>> x = mt.array([[1.,2.],[3.,4.]])
        >>> x.execute()
        array([[ 1.,  2.],
               [ 3.,  4.]])
        >>> x.T.execute()
        array([[ 1.,  3.],
               [ 2.,  4.]])
        >>> x = mt.array([1.,2.,3.,4.])
        >>> x.execute()
        array([ 1.,  2.,  3.,  4.])
        >>> x.T.execute()
        array([ 1.,  2.,  3.,  4.])
        """
        return self._data.T

    def totiledb(self, uri, ctx=None, key=None, timestamp=None):
        return self._data.totiledb(uri, ctx=ctx, key=key, timestamp=timestamp)

    def copy(self, order="C"):
        return super().copy().astype(self.dtype, order=order, copy=False)

    def sort(
        self,
        axis=-1,
        kind=None,
        parallel_kind=None,
        psrs_kinds=None,
        order=None,
    ):
        """
        Sort a tensor, in-place.

        Parameters
        ----------
        axis : int, optional
            Axis along which to sort. Default is -1, which means sort along the
            last axis.
        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
            Sorting algorithm. Default is 'quicksort'.
        parallel_kind: {'PSRS'}, optional
            Parallel sorting algorithm, for the details, refer to:
            http://csweb.cs.wfu.edu/bigiron/LittleFE-PSRS/build/html/PSRSalgorithm.html
        psrs_kinds: list with 3 elements, optional
            Sorting algorithms during PSRS algorithm.
        order : str or list of str, optional
            When `a` is a tensor with fields defined, this argument specifies
            which fields to compare first, second, etc.  A single field can
            be specified as a string, and not all fields need be specified,
            but unspecified fields will still be used, in the order in which
            they come up in the dtype, to break ties.

        See Also
        --------
        numpy.sort : Return a sorted copy of a tensor.
        argsort : Indirect sort.
        lexsort : Indirect stable sort on multiple keys.
        searchsorted : Find elements in sorted tensor.
        partition: Partial sort.

        Notes
        -----
        See ``sort`` for notes on the different sorting algorithms.

        Examples
        --------
        >>> import mars.tensor as mt
        >>> a = mt.array([[1,4], [3,1]])
        >>> a.sort(axis=1)
        >>> a.execute()
        array([[1, 4],
               [1, 3]])
        >>> a.sort(axis=0)
        >>> a.execute()
        array([[1, 3],
               [1, 4]])

        Use the `order` keyword to specify a field to use when sorting a
        structured tensor:

        >>> a = mt.array([('a', 2), ('c', 1)], dtype=[('x', 'S1'), ('y', int)])
        >>> a.sort(order='y')
        >>> a.execute()
        array([('c', 1), ('a', 2)],
              dtype=[('x', '|S1'), ('y', '<i4')])
        """
        from .base import sort

        # "in-place" means this wrapper is re-pointed at the sorted data
        self._data = sort(
            self,
            axis=axis,
            kind=kind,
            parallel_kind=parallel_kind,
            psrs_kinds=psrs_kinds,
            order=order,
        ).data

    def partition(self, kth, axis=-1, kind="introselect", order=None, **kw):
        """
        Rearranges the elements in the tensor in such a way that the value of
        the element in kth position is in the position it would be in a sorted
        tensor. All elements smaller than the kth element are moved before this
        element and all equal or greater are moved behind it. The ordering of
        the elements in the two partitions is undefined.

        Parameters
        ----------
        kth : int or sequence of ints
            Element index to partition by. The kth element value will be in its
            final sorted position and all smaller elements will be moved before
            it and all equal or greater elements behind it.
            The order of all elements in the partitions is undefined.
            If provided with a sequence of kth it will partition all elements
            indexed by kth of them into their sorted position at once.
        axis : int, optional
            Axis along which to sort. Default is -1, which means sort along the
            last axis.
        kind : {'introselect'}, optional
            Selection algorithm. Default is 'introselect'.
        order : str or list of str, optional
            When `a` is a tensor with fields defined, this argument specifies
            which fields to compare first, second, etc. A single field can
            be specified as a string, and not all fields need to be specified,
            but unspecified fields will still be used, in the order in which
            they come up in the dtype, to break ties.

        See Also
        --------
        mt.partition : Return a partitioned copy of a tensor.
        argpartition : Indirect partition.
        sort : Full sort.

        Notes
        -----
        See ``mt.partition`` for notes on the different algorithms.

        Examples
        --------
        >>> import mars.tensor as mt
        >>> a = mt.array([3, 4, 2, 1])
        >>> a.partition(3)
        >>> a.execute()
        array([2, 1, 3, 4])

        >>> a.partition((1, 3))
        >>> a.execute()
        array([1, 2, 3, 4])
        """
        from .base import partition

        # "in-place" means this wrapper is re-pointed at the partitioned data
        self._data = partition(self, kth, axis=axis, kind=kind, order=order, **kw).data

    @property
    def flat(self):
        """
        Flat iterator object to iterate over arrays.

        A `flatiter` iterator is returned by ``x.flat`` for any tensor `x`.
        It allows iterating over the tensor as if it were a 1-D array,
        either in a for-loop or by calling its `next` method.

        Iteration is done in row-major, C-style order (the last
        index varying the fastest). The iterator can also be indexed using
        basic slicing or advanced indexing.

        See Also
        --------
        Tensor.flat : Return a flat iterator over a tensor.
        Tensor.flatten : Returns a flattened copy of a tensor.

        Examples
        --------
        >>> import mars.tensor as mt

        >>> x = mt.arange(6).reshape(2, 3)
        >>> fl = x.flat

        >>> fl[2:4].execute()
        array([2, 3])
        """
        return self._data.flat

    def from_dataframe(self, in_df):
        return self._data.from_dataframe(in_df)

    def to_dataframe(self, *args, **kwargs):
        return self._data.to_dataframe(*args, **kwargs)

    def to_numpy(self, session=None, **kw):
        return self._data.to_numpy(session, **kw)


SparseTensor = Tensor


class flatiter(object):
    """Indexable helper behind ``Tensor.flat``; reads copy, writes go through."""

    def __init__(self, tensor):
        # flatten creates a copy
        self._flatten_tensor = tensor.flatten()
        # ravel creates a view
        self._ravel_tensor = tensor.ravel()

    def __getitem__(self, item):
        # a.flat[item] create a copy
        return self._flatten_tensor[item]

    def __setitem__(self, key, value):
        # a.flat[item] = value will apply changes to original tensor
        self._ravel_tensor[key] = value


class Indexes(Serializable):
    """Serializable holder with a single ``indexes`` field of any type."""

    indexes = AnyField("indexes")


TENSOR_TYPE = (Tensor, TensorData)
TENSOR_CHUNK_TYPE = (TensorChunk, TensorChunkData)

# tensor and scalar outputs are both represented by the tensor types
register_output_types(OutputType.tensor, TENSOR_TYPE, TENSOR_CHUNK_TYPE)
register_output_types(OutputType.scalar, TENSOR_TYPE, TENSOR_CHUNK_TYPE)