# Source code for mars.dataframe.base.melt

# Copyright 1999-2020 Alibaba Group Holding Ltd.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np
import pandas as pd

from ... import opcodes
from ...serialize.core import AnyField, StringField
from ..operands import DataFrameOperand, DataFrameOperandMixin, OutputType
from ..utils import build_empty_df, parse_index, standardize_range_index


class DataFrameMelt(DataFrameOperand, DataFrameOperandMixin):
    """Operand that unpivots a DataFrame by delegating to ``DataFrame.melt``.

    The five stored arguments (``id_vars``, ``value_vars``, ``var_name``,
    ``value_name``, ``col_level``) are passed through unchanged to the
    ``melt`` call, both when inferring output metadata in ``__call__`` and
    when computing each chunk in ``execute``.
    """

    _op_type_ = opcodes.MELT

    # Serializable operand arguments; each one maps to the ``melt`` keyword
    # of the same name (without the leading underscore).
    _id_vars = AnyField('id_vars')
    _value_vars = AnyField('value_vars')
    _var_name = StringField('var_name')
    _value_name = StringField('value_name')
    _col_level = AnyField('col_level')

    def __init__(self, id_vars=None, value_vars=None, var_name=None, value_name=None,
                 col_level=None, **kw):
        """Store the melt arguments on the operand; extra ``kw`` go to the base class."""
        melt_fields = dict(_id_vars=id_vars, _value_vars=value_vars, _var_name=var_name,
                           _value_name=value_name, _col_level=col_level)
        super().__init__(**melt_fields, **kw)

    @property
    def id_vars(self):
        """Column(s) kept as identifier variables."""
        return self._id_vars

    @property
    def value_vars(self):
        """Column(s) to unpivot."""
        return self._value_vars

    @property
    def var_name(self):
        """Name given to the 'variable' column."""
        return self._var_name

    @property
    def value_name(self):
        """Name given to the 'value' column."""
        return self._value_name

    @property
    def col_level(self):
        """Column level to melt on when the columns are a MultiIndex."""
        return self._col_level

    def _melt_kwargs(self):
        """Return the stored arguments as keyword arguments for ``DataFrame.melt``."""
        return dict(id_vars=self.id_vars, value_vars=self.value_vars,
                    var_name=self.var_name, value_name=self.value_name,
                    col_level=self.col_level)

    def __call__(self, df):
        """Build the output tileable describing the melted form of ``df``.

        An empty frame built from ``df.dtypes`` is melted to obtain the
        resulting columns and dtypes. The row count is recorded as NaN
        because it is not known at this point.
        """
        mock_input = build_empty_df(df.dtypes)
        mock_output = mock_input.melt(**self._melt_kwargs())
        self._output_types = [OutputType.dataframe]

        out_columns = mock_output.columns
        return self.new_tileable(
            [df],
            shape=(np.nan, len(out_columns)),
            dtypes=mock_output.dtypes,
            # RangeIndex(-1) stands in for an index whose length is unknown here.
            index_value=parse_index(pd.RangeIndex(-1), df.key, df.index_value.key),
            columns_value=parse_index(out_columns, store_data=True))

    @classmethod
    def tile(cls, op: 'DataFrameMelt'):
        """Split the operand into one melt chunk per row-chunk of the input."""
        source = op.inputs[0]
        result = op.outputs[0]

        # Merge the column axis into a single chunk so that every chunk
        # holds all columns of its rows.
        source = source.rechunk({1: (source.shape[1],)})._inplace_tile()

        out_chunks = [
            op.copy().reset_key().new_chunk(
                [in_chunk], index=in_chunk.index, shape=(np.nan, result.shape[1]),
                dtypes=result.dtypes,
                index_value=parse_index(pd.RangeIndex(-1), in_chunk.key,
                                        in_chunk.index_value.key),
                columns_value=result.columns_value)
            for in_chunk in source.chunks
        ]

        # NOTE(review): helper is defined elsewhere; presumably it turns the
        # per-chunk placeholder indexes into one continuous range - confirm.
        out_chunks = standardize_range_index(out_chunks)

        tiled_op = op.copy().reset_key()
        # Row splits are unknown (NaN) for every row-chunk; there is a single
        # column split covering all output columns.
        row_splits = (np.nan,) * source.chunk_shape[0]
        col_splits = (result.shape[1],)
        return tiled_op.new_tileables(
            [source], chunks=out_chunks, nsplits=(row_splits, col_splits), **result.params)

    @classmethod
    def execute(cls, ctx, op: 'DataFrameMelt'):
        """Melt the input chunk's data and store it under the output chunk key."""
        chunk_data = ctx[op.inputs[0].key]
        melted = chunk_data.melt(**op._melt_kwargs())
        ctx[op.outputs[0].key] = melted


def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None):
    """
    Unpivot a DataFrame from wide to long format, optionally leaving identifiers set.

    This function is useful to massage a DataFrame into a format where one
    or more columns are identifier variables (`id_vars`), while all other
    columns, considered measured variables (`value_vars`), are "unpivoted" to
    the row axis, leaving just two non-identifier columns, 'variable' and
    'value'.

    .. versionadded:: 0.20.0

    Parameters
    ----------
    frame : DataFrame
        The DataFrame to unpivot.
    id_vars : tuple, list, or ndarray, optional
        Column(s) to use as identifier variables.
    value_vars : tuple, list, or ndarray, optional
        Column(s) to unpivot. If not specified, uses all columns that
        are not set as `id_vars`.
    var_name : scalar
        Name to use for the 'variable' column. If None it uses
        ``frame.columns.name`` or 'variable'.
    value_name : scalar, default 'value'
        Name to use for the 'value' column.
    col_level : int or str, optional
        If columns are a MultiIndex then use this level to melt.

    Returns
    -------
    DataFrame
        Unpivoted DataFrame.

    See Also
    --------
    melt
    pivot_table
    DataFrame.pivot
    Series.explode

    Examples
    --------
    >>> import mars.dataframe as md
    >>> df = md.DataFrame({'A': {0: 'a', 1: 'b', 2: 'c'},
    ...                    'B': {0: 1, 1: 3, 2: 5},
    ...                    'C': {0: 2, 1: 4, 2: 6}})
    >>> df.execute()
       A  B  C
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> df.melt(id_vars=['A'], value_vars=['B']).execute()
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> df.melt(id_vars=['A'], value_vars=['B', 'C']).execute()
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5
    3  a        C      2
    4  b        C      4
    5  c        C      6

    The names of 'variable' and 'value' columns can be customized:

    >>> df.melt(id_vars=['A'], value_vars=['B'],
    ...         var_name='myVarname', value_name='myValname').execute()
       A myVarname  myValname
    0  a         B          1
    1  b         B          3
    2  c         B          5

    If you have multi-index columns:

    >>> df = md.DataFrame({('A', 'D'): {0: 'a', 1: 'b', 2: 'c'},
    ...                    ('B', 'E'): {0: 1, 1: 3, 2: 5},
    ...                    ('C', 'F'): {0: 2, 1: 4, 2: 6}})
    >>> df.execute()
       A  B  C
       D  E  F
    0  a  1  2
    1  b  3  4
    2  c  5  6

    >>> df.melt(col_level=0, id_vars=['A'], value_vars=['B']).execute()
       A variable  value
    0  a        B      1
    1  b        B      3
    2  c        B      5

    >>> df.melt(id_vars=[('A', 'D')], value_vars=[('B', 'E')]).execute()
      (A, D) variable_0 variable_1  value
    0      a          B          E      1
    1      b          B          E      3
    2      c          B          E      5
    """
    # Wrap the arguments in a melt operand and apply it to ``frame``; the
    # operand's ``__call__`` produces the lazily-evaluated result.
    op = DataFrameMelt(id_vars=id_vars, value_vars=value_vars, var_name=var_name,
                       value_name=value_name, col_level=col_level)
    return op(frame)