# Source code for graviti.dataframe.row.series

#!/usr/bin/env python3
#
# Copyright 2022 Graviti. Licensed under MIT License.
#

"""The implementation of the Graviti Series."""

from itertools import chain, zip_longest
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, overload

from graviti.dataframe.row.indexing import RowSeriesILocIndexer, RowSeriesLocIndexer
from graviti.utility import MAX_REPR_ROWS


class Series:
    """One-dimensional array backed by an ordered mapping of index names to values.

    Values that are ``dict`` instances are converted into nested :class:`Series`
    objects, which lets the series be displayed with a multi-level header.

    Arguments:
        data: A mapping from index name to value. ``None`` is treated as an
            empty mapping.
        schema: Data type to force. Accepted for API compatibility but not
            processed yet.
        index: Index of the ``data``. Accepted for API compatibility but not
            processed yet.

    Examples:
        Constructing Series from a dict.

        >>> d = {"filename": "a.jpg", "attributes": {"color": "red", "pose": "frontal"}}
        >>> series = Series(data=d)
        >>> series
        filename           a.jpg
        attributes  color  red
                    pose   frontal

    """

    # Maps each index name to its value (nested dicts are stored as Series).
    _indices_data: Dict[str, Any]
    # Index names in insertion order; used for positional access.
    _indices: List[str]

    def __init__(
        self,
        data: Optional[Dict[str, Any]] = None,
        schema: Any = None,
        index: Optional[List[str]] = None,
    ) -> None:
        if data is None:
            data = {}
        if schema is not None:
            # TODO: missing schema processing
            pass
        if index is not None:
            # TODO: missing index processing
            pass

        self._indices_data, self._indices = {}, []
        for key, value in data.items():
            if isinstance(value, dict):
                value = Series(value)
            self._indices_data[key] = value
            self._indices.append(key)

    def __repr__(self) -> str:
        flatten_header, flatten_data = self._flatten()
        if not flatten_data:
            # Nothing to lay out (no keys, or only empty nested Series); without
            # this guard the width computation below calls max() on an empty row.
            return f"{type(self).__name__}([])"

        header = self._get_repr_header(flatten_header)
        body = [
            item._repr_folding() if hasattr(item, "_repr_folding") else str(item)
            for item in flatten_data
        ]
        # One width per header level plus one for the value column.
        column_widths = [max(map(len, row)) for row in chain(header, [body])]
        # Transpose: every output line holds the header cell of each level
        # followed by the value, each left-aligned and padded by two spaces.
        lines = [
            "".join(f"{item:<{column_widths[index]+2}}" for index, item in enumerate(line))
            for line in zip_longest(*header, body)
        ]
        if len(self) > MAX_REPR_ROWS:
            lines.append(f"...({len(self)})")
        return "\n".join(lines)

    # @overload
    # def __getitem__(self, key: Union[slice, Tuple[str]]) -> "Series":
    #    ...

    @overload
    def __getitem__(self, key: str) -> Any:
        ...

    @overload
    def __getitem__(self, key: Iterable[str]) -> "Series":
        ...

    def __getitem__(self, key: Union[str, Iterable[str]]) -> Any:
        """Get the value of one index name, or a new Series for several names.

        Arguments:
            key: A single index name, or an iterable of index names.

        Returns:
            The stored value when ``key`` is a ``str``, otherwise a new
            :class:`Series` holding the selected names in the given order.

        Raises:
            KeyError: When a requested name is not in the series.

        """
        if isinstance(key, str):
            return self._indices_data[key]

        new_data = {name: self._indices_data[name] for name in key}
        return Series(new_data)

    def __setitem__(self, key: str, value: Any) -> None:
        """Store ``value`` under ``key``, appending ``key`` when it is new.

        Arguments:
            key: The index name to assign.
            value: The value to store. A ``dict`` is converted into a nested
                :class:`Series`, the same way the constructor does.

        """
        if isinstance(value, dict):
            value = Series(value)
        if key not in self._indices_data:
            self._indices.append(key)
        self._indices_data[key] = value

    def __len__(self) -> int:
        return len(self._indices)

    @staticmethod
    def _get_repr_header(flatten_header: List[Tuple[str, ...]]) -> List[List[str]]:
        """Build the display header, one list of cells per nesting level.

        A cell is blanked when it repeats the name to its left *and* the cell
        directly above it is blank too, i.e. both columns share the same parent.

        Arguments:
            flatten_header: The key path of every flattened column.

        Returns:
            One list per nesting level, each with one cell per column.

        """
        lines: List[List[str]] = []
        for names in zip_longest(*flatten_header, fillvalue=""):
            line = []
            pre_name = None
            # Compare against the cell directly above. Lines here carry no
            # leading index cell, so the previous level is used unshifted.
            upper_line = lines[-1] if lines else []
            for name, upper_name in zip_longest(names, upper_line, fillvalue=""):
                line.append("" if name == pre_name and upper_name == "" else name)
                pre_name = name
            lines.append(line)
        return lines

    @classmethod
    def _construct(cls, indices_data: Dict[str, Any]) -> "Series":
        """Create a Series directly from ``indices_data``, bypassing ``__init__``.

        The mapping is stored as-is (no copy, no dict-to-Series conversion).
        """
        obj: Series = object.__new__(cls)
        obj._indices_data = indices_data
        obj._indices = list(indices_data.keys())
        return obj

    def _flatten(self) -> Tuple[List[Tuple[str, ...]], List[Any]]:
        """Flatten nested Series into parallel lists of key paths and leaf values.

        Returns:
            A tuple ``(header, data)`` where ``header[i]`` is the key path of
            the leaf value ``data[i]``.

        """
        header: List[Tuple[str, ...]] = []
        data: List[Any] = []
        for key, value in self._indices_data.items():
            if isinstance(value, Series):
                nested_header, nested_data = value._flatten()  # pylint: disable=protected-access
                header.extend((key, *sub_column) for sub_column in nested_header)
                data.extend(nested_data)
            else:
                data.append(value)
                header.append((key,))
        return header, data

    # @overload
    # def _getitem_by_location(self, key: slice) -> "Series":
    #    ...

    @overload
    def _getitem_by_location(self, key: int) -> Union["Series", Any]:
        ...

    @overload
    def _getitem_by_location(self, key: Iterable[int]) -> "Series":
        ...

    def _getitem_by_location(self, key: Union[int, Iterable[int]]) -> Any:
        """Get values by integer position.

        Arguments:
            key: A single position, or an iterable of positions.

        Returns:
            The stored value when ``key`` is an ``int``, otherwise a new
            :class:`Series` holding the values at the given positions.

        Raises:
            IndexError: When a position is out of range.

        """
        if isinstance(key, int):
            return self._indices_data[self._indices[key]]

        names = [self._indices[index] for index in key]
        return self._construct({name: self._indices_data[name] for name in names})

    @property
    def iloc(self) -> "RowSeriesILocIndexer":
        """Purely integer-location based indexing for selection by position.

        Allowed inputs are:

        - An integer, e.g. ``5``.
        - A list or array of integers, e.g. ``[4, 3, 0]``.
        - A slice object with ints, e.g. ``1:7``.
        - A boolean array of the same length as the axis being sliced.

        NOTE(review): the accepted key types are decided by
        ``RowSeriesILocIndexer``; ``Series._getitem_by_location`` itself only
        handles an integer or an iterable of integers -- confirm slice and
        boolean-array support against the indexer.

        Returns:
            The instance of the ILocIndexer.

        Examples:
            Assuming the indexer delegates to ``_getitem_by_location``:

            >>> series = Series({"filename": "a.jpg", "size": 3})
            >>> series.iloc[0]
            'a.jpg'
            >>> series.iloc[[0]]
            filename  a.jpg

        """
        return RowSeriesILocIndexer(self)

    @property
    def loc(self) -> "RowSeriesLocIndexer":
        """Access a group of rows and columns by indexes or a boolean array.

        Allowed inputs are:

        - A single index, e.g. ``5``.
        - A list or array of indexes, e.g. ``[4, 3, 0]``.
        - A slice object with indexes, e.g. ``1:7``.
        - A boolean array of the same length as the axis being sliced.

        NOTE(review): the accepted key types are decided by
        ``RowSeriesLocIndexer``; ``Series.__getitem__`` itself only handles a
        ``str`` or an iterable of ``str`` -- confirm the other forms against
        the indexer.

        Returns:
            The instance of the LocIndexer.

        Examples:
            Assuming the indexer delegates to ``__getitem__``:

            >>> series = Series({"filename": "a.jpg", "size": 3})
            >>> series.loc["filename"]
            'a.jpg'
            >>> series.loc[["filename"]]
            filename  a.jpg

        """
        return RowSeriesLocIndexer(self)