Source code for ads.feature_engineering.accessor.dataframe_accessor

#!/usr/bin/env python
# -*- coding: utf-8 -*--

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The ADS accessor for the Pandas DataFrame.
The accessor will be initialized with the pandas object the user is interacting with.

Examples
--------
>>> import pandas as pd
>>> from ads.feature_engineering.accessor.dataframe_accessor import ADSDataFrameAccessor
>>> from ads.feature_engineering.feature_type.continuous import Continuous
>>> from ads.feature_engineering.feature_type.creditcard import CreditCard
>>> from ads.feature_engineering.feature_type.string import String
>>> from ads.feature_engineering.feature_type.base import Tag
>>> df = pd.DataFrame({'Name': ['Alex'], 'Credit Card': ["4532640527811543"]})
>>> df.ads.feature_type
{'Name': ['string'], 'Credit Card': ['string']}
>>> df.ads.feature_type_description
          Column   Feature Type                        Description
------------------------------------------------------------------
0           Name         string    Type representing string values.
1    Credit Card         string    Type representing string values.
>>> df.ads.default_type
{'Name': 'string', 'Credit Card': 'string'}
>>> df.ads.feature_type = {'Name':['string', Tag('abc')]}
>>> df.ads.tags
{'Name': ['abc']}
>>> df.ads.feature_type = {'Credit Card':['credit_card']}
>>> df.ads.feature_select(include=['credit_card'])
                    Credit Card
-------------------------------
0	          4532640527811543
"""

from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
from ads.common.utils import DATA_SCHEMA_MAX_COL_NUM
from ads.data_labeling.mixin.data_labeling import DataLabelingAccessMixin
from ads.dbmixin.db_pandas_accessor import DBAccessMixin
from ads.feature_engineering import schema
from ads.feature_engineering.accessor.mixin.eda_mixin import EDAMixin
from ads.feature_engineering.accessor.mixin.feature_types_mixin import (
    ADSFeatureTypesMixin,
)
from ads.feature_engineering.feature_type.base import FeatureType
from pandas.core.dtypes.common import is_list_like


[docs]@pd.api.extensions.register_dataframe_accessor("ads")
class ADSDataFrameAccessor(
    ADSFeatureTypesMixin, EDAMixin, DBAccessMixin, DataLabelingAccessMixin
):
    """ADS accessor for the Pandas DataFrame.

    Attributes
    ----------
    columns: List[str]
        The column labels of the DataFrame.

    tags(self) -> Dict[str, List[str]]
        Gets the dictionary of user defined tags for the dataframe.
    default_type(self) -> Dict[str, str]
        Gets the map of columns and associated default feature type names.
    feature_type(self) -> Dict[str, List[str]]
        Gets the list of registered feature types.
    feature_type_description(self) -> pd.DataFrame
        Gets the list of registered feature types in a DataFrame format.

    Methods
    -------
    sync(self, src: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame
        Syncs feature types of current DataFrame with that from src.
    feature_select(self, include: List[Union[FeatureType, str]] = None, exclude: List[Union[FeatureType, str]] = None) -> pd.DataFrame
        Returns a subset of the DataFrame's columns based on the column feature types.
    help(self, prop: str = None) -> None
        Provides docstring for available methods and properties.

    Examples
    --------
    >>> from ads.feature_engineering.accessor.dataframe_accessor import ADSDataFrameAccessor
    >>> from ads.feature_engineering.feature_type.continuous import Continuous
    >>> from ads.feature_engineering.feature_type.creditcard import CreditCard
    >>> from ads.feature_engineering.feature_type.string import String
    >>> from ads.feature_engineering.feature_type.base import Tag
    >>> df = pd.DataFrame({'Name': ['Alex'], 'Credit Card': ["4532640527811543"]})
    >>> df.ads.feature_type
    {'Name': ['string'], 'Credit Card': ['string']}
    >>> df.ads.feature_type_description
              Column   Feature Type                        Description
    -------------------------------------------------------------------
    0           Name         string    Type representing string values.
    1    Credit Card         string    Type representing string values.
    >>> df.ads.default_type
    {'Name': 'string', 'Credit Card': 'string'}
    >>> df.ads.feature_type = {'Name':['string', Tag('abc')]}
    >>> df.ads.tags
    {'Name': ['abc']}
    >>> df.ads.feature_type = {'Credit Card':['credit_card']}
    >>> df.ads.feature_select(include=['credit_card'])
                       Credit Card
    ------------------------------
    0	          4532640527811543
    """

    def __init__(self, pandas_obj) -> None:
        """Initializes ADS Pandas DataFrame Accessor.

        Parameters
        ----------
        pandas_obj : pandas.DataFrame
            Pandas dataframe

        Raises
        ------
        ValueError
            If provided DataFrame has duplicate columns.
        """
        if len(set(pandas_obj.columns)) != len(pandas_obj.columns):
            raise ValueError(
                "Failed to initialize a DataFrame accessor. " "Duplicate column found."
            )
        self._obj = pandas_obj
        super().__init__()
        self.columns = self._obj.columns
        self._info = None

[docs]    def info(self) -> Any:
        """Gets information about the dataframe.

        Returns
        -------
        Any
            The information about the dataframe.
        """
        return self._info

    @property
    def _feature_type(self) -> Dict[str, List[FeatureType]]:
        """Gets the map of columns and associated feature types.
        Key is column name and value is list of feature types.
        """
        return {
            self._obj[col].name: self._obj[col].ads._feature_type for col in self._obj
        }

    @property
    def _default_type(self) -> Dict[str, FeatureType]:
        """Gets the map of columns and associated default feature types.
        Key is column name and value is a default feature type.
        """
        return {
            self._obj[col].name: self._obj[col].ads._default_type for col in self._obj
        }

    @property
    def tags(self) -> Dict[str, List[str]]:
        """Gets the dictionary of user defined tags for the dataframe. Key is column name
        and value is list of tag names.

        Returns
        -------
        Dict[str, List[str]]
            The map of columns and associated default tags.
        """
        return {self._obj[col].name: self._obj[col].ads.tags for col in self._obj}

    @property
    def default_type(self) -> Dict[str, str]:
        """Gets the map of columns and associated default feature type names.

        Returns
        -------
        Dict[str, str]
            The dictionary where key is column name and value is the name of default feature
            type.
        """
        return {k: v.name for k, v in self._default_type.items()}

    @property
    def feature_type(self) -> Dict[str, List[str]]:
        """Gets the list of registered feature types.

        Returns
        -------
        Dict[str, List[str]]
            The dictionary where key is column name and value is list of associated feature type
            names.
        """
        return {col.name: col.ads.feature_type for _, col in self._obj.items()}

    @property
    def feature_type_description(self) -> pd.DataFrame:
        """Gets the list of registered feature types in a DataFrame format.

        Returns
        -------
        :class:`pandas.DataFrame`

        Examples
        ________
        >>> df.ads.feature_type_description()
                  Column   Feature Type                         Description
        -------------------------------------------------------------------
        0           City         string    Type representing string values.
        1   Phone Number         string    Type representing string values.
        """
        result_df = pd.DataFrame([], columns=["Column", "Feature Type", "Description"])
        for col in self._obj:
            series_feature_type_df = self._obj[col].ads.feature_type_description
            series_feature_type_df.insert(0, "Column", col)
            result_df = result_df.append(series_feature_type_df)
        result_df.reset_index(drop=True, inplace=True)
        return result_df

    @feature_type.setter
    def feature_type(
        self, feature_type_map: Dict[str, List[Union[FeatureType, str]]]
    ) -> None:
        """Sets feature types for the DataFrame.

        Parameters
        ----------
        feature_type_map : Dict[str, List[Union[FeatureType, str]]]
            The map of feature types where key is column name and value is list of feature
            types.

        Returns
        -------
        None
            Nothing.
        """
        for col, feature_types in feature_type_map.items():
            self._obj[col].ads.feature_type = feature_types

[docs]    def sync(self, src: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
        """Syncs feature types of current DataFrame with that from src.

        Syncs feature types of current dataframe with that from src, where src
        can be a dataframe or a series. In either case, only columns with
        matched names are synced.

        Parameters
        ----------
        src: `pd.DataFrame` | `pd.Series`
            The source to sync from.

        Returns
        -------
        :class:`pandas.DataFrame`
            Synced dataframe.
        """
        for _, col in self._obj.items():
            col.ads.sync(src)

    def _extract_columns_of_target_types(
        self, target_types: List[Union[FeatureType, str]]
    ) -> List:
        """Returns all the column names that are of the target types from the
        feature_type dictionary.

        Parameters
        ----------
        target_types: list
            A list of target feature types, can be either feature type names of
            feature type class.

        Returns:
        -------
        List[str]
            The list of columns names.
        """
        columns = []
        target_types = (
            np.unique(
                [self._get_type(feature_type).name for feature_type in target_types]
            )
            if target_types is not None
            else None
        )
        for target_type in target_types:
            for name, feature_types in self.feature_type.items():
                if target_type in feature_types:
                    columns.append(name)
        return columns

[docs]    def feature_select(
        self,
        include: List[Union[FeatureType, str]] = None,
        exclude: List[Union[FeatureType, str]] = None,
    ) -> pd.DataFrame:
        """Returns a subset of the DataFrame’s columns based on the column feature_types.

        Parameters
        ----------
        include: List[Union[FeatureType, str]], optional
            Defaults to None. A list of FeatureType subclass or str to be included.
        exclude: List[Union[FeatureType, str]], optional
            Defaults to None. A list of FeatureType subclass or str to be excluded.

        Raises
        ------
        ValueError
            If both of include and exclude are empty
        ValueError
            If include and exclude are used simultaneously

        Returns
        -------
        :class:`pandas.DataFrame`
            The subset of the frame including the feature types in include and excluding
            the feature types in exclude.
        """
        if not (include or exclude):
            raise ValueError("at least one of include or exclude must be nonempty")

        if not is_list_like(include):
            include = (include,) if include is not None else ()
        if not is_list_like(exclude):
            exclude = (exclude,) if exclude is not None else ()

        # unify the feature types to str representation
        include = (
            np.unique([self._get_type(feature_type).name for feature_type in include])
            if include is not None
            else None
        )
        exclude = (
            np.unique([self._get_type(feature_type).name for feature_type in exclude])
            if exclude is not None
            else None
        )

        # convert the myriad valid dtypes object to a single representation
        include = frozenset(include)
        exclude = frozenset(exclude)

        # can't both include AND exclude!
        if not include.isdisjoint(exclude):
            raise ValueError(f"include and exclude overlap on {(include & exclude)}")

        # We raise when both include and exclude are empty
        # Hence, we can just shrink the columns we want to keep
        keep_these = np.full(self._obj.shape[1], True)

        columns = self._obj.columns

        if include:
            included_columns = self._extract_columns_of_target_types(include)
            keep_these &= columns.isin(included_columns)

        if exclude:
            excluded_columns = self._extract_columns_of_target_types(exclude)
            keep_these &= ~columns.isin(excluded_columns)

        return self._obj.loc[:, keep_these]

    def _add_feature_type(
        self, col: str, feature_type: Union[FeatureType, str]
    ) -> bool:
        """Adds a feature type

        Parameters
        ----------
        col : str
            The column name.
        feature_type : Union[FeatureType, str]
            The feature type to add.

        Returns
        -------
        bool
            Whether add succeeded.
        """
        if col not in self._obj.columns:
            raise ValueError(f"Column {col} is not found.")
        return self._obj[col].ads._add_feature_type(feature_type)

    def _remove_feature_type(
        self, col: str, feature_type: Union[FeatureType, str]
    ) -> None:
        """Removes a feature type

        Parameters
        ----------
        col : str
            column name
        feature_type : Union[FeatureType, str]
            feature type

        Returns
        -------
        None
            Nothing
        """
        if col not in self._obj.columns:
            raise ValueError(f"Column {col} is not found.")
        self._obj[col].ads._remove_feature_type(feature_type)

[docs]    def model_schema(self, max_col_num: int = DATA_SCHEMA_MAX_COL_NUM):
        """
        Generates schema from the dataframe.

        Parameters
        ----------
        max_col_num : int, optional. Defaults to 1000
            The maximum column size of the data that allows to auto generate schema.

        Examples
        --------
        >>> df = pd.read_csv('./orcl_attrition.csv', usecols=['Age', 'Attrition'])
        >>> schema = df.ads.model_schema()
        >>> schema
        Schema:
            - description: Attrition
            domain:
                constraints: []
                stats:
                count: 1470
                unique: 2
                values: String
            dtype: object
            feature_type: String
            name: Attrition
            required: true
            - description: Age
            domain:
                constraints: []
                stats:
                25%: 31.0
                50%: 37.0
                75%: 44.0
                count: 1470.0
                max: 61.0
                mean: 37.923809523809524
                min: 19.0
                std: 9.135373489136732
                values: Integer
            dtype: int64
            feature_type: Integer
            name: Age
            required: true
        >>> schema.to_dict()
        {'Schema': [{'dtype': 'object',
            'feature_type': 'String',
            'name': 'Attrition',
            'domain': {'values': 'String',
                'stats': {'count': 1470, 'unique': 2},
                'constraints': []},
            'required': True,
            'description': 'Attrition'},
            {'dtype': 'int64',
            'feature_type': 'Integer',
            'name': 'Age',
            'domain': {'values': 'Integer',
                'stats': {'count': 1470.0,
                'mean': 37.923809523809524,
                'std': 9.135373489136732,
                'min': 19.0,
                '25%': 31.0,
                '50%': 37.0,
                '75%': 44.0,
                'max': 61.0},
                'constraints': []},
            'required': True,
            'description': 'Age'}]}

        Returns
        -------
        ads.feature_engineering.schema.Schema
            data schema.

        Raises
        ------
        ads.feature_engineering.schema.DataSizeTooWide
            If the number of columns of input data exceeds `max_col_num`.
        """
        if max_col_num and len(self._obj.columns) > max_col_num:
            raise schema.DataSizeTooWide(
                data_col_num=len(self._obj.columns), max_col_num=max_col_num
            )

        sc = schema.Schema()
        for i, col in enumerate(self._obj.columns):
            domain = schema.Domain()
            try:
                domain = self._obj[col].ads.feature_domain()
            except:
                pass

            sc.add(
                schema.Attribute(
                    self._obj[col].dtype.name,
                    domain.values,
                    col,
                    domain=domain,
                    description=str(col),
                    required=bool(~self._obj[col].isnull().any()),
                    order=i,
                )
            )

        return sc

    def __getattr__(self, attr):
        attr_map = dict()
        for col in self._obj:
            try:
                val = self._obj[col].ads.__getattr__(attr)
            except:
                val = None  # if a column does not have the request attr, return None
            attr_map[col] = val

        if any(
            callable(x) for x in list(attr_map.values())
        ):  # check if attr is a callable, and if yes apply args to all cols.

            def func(*args, **kwargs):
                out = dict()
                for k, v in attr_map.items():
                    out[k] = v(*args, **kwargs) if v else None
                return out

            return func

        return attr_map