Source code for ads.feature_engineering.accessor.dataframe_accessor

#!/usr/bin/env python
# -*- coding: utf-8 -*--

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The ADS accessor for the Pandas DataFrame.
The accessor will be initialized with the pandas object the user is interacting with.

Examples
--------
>>> import pandas as pd
>>> from ads.feature_engineering.accessor.dataframe_accessor import ADSDataFrameAccessor
>>> from ads.feature_engineering.feature_type.continuous import Continuous
>>> from ads.feature_engineering.feature_type.creditcard import CreditCard
>>> from ads.feature_engineering.feature_type.string import String
>>> from ads.feature_engineering.feature_type.base import Tag
>>> df = pd.DataFrame({'Name': ['Alex'], 'Credit Card': ["4532640527811543"]})
>>> df.ads.feature_type
{'Name': ['string'], 'Credit Card': ['string']}
>>> df.ads.feature_type_description
          Column   Feature Type                        Description
------------------------------------------------------------------
0           Name         string    Type representing string values.
1    Credit Card         string    Type representing string values.
>>> df.ads.default_type
{'Name': 'string', 'Credit Card': 'string'}
>>> df.ads.feature_type = {'Name':['string', Tag('abc')]}
>>> df.ads.tags
{'Name': ['abc']}
>>> df.ads.feature_type = {'Credit Card':['credit_card']}
>>> df.ads.feature_select(include=['credit_card'])
                    Credit Card
-------------------------------
0	          4532640527811543
"""

from typing import Any, Dict, List, Union

import numpy as np
import pandas as pd
from ads.common.utils import DATA_SCHEMA_MAX_COL_NUM
from ads.data_labeling.mixin.data_labeling import DataLabelingAccessMixin
from ads.dbmixin.db_pandas_accessor import DBAccessMixin
from ads.feature_engineering import schema
from ads.feature_engineering.accessor.mixin.eda_mixin import EDAMixin
from ads.feature_engineering.accessor.mixin.feature_types_mixin import (
    ADSFeatureTypesMixin,
)
from ads.feature_engineering.feature_type.base import FeatureType
from pandas.core.dtypes.common import is_list_like


@pd.api.extensions.register_dataframe_accessor("ads")
class ADSDataFrameAccessor(
    ADSFeatureTypesMixin, EDAMixin, DBAccessMixin, DataLabelingAccessMixin
):
    """ADS accessor for the Pandas DataFrame.

    Attributes
    ----------
    columns: List[str]
        The column labels of the DataFrame.
    tags(self) -> Dict[str, List[str]]
        Gets the dictionary of user defined tags for the dataframe.
    default_type(self) -> Dict[str, str]
        Gets the map of columns and associated default feature type names.
    feature_type(self) -> Dict[str, List[str]]
        Gets the list of registered feature types.
    feature_type_description(self) -> pd.DataFrame
        Gets the list of registered feature types in a DataFrame format.

    Methods
    -------
    sync(self, src: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame
        Syncs feature types of current DataFrame with that from src.
    feature_select(self, include: List[Union[FeatureType, str]] = None, exclude: List[Union[FeatureType, str]] = None) -> pd.DataFrame
        Gets the subset of the DataFrame's columns based on the column
        feature types.
    help(self, prop: str = None) -> None
        Provides docstring for affordable methods and properties.

    Examples
    --------
    >>> import pandas as pd
    >>> from ads.feature_engineering.accessor.dataframe_accessor import ADSDataFrameAccessor
    >>> from ads.feature_engineering.feature_type.continuous import Continuous
    >>> from ads.feature_engineering.feature_type.creditcard import CreditCard
    >>> from ads.feature_engineering.feature_type.string import String
    >>> from ads.feature_engineering.feature_type.base import Tag
    >>> df = pd.DataFrame({'Name': ['Alex'], 'Credit Card': ["4532640527811543"]})
    >>> df.ads.feature_type
    {'Name': ['string'], 'Credit Card': ['string']}
    >>> df.ads.feature_type_description
              Column   Feature Type                        Description
    ------------------------------------------------------------------
    0           Name         string    Type representing string values.
    1    Credit Card         string    Type representing string values.
    >>> df.ads.default_type
    {'Name': 'string', 'Credit Card': 'string'}
    >>> df.ads.feature_type = {'Name':['string', Tag('abc')]}
    >>> df.ads.tags
    {'Name': ['abc']}
    >>> df.ads.feature_type = {'Credit Card':['credit_card']}
    >>> df.ads.feature_select(include=['credit_card'])
                        Credit Card
    -------------------------------
    0              4532640527811543
    """

    def __init__(self, pandas_obj) -> None:
        """Initializes ADS Pandas DataFrame Accessor.

        Parameters
        ----------
        pandas_obj : pandas.DataFrame
            Pandas dataframe

        Raises
        ------
        ValueError
            If provided DataFrame has duplicate columns.
        """
        if len(set(pandas_obj.columns)) != len(pandas_obj.columns):
            raise ValueError(
                "Failed to initialize a DataFrame accessor. "
                "Duplicate column found."
            )
        # `_obj` is assigned before the mixin initializers run.
        self._obj = pandas_obj
        super().__init__()
        self.columns = self._obj.columns
        self._info = None

    def info(self) -> Any:
        """Gets information about the dataframe.

        Returns
        -------
        Any
            The information about the dataframe.
        """
        return self._info

    @property
    def _feature_type(self) -> Dict[str, List[FeatureType]]:
        """Gets the map of columns and associated feature types.
        Key is column name and value is list of feature types.
        """
        return {
            self._obj[col].name: self._obj[col].ads._feature_type for col in self._obj
        }

    @property
    def _default_type(self) -> Dict[str, FeatureType]:
        """Gets the map of columns and associated default feature types.
        Key is column name and value is a default feature type.
        """
        return {
            self._obj[col].name: self._obj[col].ads._default_type for col in self._obj
        }

    @property
    def tags(self) -> Dict[str, List[str]]:
        """Gets the dictionary of user defined tags for the dataframe.
        Key is column name and value is list of tag names.

        Returns
        -------
        Dict[str, List[str]]
            The map of columns and associated tags.
        """
        return {self._obj[col].name: self._obj[col].ads.tags for col in self._obj}

    @property
    def default_type(self) -> Dict[str, str]:
        """Gets the map of columns and associated default feature type names.

        Returns
        -------
        Dict[str, str]
            The dictionary where key is column name and value is the name
            of default feature type.
        """
        return {k: v.name for k, v in self._default_type.items()}

    @property
    def feature_type(self) -> Dict[str, List[str]]:
        """Gets the list of registered feature types.

        Returns
        -------
        Dict[str, List[str]]
            The dictionary where key is column name and value is list of
            associated feature type names.
        """
        return {col.name: col.ads.feature_type for _, col in self._obj.items()}

    @property
    def feature_type_description(self) -> pd.DataFrame:
        """Gets the list of registered feature types in a DataFrame format.

        Returns
        -------
        :class:`pandas.DataFrame`
            One block of rows per column of the dataframe, with a leading
            ``Column`` column holding the column label.

        Examples
        --------
        >>> df.ads.feature_type_description
                  Column   Feature Type                        Description
        ------------------------------------------------------------------
        0           City         string    Type representing string values.
        1   Phone Number         string    Type representing string values.
        """
        base_columns = ["Column", "Feature Type", "Description"]
        frames = []
        for col in self._obj:
            series_feature_type_df = self._obj[col].ads.feature_type_description
            series_feature_type_df.insert(0, "Column", col)
            frames.append(series_feature_type_df)

        if not frames:
            # A dataframe without columns yields an empty description table.
            return pd.DataFrame([], columns=base_columns)

        # `DataFrame.append` was removed in pandas 2.0; a single `pd.concat`
        # is the supported equivalent and avoids re-copying on every column.
        result_df = pd.concat(frames, ignore_index=True)
        # Keep the three base columns first, followed by any extra columns the
        # per-series frames may carry.
        extra_columns = [c for c in result_df.columns if c not in base_columns]
        return result_df.reindex(columns=base_columns + extra_columns)

    @feature_type.setter
    def feature_type(
        self, feature_type_map: Dict[str, List[Union[FeatureType, str]]]
    ) -> None:
        """Sets feature types for the DataFrame.

        Parameters
        ----------
        feature_type_map : Dict[str, List[Union[FeatureType, str]]]
            The map of feature types where key is column name and value is
            list of feature types.

        Returns
        -------
        None
            Nothing.
        """
        for col, feature_types in feature_type_map.items():
            self._obj[col].ads.feature_type = feature_types

    def sync(self, src: Union[pd.DataFrame, pd.Series]) -> pd.DataFrame:
        """Syncs feature types of current DataFrame with that from src.

        Syncs feature types of current dataframe with that from src, where
        src can be a dataframe or a series. In either case, only columns
        with matched names are synced.

        Parameters
        ----------
        src: `pd.DataFrame` | `pd.Series`
            The source to sync from.

        Returns
        -------
        :class:`pandas.DataFrame`
            Synced dataframe (the dataframe this accessor is attached to).
        """
        for _, col in self._obj.items():
            col.ads.sync(src)
        # The signature and docstring promise the synced dataframe; previously
        # nothing was returned.
        return self._obj

    def _extract_columns_of_target_types(
        self, target_types: List[Union[FeatureType, str]]
    ) -> List:
        """Returns all the column names that are of the target types from the
        feature_type dictionary.

        Parameters
        ----------
        target_types: list
            A list of target feature types, can be either feature type names
            or feature type classes. ``None`` is treated as "no target types".

        Returns
        -------
        List[str]
            The list of column names. A column matching several target types
            appears once per matching type.
        """
        if target_types is None:
            # Previously this fell through to iterating over None (TypeError).
            return []

        target_names = np.unique(
            [self._get_type(feature_type).name for feature_type in target_types]
        )
        # `feature_type` is a property that walks every column; compute it once
        # instead of once per target type.
        feature_type_map = self.feature_type
        return [
            name
            for target_name in target_names
            for name, feature_types in feature_type_map.items()
            if target_name in feature_types
        ]

    def feature_select(
        self,
        include: List[Union[FeatureType, str]] = None,
        exclude: List[Union[FeatureType, str]] = None,
    ) -> pd.DataFrame:
        """Returns a subset of the DataFrame's columns based on the column feature_types.

        Parameters
        ----------
        include: List[Union[FeatureType, str]], optional
            Defaults to None. A list of FeatureType subclass or str to be included.
            A single (non list-like) value is accepted as well.
        exclude: List[Union[FeatureType, str]], optional
            Defaults to None. A list of FeatureType subclass or str to be excluded.
            A single (non list-like) value is accepted as well.

        Raises
        ------
        ValueError
            If both of include and exclude are empty
        ValueError
            If include and exclude overlap, i.e. the same feature type is
            both included and excluded.

        Returns
        -------
        :class:`pandas.DataFrame`
            The subset of the frame including the feature types in include
            and excluding the feature types in exclude.
        """
        # Materialize list-likes so the emptiness check below is well defined
        # for array-likes (a multi-element ndarray/Index has no truth value).
        if is_list_like(include):
            include = list(include)
        if is_list_like(exclude):
            exclude = list(exclude)

        if not (include or exclude):
            raise ValueError("at least one of include or exclude must be nonempty")

        # Wrap single values; None becomes an empty selection.
        if not is_list_like(include):
            include = (include,) if include is not None else ()
        if not is_list_like(exclude):
            exclude = (exclude,) if exclude is not None else ()

        # unify the feature types to their (unique) str representation
        include = frozenset(
            self._get_type(feature_type).name for feature_type in include
        )
        exclude = frozenset(
            self._get_type(feature_type).name for feature_type in exclude
        )

        # can't both include AND exclude the same feature type
        if not include.isdisjoint(exclude):
            raise ValueError(f"include and exclude overlap on {(include & exclude)}")

        # We raise when both include and exclude are empty.
        # Hence, we can just shrink the columns we want to keep.
        keep_these = np.full(self._obj.shape[1], True)
        columns = self._obj.columns

        if include:
            included_columns = self._extract_columns_of_target_types(include)
            keep_these &= columns.isin(included_columns)

        if exclude:
            excluded_columns = self._extract_columns_of_target_types(exclude)
            keep_these &= ~columns.isin(excluded_columns)

        return self._obj.loc[:, keep_these]

    def _add_feature_type(
        self, col: str, feature_type: Union[FeatureType, str]
    ) -> bool:
        """Adds a feature type to a column.

        Parameters
        ----------
        col : str
            The column name.
        feature_type : Union[FeatureType, str]
            The feature type to add.

        Returns
        -------
        bool
            Whether add succeeded (the result reported by the column's
            series accessor).

        Raises
        ------
        ValueError
            If the column is not present in the dataframe.
        """
        if col not in self._obj.columns:
            raise ValueError(f"Column {col} is not found.")
        return self._obj[col].ads._add_feature_type(feature_type)

    def _remove_feature_type(
        self, col: str, feature_type: Union[FeatureType, str]
    ) -> None:
        """Removes a feature type from a column.

        Parameters
        ----------
        col : str
            column name
        feature_type : Union[FeatureType, str]
            feature type

        Returns
        -------
        None
            Nothing

        Raises
        ------
        ValueError
            If the column is not present in the dataframe.
        """
        if col not in self._obj.columns:
            raise ValueError(f"Column {col} is not found.")
        self._obj[col].ads._remove_feature_type(feature_type)

    def model_schema(self, max_col_num: int = DATA_SCHEMA_MAX_COL_NUM):
        """
        Generates schema from the dataframe.

        Parameters
        ----------
        max_col_num : int, optional. Defaults to `DATA_SCHEMA_MAX_COL_NUM`
            The maximum column size of the data that allows to auto generate
            schema. A falsy value (``0`` or ``None``) disables the check.

        Examples
        --------
        >>> df = pd.read_csv('./orcl_attrition.csv', usecols=['Age', 'Attrition'])
        >>> schema = df.ads.model_schema()
        >>> schema
        Schema:
        - description: Attrition
          domain:
            constraints: []
            stats:
              count: 1470
              unique: 2
            values: String
          dtype: object
          feature_type: String
          name: Attrition
          required: true
        - description: Age
          domain:
            constraints: []
            stats:
              25%: 31.0
              50%: 37.0
              75%: 44.0
              count: 1470.0
              max: 61.0
              mean: 37.923809523809524
              min: 19.0
              std: 9.135373489136732
            values: Integer
          dtype: int64
          feature_type: Integer
          name: Age
          required: true
        >>> schema.to_dict()
        {'Schema': [{'dtype': 'object', 'feature_type': 'String', 'name': 'Attrition',
                     'domain': {'values': 'String',
                                'stats': {'count': 1470, 'unique': 2},
                                'constraints': []},
                     'required': True, 'description': 'Attrition'},
                    {'dtype': 'int64', 'feature_type': 'Integer', 'name': 'Age',
                     'domain': {'values': 'Integer',
                                'stats': {'count': 1470.0, 'mean': 37.923809523809524,
                                          'std': 9.135373489136732, 'min': 19.0,
                                          '25%': 31.0, '50%': 37.0, '75%': 44.0,
                                          'max': 61.0},
                                'constraints': []},
                     'required': True, 'description': 'Age'}]}

        Returns
        -------
        ads.feature_engineering.schema.Schema
            data schema.

        Raises
        ------
        ads.feature_engineering.schema.DataSizeTooWide
            If the number of columns of input data exceeds `max_col_num`.
        """
        column_count = len(self._obj.columns)
        if max_col_num and column_count > max_col_num:
            raise schema.DataSizeTooWide(
                data_col_num=column_count, max_col_num=max_col_num
            )

        sc = schema.Schema()
        for i, col in enumerate(self._obj.columns):
            series = self._obj[col]
            try:
                domain = series.ads.feature_domain()
            except Exception:
                # Best effort: fall back to a default domain when it cannot be
                # derived for this column. `Exception` (not a bare except)
                # so KeyboardInterrupt/SystemExit still propagate.
                domain = schema.Domain()

            sc.add(
                schema.Attribute(
                    series.dtype.name,
                    domain.values,
                    col,
                    domain=domain,
                    description=str(col),
                    # `not` instead of `~`: bitwise inversion of a plain Python
                    # bool yields -2/-1, which are both truthy.
                    required=not series.isnull().any(),
                    order=i,
                )
            )
        return sc

    def __getattr__(self, attr):
        """Delegates unknown attributes to the series accessor of every column.

        Called only when regular attribute lookup on the dataframe accessor
        fails. The attribute is requested from each column's ``ads`` accessor.

        Parameters
        ----------
        attr : str
            The name of the requested attribute.

        Returns
        -------
        Union[Dict, Callable]
            A dictionary mapping column label to the attribute value (``None``
            for columns whose lookup failed). If at least one value is
            callable, a function is returned instead; calling it forwards the
            arguments to every column and returns a dictionary of results.

        Raises
        ------
        AttributeError
            If `attr` is ``_obj``, i.e. the accessor is not initialized yet.
        """
        if attr == "_obj":
            # Without this guard, touching `self._obj` below before it has
            # been assigned would re-enter __getattr__ and recurse forever.
            raise AttributeError(
                f"'{type(self).__name__}' object has no attribute '{attr}'"
            )

        attr_map = {}
        for col in self._obj:
            try:
                val = self._obj[col].ads.__getattr__(attr)
            except Exception:
                # if a column does not have the requested attr, return None
                val = None
            attr_map[col] = val

        # check if attr is a callable, and if yes apply args to all cols.
        if any(callable(x) for x in attr_map.values()):

            def func(*args, **kwargs):
                return {
                    k: (v(*args, **kwargs) if v else None)
                    for k, v in attr_map.items()
                }

            return func
        return attr_map