Source code for ads.feature_engineering.accessor.series_accessor

#!/usr/bin/env python
# -*- coding: utf-8 -*--

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The ADS accessor for the Pandas Series.
The accessor will be initialized with the pandas object the user is interacting with.

Examples
--------
    >>> from ads.feature_engineering.accessor.series_accessor import ADSSeriesAccessor
    >>> from ads.feature_engineering.feature_type.string import String
    >>> from ads.feature_engineering.feature_type.ordinal import Ordinal
    >>> from ads.feature_engineering.feature_type.base import Tag
    >>> series = pd.Series(['name1', 'name2', 'name3'])
    >>> series.ads.default_type
    'string'
    >>> series.ads.feature_type
    ['string']
    >>> series.ads.feature_type_description
        Feature Type                         Description
    ----------------------------------------------------
    0         string    Type representing string values.
    >>> series.ads.feature_type = ['string', Ordinal, Tag('abc')]
    >>> series.ads.feature_type
    ['string', 'ordinal', 'abc']
    >>> series1 = series.dropna()
    >>> series1.ads.sync(series)
    >>> series1.ads.feature_type
    ['string', 'ordinal', 'abc']
"""

import inspect
import logging
from typing import List, Union

import pandas as pd
from ads.feature_engineering.accessor.mixin.eda_mixin_series import EDAMixinSeries
from ads.feature_engineering.accessor.mixin.feature_types_mixin import (
    ADSFeatureTypesMixin,
)
from ads.feature_engineering.exceptions import TypeNotFound
from ads.feature_engineering.feature_type.base import FeatureType, Tag
from ads.feature_engineering.feature_type.handler.feature_validator import (
    FeatureValidator,
)
from ads.feature_engineering.feature_type.adsstring.string import ADSString

logger = logging.getLogger(__name__)


class ADSSeriesValidator:
    """Helper class to invoke registered validators at the series level.

    Attribute access is forwarded to the ``validator`` object of the first
    feature type in the list that exposes the requested name. Before the
    attribute is handed back, the series is bound to that validator through
    its ``_bind_data`` method.
    """

    def __init__(
        self, feature_type_list: "List[FeatureType]", series: pd.Series
    ) -> None:
        """Initializes ADS series validator.

        Parameters
        ----------
        feature_type_list : List[FeatureType]
            The list of feature types.
        series : `pd.Series`
            The pandas series.
        """
        self._feature_type_list = feature_type_list
        self._series = series

    def __getattr__(self, attr):
        """Makes it possible to invoke registered validators as a regular method.

        Parameters
        ----------
        attr : str
            The name of the validator to look up.

        Returns
        -------
        Any
            The attribute found on the first matching feature type validator.

        Raises
        ------
        AttributeError
            If none of the feature type validators exposes ``attr``.
        """
        # __getattr__ only runs when regular lookup failed. If one of the
        # backing attributes is the thing being looked up, the instance has
        # not been initialized (e.g. while being copied or unpickled), and
        # reading it below would re-enter this method forever.
        if attr in ("_feature_type_list", "_series"):
            raise AttributeError(attr)

        for feature_type in self._feature_type_list:
            validator = feature_type.validator
            if hasattr(validator, attr):
                validator._bind_data(self._series)
                return getattr(validator, attr)
        raise AttributeError(attr)


@pd.api.extensions.register_series_accessor("ads")
class ADSSeriesAccessor(ADSFeatureTypesMixin, EDAMixinSeries):
    """ADS accessor for Pandas Series.

    Attributes
    ----------
    name: str
        The name of Series.
    tags: List[str]
        The list of tags for the Series.

    Methods
    -------
    help(self, prop: str = None) -> None
        Provides docstring for affordable methods and properties.
    sync(self, src: Union[pd.DataFrame, pd.Series]) -> None
        Syncs feature types of current series with that from src.

    Properties
    ----------
    default_type(self) -> str
        Gets the name of default feature type for the series.
    feature_type(self) -> List[str]
        Gets the list of registered feature types for the series.
    feature_type_description(self) -> pd.DataFrame
        Gets the list of registered feature types in a DataFrame format.

    Examples
    --------
    >>> from ads.feature_engineering.accessor.series_accessor import ADSSeriesAccessor
    >>> from ads.feature_engineering.feature_type.string import String
    >>> from ads.feature_engineering.feature_type.ordinal import Ordinal
    >>> from ads.feature_engineering.feature_type.base import Tag
    >>> series = pd.Series(['name1', 'name2', 'name3'])
    >>> series.ads.default_type
    'string'
    >>> series.ads.feature_type
    ['string']
    >>> series.ads.feature_type_description
        Feature Type                         Description
    ----------------------------------------------------
    0         string    Type representing string values.
    >>> series.ads.feature_type = ['string', Ordinal, Tag('abc')]
    >>> series.ads.feature_type
    ['string', 'ordinal', 'abc']
    >>> series1 = series.dropna()
    >>> series1.ads.sync(series)
    >>> series1.ads.feature_type
    ['string', 'ordinal', 'abc']
    """

    def __init__(self, pandas_obj: pd.Series) -> None:
        """Initializes ADS Pandas Series Accessor.

        Parameters
        ----------
        pandas_obj : `pd.Series`
            The pandas series
        """
        self._obj = pandas_obj
        super().__init__()
        self._feature_type = [self._default_type]
        self.tags = []
        self.name = self._obj.name

    @property
    def _default_type(self) -> FeatureType:
        """Gets default feature type for the series.

        Returns
        -------
        FeatureType
            The default feature type for the series, resolved from its dtype.
        """
        return self._feature_type_by_dtype(self._obj.dtype)

    @property
    def default_type(self) -> str:
        """Gets the name of default feature type for the series.

        Returns
        -------
        str
            The name of default feature type.
        """
        return self._default_type.name

    @property
    def feature_type(self) -> List[str]:
        """Gets the list of registered feature types for the series.

        Returns
        -------
        List[str]
            Names of feature types, followed by the tags.

        Examples
        --------
        >>> series = pd.Series(['name1'])
        >>> series.ads.feature_type = ['name', 'string', Tag('tag for name')]
        >>> series.ads.feature_type
        ['name', 'string', 'tag for name']
        """
        type_names = [feature_type.name for feature_type in self._feature_type]
        return type_names + self.tags

    @property
    def feature_type_description(self) -> pd.DataFrame:
        """Gets the list of registered feature types in a DataFrame format.

        Only feature types for which ``_is_type_registered`` is true are
        listed; every tag is listed with the description "Tag".

        Returns
        -------
        pd.DataFrame
            The DataFrame with feature types for this series.

        Examples
        --------
        >>> series = pd.Series(['name1'])
        >>> series.ads.feature_type = ['name', 'string', Tag('Name tag')]
        >>> series.ads.feature_type_description
            Feature Type                         Description
        ----------------------------------------------------------
        0           name      Type representing name values.
        1         string    Type representing string values.
        2       Name tag                                Tag.
        """
        type_rows = tuple(
            (feature_type.name, feature_type.description)
            for feature_type in self._feature_type
            if self._is_type_registered(feature_type)
        )
        tag_rows = tuple((tag, "Tag") for tag in self.tags)
        return pd.DataFrame(
            type_rows + tag_rows, columns=["Feature Type", "Description"]
        )

    @feature_type.setter
    def feature_type(self, feature_types: List[Union[FeatureType, str, Tag]]) -> None:
        """Sets feature types for the series.

        The previous feature types and tags are discarded. The default
        feature type of the series is appended when it is not in the input.

        Parameters
        ----------
        feature_types : List[Union[FeatureType, str, Tag]]
            The list of feature types.

        Returns
        -------
        None
            Nothing.

        Raises
        ------
        TypeError: If input data has wrong format.

        Examples
        --------
        >>> series = pd.Series(['name1', 'name2', 'name3'])
        >>> series.ads.feature_type = ['name']
        >>> series.ads.feature_type
        ['name', 'string']
        >>> series.ads.feature_type = ['string', 'name']
        >>> series.ads.feature_type
        ['string', 'name']
        >>> series.ads.feature_type = []
        >>> series.ads.feature_type
        ['string']
        """
        # None is not a list either, so a single isinstance check covers it.
        if not isinstance(feature_types, list):
            raise TypeError("Argument must be a list of feature types.")

        self._feature_type = []
        self.tags = []
        for feature_type in feature_types:
            self._add_feature_type(feature_type)

        default_feature_type = self._default_type
        if default_feature_type not in self._feature_type:
            self._add_feature_type(default_feature_type)

    def sync(self, src: Union[pd.DataFrame, pd.Series]) -> None:
        """Syncs feature types of current series with that from src.

        When src is a dataframe, the column whose name matches the name of
        the current series is used; if there is no such column a warning is
        logged and nothing is synced. When src is a series, its feature types
        are copied directly. Any other kind of src is ignored.

        Parameters
        ----------
        src: (`pd.DataFrame` | `pd.Series`)
            The source to sync from.

        Returns
        -------
        None
            Nothing.

        Examples
        --------
        >>> series = pd.Series(['name1', 'name2', 'name3', None])
        >>> series.ads.feature_type = ['name']
        >>> series.ads.feature_type
        ['name', 'string']
        >>> series.dropna().ads.feature_type
        ['string']
        >>> series1 = series.dropna()
        >>> series1.ads.sync(series)
        >>> series1.ads.feature_type
        ['name', 'string']
        """
        if isinstance(src, pd.DataFrame):
            if self._obj.name not in src.columns:
                logger.warning(
                    "The source DataFrame doesn't have a clumn %s.", self._obj.name
                )
                return
            self._sync(src[self._obj.name])
        elif isinstance(src, pd.Series):
            self._sync(src)

    def _sync(self, src: pd.Series) -> None:
        """Copies all feature types and tags from src series to the current."""
        new_feature_type = list(src.ads._feature_type)
        new_tags = [Tag(tag) for tag in src.ads.tags]
        # Goes through the public setter so the default type is re-appended
        # when the source does not carry it.
        self.feature_type = new_feature_type + new_tags

    def _add_feature_type(self, feature_type: Union[FeatureType, str, Tag]) -> None:
        """Adds a feature type to the series.

        Parameters
        ----------
        feature_type : Union[FeatureType, str, Tag]
            The feature type to add.

        Returns
        -------
        None
            Nothing.
        """
        # NOTE(review): a duplicate only triggers a warning; the tag or type
        # is still appended afterwards. Confirm whether duplicates should be
        # skipped instead.
        if isinstance(feature_type, Tag):
            if feature_type.name in self.feature_type:
                logger.warning(
                    "The tag '%s' is already added to the series '%s'.",
                    feature_type.name,
                    self.name,
                )
            self.tags.append(feature_type.name)
        else:
            feature_type_cls = self._get_type(feature_type)
            if feature_type_cls.name in self.feature_type:
                logger.warning(
                    "The type '%s' is already added to the series '%s'.",
                    feature_type_cls.name,
                    self.name,
                )
            self._feature_type.append(feature_type_cls)

    def _remove_feature_type(self, feature_type: Union[FeatureType, str, Tag]) -> None:
        """Removes a feature type.

        Parameters
        ----------
        feature_type : Union[FeatureType, str, Tag]
            feature type

        Raises
        ------
        TypeNotFound
            Type does not exist for this series
        """
        tag = self._get_tag(feature_type)
        if tag:
            if tag not in self.tags:
                raise TypeNotFound(tag)
            self.tags.remove(tag)
            return

        feature_type_cls = self._get_type(feature_type)
        if feature_type_cls not in self._feature_type:
            raise TypeNotFound(feature_type_cls.__name__)
        self._feature_type.remove(feature_type_cls)

    def _get_tag(self, feature_type: Union[str, Tag]) -> str:
        """Gets the tag name for the input, or None when it is not a tag.

        A string counts as a tag only if it is already in ``self.tags``;
        a ``Tag`` instance always yields its name.
        """
        if isinstance(feature_type, str) and feature_type in self.tags:
            return feature_type
        if isinstance(feature_type, Tag):
            return feature_type.name
        return None

    def __getattr__(self, attr):
        """Resolves attributes that are not defined on the accessor itself.

        ``validator`` returns an ``ADSSeriesValidator`` for the series.
        Any other name is searched for on the registered feature types, in
        order:

        * for ``ADSString``, instance methods and properties are applied to
          every value of the series and the results are returned as a list;
        * class and static methods are wrapped so the series is passed as
          the first argument;
        * instance methods are wrapped so they are called on a fresh feature
          type instance with the series as the first argument;
        * anything else is returned as is.

        Raises
        ------
        AttributeError
            If no registered feature type provides ``attr``.
        """
        # __getattr__ only runs after regular lookup failed. If the backing
        # list itself is missing (accessor not initialized yet, or being
        # copied/unpickled), reading it below would recurse without end.
        if attr == "_feature_type":
            raise AttributeError(
                f"'{self.__class__.__name__}' does not have an attribute {attr}."
            )

        if attr == "validator":
            return ADSSeriesValidator(self._feature_type, self._obj)

        for feature_type in self._feature_type:
            if feature_type == ADSString:
                # We need to initialize first to use the plugins.
                if hasattr(feature_type("default"), attr):
                    for method in inspect.classify_class_attrs(feature_type):
                        if method.name != attr:
                            continue
                        if method.kind == "method":
                            return lambda *args, **kwargs: [
                                getattr(ADSString(v), attr)(*args, **kwargs)
                                for v in self._obj.values
                            ]
                        if method.kind == "property":
                            return [
                                getattr(feature_type(v), attr)
                                for v in self._obj.values
                            ]

            if hasattr(feature_type, attr):
                kind = next(
                    (
                        method.kind
                        for method in inspect.classify_class_attrs(feature_type)
                        if method.name == attr
                    ),
                    None,
                )
                attr_object = getattr(feature_type, attr)

                # There was one case where inspect could not track one of the
                # functions, hence the class __dict__ is checked as well.
                # __dict__ does not include inherited members, so .get() is
                # used: a missing key must mean "not static/class method"
                # rather than a KeyError.
                raw_attr = feature_type.__dict__.get(attr)
                if kind in ("class method", "static method") or isinstance(
                    raw_attr, (staticmethod, classmethod)
                ):
                    return lambda *args, **kwargs: attr_object(
                        self._obj, *args, **kwargs
                    )
                if kind == "method":
                    return lambda *args, **kwargs: getattr(feature_type(), attr)(
                        self._obj, *args, **kwargs
                    )
                return attr_object

        raise AttributeError(
            f"'{self.__class__.__name__}' does not have an attribute {attr}."
        )