Source code for ads.dataset.target

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import numpy as np
from sklearn import preprocessing

from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)
from ads.common import utils
from ads.type_discovery.typed_feature import CategoricalTypedFeature
from ads.type_discovery.typed_feature import ContinuousTypedFeature
from ads.type_discovery.typed_feature import OrdinalTypedFeature

# Default cut-off for the absolute skewness of a continuous target:
# TargetVariable.is_balanced reports the target as balanced only while its
# absolute skewness is strictly below this value.
ABSOLUTE_SKEWNESS_THRESHOLD = 0.5
# Between 0 and 1: ratio of the least to the most represented class.
# TargetVariable.is_balanced reports a categorical target as balanced only
# when its ratio is strictly greater than this value.
CLASS_IMBALANCE_THRESHOLD = 0.5


class TargetVariable:
    """
    This class provides target specific APIs.

    Attributes
    ----------
    sampled_ds
        The dataset object passed to the constructor. It is expected to expose
        a ``sampled_df`` data frame and a ``plot`` method.
    name
        Name of the target column.
    type
        Typed-feature instance describing the target.
    target_vals: list or None
        Unique target values for a categorical target, otherwise None.
    class_imbalance_ratio
        Count of the least represented class divided by the count of the most
        represented class for a categorical target, otherwise None.
    skewness
        Absolute skewness of the target for a continuous target, otherwise
        None.
    numeric_columns
        Names of the numeric columns of ``sampled_ds.sampled_df``.
    """

    @runtime_dependency(module="scipy", install_from=OptionalDependency.VIZ)
    def __init__(self, sampled_ds, target, target_type):
        """
        Collect basic statistics about the target column.

        Parameters
        ----------
        sampled_ds
            Dataset object exposing ``sampled_df`` and ``plot``.
        target: str
            Name of the target column in ``sampled_ds.sampled_df``.
        target_type
            Typed-feature instance describing the target; it selects which
            statistics are computed.
        """
        self.sampled_ds = sampled_ds
        self.name = target
        self.type = target_type

        # Define every statistic up front so that each attribute exists
        # whatever the target type is; only the relevant ones are filled in.
        self.target_vals = None
        self.class_imbalance_ratio = None
        self.skewness = None

        if isinstance(self.type, CategoricalTypedFeature):
            target_column = self.sampled_ds.sampled_df[target]
            self.target_vals = target_column.unique().tolist()
            val_counts = target_column.value_counts()
            self.class_imbalance_ratio = val_counts.min() / val_counts.max()
        elif isinstance(self.type, ContinuousTypedFeature):
            target_column = self.sampled_ds.sampled_df[target]
            try:
                self.skewness = np.abs(target_column.skew())
            except TypeError:
                # Presumably the column dtype is not numeric (e.g. object);
                # retry on a float cast of the same values.
                self.skewness = np.abs(target_column.astype("float").skew())
        # Otherwise the type can also be DateTimeTypedFeature
        # IPAddressTypedFeature PhoneNumberTypedFeature GISTypedFeature
        # AddressTypedFeature DocumentTypedFeature ZipcodeTypedFeature
        # UnknownTypedFeature ConstantTypedFeature CreditCardTypedFeature
        # OrdinalTypedFeature: no statistic is computed for those.

        self.numeric_columns = (
            self.sampled_ds.sampled_df._get_numeric_data().columns.values
        )

    def show_in_notebook(self, feature_names=None):  # pragma: no cover
        """
        Plot target distribution or target versus feature relation.

        Parameters
        ----------
        feature_names: list, Optional
            Plot target against a list of features.
            Display target distribution if feature_names is not provided.
        """
        if not utils.is_notebook():
            print("show_in_notebook called but not in notebook environment")
            return

        if feature_names is None:
            self.sampled_ds.plot(self.name, verbose=False).show_in_notebook()
            return

        # Only the first plot is requested with verbose=True.
        verbose = True
        for feature_name in feature_names:
            self.sampled_ds.plot(
                feature_name, self.name, verbose=verbose
            ).show_in_notebook()
            verbose = False

    def is_balanced(
        self,
        skewness_threshold=ABSOLUTE_SKEWNESS_THRESHOLD,
        class_imbalance_threshold=CLASS_IMBALANCE_THRESHOLD,
    ):
        """
        Returns True if the target is balanced, False otherwise.

        A continuous target is balanced when its absolute skewness is strictly
        below ``skewness_threshold``. A categorical target is balanced when
        its least / most represented class ratio is strictly above
        ``class_imbalance_threshold``. Any other target type is always
        reported as balanced.

        Parameters
        ----------
        skewness_threshold: float, Optional
            Upper bound on the absolute skewness of a continuous target.
        class_imbalance_threshold: float, Optional
            Lower bound on the class ratio of a categorical target.

        Returns
        -------
        is_balanced: bool
        """
        # bool() converts the NumPy boolean produced by the comparison into a
        # plain Python bool, as documented.
        if isinstance(self.type, ContinuousTypedFeature):
            return bool(self.skewness < skewness_threshold)
        if isinstance(self.type, CategoricalTypedFeature):
            return bool(self.class_imbalance_ratio > class_imbalance_threshold)
        return True