Source code for ads.dataset.target

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import numpy as np
from sklearn import preprocessing

from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)
from ads.common import utils
from ads.type_discovery.typed_feature import CategoricalTypedFeature
from ads.type_discovery.typed_feature import ContinuousTypedFeature
from ads.type_discovery.typed_feature import OrdinalTypedFeature

# Default `skewness_threshold` of TargetVariable.is_balanced: a continuous
# target is reported as balanced when its absolute skewness is strictly below
# this value.
ABSOLUTE_SKEWNESS_THRESHOLD = 0.5
# Default `class_imbalance_threshold` of TargetVariable.is_balanced: a
# categorical target is reported as balanced when the ratio of its least to
# its most represented class (between 0 and 1) is strictly above this value.
CLASS_IMBALANCE_THRESHOLD = 0.5


class TargetVariable:
    """
    This class provides target specific APIs.

    Attributes
    ----------
    sampled_ds
        The dataset object given to the constructor. Its ``sampled_df``
        attribute is read for statistics and its ``plot`` method is used
        by ``show_in_notebook``.
    name
        Name of the target column.
    type
        Typed-feature instance describing the target.
    target_vals: list or None
        Unique values of the target column for a categorical target,
        ``None`` for every other target type.
    class_imbalance_ratio
        Count of the least represented class divided by the count of the
        most represented class for a categorical target, ``None`` otherwise.
    skewness
        Absolute skewness of the target column for a continuous target,
        ``None`` otherwise.
    numeric_columns
        Column names returned by ``sampled_df._get_numeric_data()``.
    """

    @runtime_dependency(module="scipy", install_from=OptionalDependency.VIZ)
    def __init__(self, sampled_ds, target, target_type):
        """
        Collect the target statistics that match the target type.

        Parameters
        ----------
        sampled_ds
            Dataset object exposing ``sampled_df`` and ``plot``.
        target: str
            Name of the target column in ``sampled_ds.sampled_df``.
        target_type
            Typed-feature instance describing the target.
        """
        self.sampled_ds = sampled_ds
        self.name = target
        self.type = target_type
        self.target_vals = None
        # Define both statistics up front so that reading either attribute
        # never raises AttributeError, whatever the target type is.
        self.class_imbalance_ratio = None
        self.skewness = None

        if isinstance(self.type, CategoricalTypedFeature):
            target_column = self.sampled_ds.sampled_df[target]
            self.target_vals = target_column.unique().tolist()
            val_counts = target_column.value_counts()
            self.class_imbalance_ratio = val_counts.min() / val_counts.max()
        elif isinstance(self.type, ContinuousTypedFeature):
            target_column = self.sampled_ds.sampled_df[target]
            try:
                self.skewness = np.abs(target_column.skew())
            except TypeError:
                # Retry on a float cast when skew() rejects the column's
                # current dtype.
                self.skewness = np.abs(target_column.astype("float").skew())
        # Any other type keeps skewness = None. It can also be
        # DateTimeTypedFeature IPAddressTypedFeature PhoneNumberTypedFeature
        # GISTypedFeature AddressTypedFeature DocumentTypedFeature
        # ZipcodeTypedFeature UnknownTypedFeature ConstantTypedFeature
        # CreditCardTypedFeature OrdinalTypedFeature

        self.numeric_columns = (
            self.sampled_ds.sampled_df._get_numeric_data().columns.values
        )

    def show_in_notebook(self, feature_names=None):  # pragma: no cover
        """
        Plot target distribution or target versus feature relation.

        Prints a message and returns without plotting when not running in a
        notebook environment.

        Parameters
        ----------
        feature_names: list, Optional
            Plot target against a list of features.
            Display target distribution if feature_names is not provided.
        """
        if not utils.is_notebook():
            print("show_in_notebook called but not in notebook environment")
            return

        if feature_names is None:
            self.sampled_ds.plot(self.name, verbose=False).show_in_notebook()
            return

        # verbose=True is passed for the first feature only.
        for position, feature_name in enumerate(feature_names):
            self.sampled_ds.plot(
                feature_name, self.name, verbose=(position == 0)
            ).show_in_notebook()

    def is_balanced(
        self,
        skewness_threshold=ABSOLUTE_SKEWNESS_THRESHOLD,
        class_imbalance_threshold=CLASS_IMBALANCE_THRESHOLD,
    ):
        """
        Returns True if the target is balanced, False otherwise.

        Parameters
        ----------
        skewness_threshold: float, Optional
            A continuous target is balanced when its absolute skewness is
            strictly below this value.
        class_imbalance_threshold: float, Optional
            A categorical target is balanced when its least / most
            represented class ratio is strictly above this value.

        Returns
        -------
        is_balanced: bool
            Always True for targets that are neither continuous nor
            categorical.
        """
        # bool() turns the numpy boolean produced by the comparison into the
        # plain Python bool promised by the docstring.
        if isinstance(self.type, ContinuousTypedFeature):
            return bool(self.skewness < skewness_threshold)
        if isinstance(self.type, CategoricalTypedFeature):
            return bool(self.class_imbalance_ratio > class_imbalance_threshold)
        return True