Source code for ads.dataset.classification_dataset

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import pandas as pd
import warnings

from ads.common import utils, logger
from ads.dataset import helper
from ads.dataset.exception import ValidationError
from ads.dataset.dataset_with_target import ADSDatasetWithTarget
from sklearn.preprocessing import FunctionTransformer
from ads.dataset.helper import deprecate_variable, deprecate_default_value


class ClassificationDataset(ADSDatasetWithTarget):
    """
    Dataset for classification task
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        ADSDatasetWithTarget.__init__(
            self,
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **kwargs
        )

    def auto_transform(
        self,
        fix_imbalance: bool = True,
        correlation_threshold: float = 0.7,
        frac: float = 1.0,
        correlation_methods: str = "pearson",
    ):
        """
        Return transformed dataset with several optimizations applied automatically.
        The optimizations include:

        - Dropping constant and primary key columns, which has no predictive quality,
        - Imputation, to fill in missing values in noisy data:

            - For continuous variables, fill with mean if less than 40% is missing, else drop,
            - For categorical variables, fill with most frequent if less than 40% is missing, else drop,

        - Dropping strongly co-correlated columns that tend to produce less generalizable models,
        - Balancing dataset using up or down sampling.

        Parameters
        ----------
        fix_imbalance : bool, defaults to True.
            Fix imbalance between classes in dataset. Used only for classification datasets.
        correlation_threshold: float, defaults to 0.7.
            It must be between 0 and 1, inclusive. The correlation threshold where columns
            with correlation higher than the threshold will be considered as strongly
            co-correlated and recommended to be taken care of.
        frac: float, defaults to 1.0.
            Range -> (0, 1]. What fraction of the data should be used in the calculation?
            Passing ``None`` is deprecated and is replaced by 1 with a ``FutureWarning``.
        correlation_methods: Union[list, str], defaults to 'pearson'.

            - 'pearson': Use Pearson's Correlation between continuous features,
            - 'cramers v': Use Cramer's V correlations between categorical features,
            - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
            - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

            Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].

        Returns
        -------
        transformed_dataset : ADSDatasetWithTarget
            The dataset after transformation

        Examples
        --------
        >>> ds_clean = ds.auto_transform(correlation_threshold=0.6)
        """
        frac = deprecate_default_value(
            frac,
            None,
            1,
            "<code>frac=None</code> is deprecated. Use <code>frac=1.0</code> instead.",
            FutureWarning,
        )
        with utils.get_progress_bar(7) as progress:
            df, sampled_df, transformer_pipeline = self._transform(
                progress=progress,
                fix_imbalance=fix_imbalance,
                correlation_threshold=correlation_threshold,
                frac=frac,
                correlation_methods=correlation_methods,
            )
            # The new dataset is built while the progress bar is still open,
            # because the same progress object is handed to the builder.
            return self._build_new_dataset(
                df,
                sampled_df=sampled_df,
                transformers=transformer_pipeline.steps,
                progress=progress,
            )

    def convert_to_text_classification(self, text_column: str):
        """
        Builds a new dataset with the given text column as the only feature besides target.

        Parameters
        ----------
        text_column: str
            Feature name to use for text classification task

        Returns
        -------
        ds: TextClassificationDataset
            Dataset with one text feature and a classification target.
            A ``BinaryTextClassificationDataset`` is built when this dataset is a
            ``BinaryClassificationDataset``, otherwise a
            ``MultiClassTextClassificationDataset``.

        Examples
        --------
        >>> review_ds = DatasetFactory.open("review_data.csv")
        >>> ds_text_class = review_ds.convert_to_text_classification('reviews')
        """
        target_name = self.target.name
        selected_columns = [target_name, text_column]

        def _select_features(df, feature_names, target):
            # ``feature_names`` already lists the target. Select every column
            # exactly once and include the target only when the frame has it,
            # so the transformer neither duplicates the target column nor
            # fails with a KeyError on frames that carry no target.
            columns = [name for name in feature_names if name != target]
            if target in df.columns:
                columns = [target] + columns
            return df[columns]

        transformer = (
            f"convert_to_text_classification using feature {text_column}",
            FunctionTransformer(
                func=_select_features,
                validate=False,
                kw_args={
                    "feature_names": selected_columns,
                    "target": target_name,
                },
            ).fit(self.sampled_df),
        )

        # Both text dataset flavours take identical constructor arguments;
        # only the class differs.
        if utils.is_same_class(self, BinaryClassificationDataset):
            dataset_class = BinaryTextClassificationDataset
        else:
            dataset_class = MultiClassTextClassificationDataset

        new_ds = dataset_class(
            self.df[selected_columns],
            self.sampled_df[selected_columns],
            target_name,
            self.target.type,
            (len(self.df), 2),
            **self.init_kwargs,
        )
        new_ds.transformer_pipeline = self._update_transformer_pipeline(transformer)
        return new_ds

    def down_sample(self, sampler=None):
        """
        Fixes an imbalanced dataset by down-sampling.

        Parameters
        ----------
        sampler: An instance of SamplerMixin
            Should implement fit_resample(X,y) method. If None, does random down sampling.

        Returns
        -------
        down_sampled_ds: ClassificationDataset
            A down-sampled dataset.

        Examples
        --------
        >>> ds = DatasetFactory.open("some_data.csv")
        >>> ds_balanced_small = ds.down_sample()
        """
        target_name = self.target.name
        if sampler is None:
            resampled = helper.down_sample(self.df, target_name)
        else:
            # A custom sampler receives the features and the target separately.
            resampled = helper.sample(
                sampler,
                self.df.drop(target_name, axis=1),
                self.df[target_name],
            )
        return self._build_new_dataset(resampled)

    def up_sample(self, sampler="default"):
        """
        Fixes imbalanced dataset by up-sampling

        Parameters
        ----------
        sampler: An instance of SamplerMixin
            Should implement fit_resample(X,y) method. If 'default', either SMOTE or random sampler will be used

        Returns
        -------
        up_sampled_ds: ClassificationDataset
            an up-sampled dataset

        Examples
        --------
        >>> ds = DatasetFactory.open("some_data.csv")
        >>> ds_balanced_large = ds.up_sample()
        """
        return self._build_new_dataset(
            helper.up_sample(
                self.df,
                self.target.name,
                sampler=sampler,
                feature_types=self.feature_types,
            )
        )
class BinaryClassificationDataset(ClassificationDataset):
    """
    Dataset for binary classification
    """

    def __init__(
        self, df, sampled_df, target, target_type, shape, positive_class=None, **kwargs
    ):
        """
        Build the dataset, optionally mapping the target column to booleans.

        Parameters
        ----------
        df, sampled_df, target, target_type, shape, **kwargs
            Forwarded unchanged to ``ClassificationDataset.__init__``.
        positive_class : same dtype as target, optional
            When given, every value of the ``target`` column in both ``df`` and
            ``sampled_df`` is replaced by the result of ``value == positive_class``.
            NOTE: the replacement is written into the passed frames in place.
        """
        if positive_class is not None:
            # map positive_class to True
            def is_positive(value):
                return value == positive_class

            for frame in (df, sampled_df):
                frame[target] = frame[target].map(is_positive)
        ClassificationDataset.__init__(
            self, df, sampled_df, target, target_type, shape, **kwargs
        )

    def set_positive_class(self, positive_class, missing_value=False):
        """
        Return new dataset with values in target column mapped to True or False
        in accordance with the specified positive label.

        Parameters
        ----------
        positive_class : same dtype as target
            The target label which should be identified as positive outcome from model.
        missing_value : bool
            missing values will be converted to this

        Returns
        -------
        dataset: same type as the caller

        Raises
        ------
        ValidationError
            if the positive_class is not present in target

        Examples
        --------
        >>> ds = DatasetFactory.open("iris.csv")
        >>> ds_with_target = ds.set_target('class')
        >>> ds_with_pos_class = ds.set_positive_class('setosa')
        """
        if positive_class not in self.target.target_vals:
            raise ValidationError(
                "Positive label '%s' not in target values '%s'"
                % (positive_class, self.target.target_vals)
            )

        def to_label(value):
            # A conditional expression instead of ``cond and a or b``: with
            # ``missing_value=False`` the old form fell through to the equality
            # test, which returns ``pd.NA`` (not ``False``) for ``pd.NA`` entries.
            return missing_value if pd.isnull(value) else value == positive_class

        return self.assign_column(self.target.name, to_label)
class MultiClassClassificationDataset(ClassificationDataset):
    """
    Dataset for multi-class classification
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        # Pure pass-through: all arguments go to the parent constructor unchanged.
        super().__init__(df, sampled_df, target, target_type, shape, **kwargs)
class BinaryTextClassificationDataset(BinaryClassificationDataset):
    """
    Dataset for binary text classification
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        BinaryClassificationDataset.__init__(
            self, df, sampled_df, target, target_type, shape, **kwargs
        )

    def auto_transform(
        self,
        fix_imbalance: bool = True,
        correlation_threshold: float = 0.7,
        frac: float = 1.0,
        correlation_methods: str = "pearson",
    ):
        """
        Return this dataset unchanged; no automatic optimizations are applied.

        The parameters mirror ``ClassificationDataset.auto_transform`` so that
        callers can invoke the method the same way on any classification
        dataset. They are accepted and ignored here.

        Returns
        -------
        dataset : BinaryTextClassificationDataset
            The same instance (``self``).
        """
        logger.info("No optimizations.")
        return self

    def select_best_features(self, score_func=None, k=12):
        """
        Return this dataset unchanged; no feature selection is performed.

        Parameters
        ----------
        score_func : optional
            Unused by this implementation.
        k : int, defaults to 12
            Unused by this implementation.

        Returns
        -------
        dataset : BinaryTextClassificationDataset
            The same instance (``self``).
        """
        logger.info(
            "There are an insufficient number of features to do feature selection."
        )
        return self
class MultiClassTextClassificationDataset(MultiClassClassificationDataset):
    """
    Dataset for multi-class text classification
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        MultiClassClassificationDataset.__init__(
            self, df, sampled_df, target, target_type, shape, **kwargs
        )

    def auto_transform(
        self,
        fix_imbalance: bool = True,
        correlation_threshold: float = 0.7,
        frac: float = 1.0,
        correlation_methods: str = "pearson",
    ):
        """
        Return this dataset unchanged; no automatic optimizations are applied.

        The parameters mirror ``ClassificationDataset.auto_transform`` so that
        callers can invoke the method the same way on any classification
        dataset. They are accepted and ignored here.

        Returns
        -------
        dataset : MultiClassTextClassificationDataset
            The same instance (``self``).
        """
        logger.info("No optimizations.")
        return self

    def select_best_features(self, score_func=None, k=12):
        """
        Return this dataset unchanged; no feature selection is performed.

        Parameters
        ----------
        score_func : optional
            Unused by this implementation.
        k : int, defaults to 12
            Unused by this implementation.

        Returns
        -------
        dataset : MultiClassTextClassificationDataset
            The same instance (``self``).
        """
        logger.info(
            "There are an insufficient number of features to do feature selection."
        )
        return self