#!/usr/bin/env python
# -*- coding: utf-8; -*-
# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import pandas as pd
import warnings
from ads.common import utils, logger
from ads.dataset import helper
from ads.dataset.exception import ValidationError
from ads.dataset.dataset_with_target import ADSDatasetWithTarget
from sklearn.preprocessing import FunctionTransformer
from ads.dataset.helper import deprecate_variable, deprecate_default_value
class ClassificationDataset(ADSDatasetWithTarget):
    """
    Dataset for classification task
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        """
        Initialize the dataset by delegating to ``ADSDatasetWithTarget``.

        Parameters
        ----------
        df, sampled_df, target, target_type, shape
            Forwarded unchanged, by keyword, to ``ADSDatasetWithTarget.__init__``.
        **kwargs
            Additional keyword arguments, forwarded as well.
        """
        ADSDatasetWithTarget.__init__(
            self,
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **kwargs,
        )

    def convert_to_text_classification(self, text_column: str):
        """
        Builds a new dataset with the given text column as the only feature besides target.

        Parameters
        ----------
        text_column: str
            Feature name to use for text classification task

        Returns
        -------
        ds: TextClassificationDataset
            Dataset with one text feature and a classification target.
            A ``BinaryTextClassificationDataset`` is built when
            ``utils.is_same_class(self, BinaryClassificationDataset)`` is true,
            otherwise a ``MultiClassTextClassificationDataset``.

        Examples
        --------
        >>> review_ds = DatasetFactory.open("review_data.csv")
        >>> ds_text_class = review_ds.convert_to_text_classification('reviews')
        """
        target_name = self.target.name

        def _select_features(df, feature_names, target):
            # Keep the target only when the incoming frame actually has it, so
            # the step works on frames with and without the target column and
            # never selects the target twice.
            leading = [target] if target in df.columns else []
            return df[leading + feature_names]

        transformer = (
            f"convert_to_text_classification using feature {text_column}",
            FunctionTransformer(
                func=_select_features,
                validate=False,
                kw_args={
                    # Features only: the target is added by _select_features.
                    "feature_names": [text_column],
                    "target": target_name,
                },
            ).fit(self.sampled_df),
        )

        if utils.is_same_class(self, BinaryClassificationDataset):
            dataset_cls = BinaryTextClassificationDataset
        else:
            dataset_cls = MultiClassTextClassificationDataset

        columns = [target_name, text_column]
        new_ds = dataset_cls(
            self.df[columns],
            self.sampled_df[columns],
            target_name,
            self.target.type,
            # Two columns remain: the target and the text feature.
            (len(self.df), 2),
            **self.init_kwargs,
        )
        new_ds.transformer_pipeline = self._update_transformer_pipeline(transformer)
        return new_ds

    def down_sample(self, sampler=None):
        """
        Fixes an imbalanced dataset by down-sampling.

        Parameters
        ----------
        sampler: An instance of SamplerMixin
            Should implement fit_resample(X,y) method. If None, does random down sampling.

        Returns
        -------
        down_sampled_ds: ClassificationDataset
            A down-sampled dataset.

        Examples
        --------
        >>> ds = DatasetFactory.open("some_data.csv")
        >>> ds_balanced_small = ds.down_sample()
        """
        target_name = self.target.name
        if sampler is None:
            resampled = helper.down_sample(self.df, target_name)
        else:
            # A custom sampler receives the features and the target separately.
            resampled = helper.sample(
                sampler,
                self.df.drop(target_name, axis=1),
                self.df[target_name],
            )
        return self._build_new_dataset(resampled)

    def up_sample(self, sampler="default"):
        """
        Fixes imbalanced dataset by up-sampling

        Parameters
        ----------
        sampler: An instance of SamplerMixin
            Should implement fit_resample(X,y) method.
            If 'default', either SMOTE or random sampler will be used

        Returns
        -------
        up_sampled_ds: ClassificationDataset
            an up-sampled dataset

        Examples
        --------
        >>> ds = DatasetFactory.open("some_data.csv")
        >>> ds_balanced_large = ds.up_sample()
        """
        resampled = helper.up_sample(
            self.df,
            self.target.name,
            sampler=sampler,
            feature_types=self.feature_types,
        )
        return self._build_new_dataset(resampled)
class BinaryClassificationDataset(ClassificationDataset):
    """
    Dataset for binary classification
    """

    def __init__(
        self, df, sampled_df, target, target_type, shape, positive_class=None, **kwargs
    ):
        """
        Initialize the dataset, optionally mapping the target column to booleans.

        Parameters
        ----------
        df, sampled_df, target, target_type, shape
            Forwarded to ``ClassificationDataset.__init__``.
        positive_class : optional
            When not None, the ``target`` column of ``df`` and ``sampled_df`` is
            replaced by the result of comparing each value with ``positive_class``.
            The frames passed in are modified in place.
        **kwargs
            Additional keyword arguments forwarded to ``ClassificationDataset.__init__``.
        """
        if positive_class is not None:

            def _is_positive(value):
                # map positive_class to True
                return value == positive_class

            df[target] = df[target].map(_is_positive)
            # If both arguments are the same object the column is already
            # boolean; mapping it again would compare True/False with
            # positive_class and corrupt the target.
            if sampled_df is not df:
                sampled_df[target] = sampled_df[target].map(_is_positive)
        ClassificationDataset.__init__(
            self, df, sampled_df, target, target_type, shape, **kwargs
        )

    def set_positive_class(self, positive_class, missing_value=False):
        """
        Return new dataset with values in target column mapped to True or False
        in accordance with the specified positive label.

        Parameters
        ----------
        positive_class : same dtype as target
            The target label which should be identified as positive outcome from model.
        missing_value : bool
            missing values will be converted to this

        Returns
        -------
        dataset: same type as the caller

        Raises
        ------
        ValidationError
            if the positive_class is not present in target

        Examples
        --------
        >>> ds = DatasetFactory.open("iris.csv")
        >>> ds_with_target = ds.set_target('class')
        >>> ds_with_pos_class = ds_with_target.set_positive_class('setosa')
        """
        if positive_class not in self.target.target_vals:
            raise ValidationError(
                "Positive label '%s' not in target values '%s'"
                % (positive_class, self.target.target_vals)
            )
        # Explicit conditional rather than `cond and a or b`: a missing value
        # always becomes `missing_value`, instead of falling through to a
        # comparison (which yields pd.NA, not False, for pd.NA inputs).
        return self.assign_column(
            self.target.name,
            lambda x: missing_value if pd.isnull(x) else x == positive_class,
        )
class MultiClassClassificationDataset(ClassificationDataset):
    """
    Dataset for multi-class classification
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        """
        Initialize the dataset; every argument is handed to the parent
        ``ClassificationDataset`` constructor unchanged.
        """
        super().__init__(
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **kwargs,
        )
class BinaryTextClassificationDataset(BinaryClassificationDataset):
    """
    Dataset for binary text classification
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        """
        Initialize the dataset; every argument is handed to the parent
        ``BinaryClassificationDataset`` constructor unchanged.
        """
        super().__init__(
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **kwargs,
        )

    def auto_transform(self):
        """
        Automatically chooses the most effective dataset transformation.

        For this dataset type nothing is transformed: a message is logged and
        the dataset itself is returned.
        """
        message = "No optimizations."
        logger.info(message)
        return self

    def select_best_features(self, score_func=None, k=12):
        """
        Automatically chooses the best features and removes the rest.

        For this dataset type no selection is performed: ``score_func`` and
        ``k`` are ignored, a message is logged and the dataset itself is
        returned.
        """
        message = (
            "There are an insufficient number of features to do feature selection."
        )
        logger.info(message)
        return self
class MultiClassTextClassificationDataset(MultiClassClassificationDataset):
    """
    Dataset for multi-class text classification
    """

    def __init__(self, df, sampled_df, target, target_type, shape, **kwargs):
        """
        Initialize the dataset; every argument is handed to the parent
        ``MultiClassClassificationDataset`` constructor unchanged.
        """
        super().__init__(
            df=df,
            sampled_df=sampled_df,
            target=target,
            target_type=target_type,
            shape=shape,
            **kwargs,
        )

    def auto_transform(self):
        """
        Automatically chooses the most effective dataset transformation.

        For this dataset type nothing is transformed: a message is logged and
        the dataset itself is returned.
        """
        message = "No optimizations."
        logger.info(message)
        return self

    def select_best_features(self, score_func=None, k=12):
        """
        Automatically chooses the best features and removes the rest.

        For this dataset type no selection is performed: ``score_func`` and
        ``k`` are ignored, a message is logged and the dataset itself is
        returned.
        """
        message = (
            "There are an insufficient number of features to do feature selection."
        )
        logger.info(message)
        return self