Source code for ads.dataset.dataset_with_target

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from __future__ import absolute_import, print_function

import abc
import importlib
from collections import defaultdict
from numbers import Number
from typing import Tuple, Union

import pandas as pd
from ads.common import utils, logger
from ads.common.data import ADSData
from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)
from ads.dataset import helper
from ads.dataset.dataset import ADSDataset
from ads.dataset.feature_engineering_transformer import FeatureEngineeringTransformer
from ads.dataset.feature_selection import FeatureImportance
from ads.dataset.helper import (
    DatasetDefaults,
    deprecate_default_value, 
    deprecate_variable, 
    generate_sample,
    get_target_type,
    is_text_data,
)
from ads.dataset.label_encoder import DataFrameLabelEncoder
from ads.dataset.pipeline import TransformerPipeline
from ads.dataset.progress import DummyProgressBar
from ads.dataset.recommendation import Recommendation
from ads.dataset.recommendation_transformer import RecommendationTransformer
from ads.dataset.target import TargetVariable
from ads.type_discovery.typed_feature import (
    CategoricalTypedFeature,
    ContinuousTypedFeature,
    DateTimeTypedFeature,
    DocumentTypedFeature,
    GISTypedFeature,
    OrdinalTypedFeature,
    TypedFeature,
)
from sklearn.model_selection import train_test_split
from pandas.io.formats.printing import pprint_thing
from sklearn.preprocessing import FunctionTransformer
from abc import ABCMeta


class ADSDatasetWithTarget(ADSDataset, metaclass=ABCMeta):
    """
    This class provides APIs for preparing a dataset for modeling.
    """

    def __init__(
        self,
        df,
        target,
        sampled_df=None,
        shape=None,
        target_type=None,
        sample_max_rows=-1,
        type_discovery=True,
        types={},
        parent=None,
        name="",
        metadata=None,
        transformer_pipeline=None,
        description=None,
        progress=DummyProgressBar(),
        **kwargs,
    ):
        self.recommendation_transformer = None
        if shape is None:
            shape = df.shape
        if sampled_df is None:
            sampled_df = generate_sample(
                df,
                shape[0],
                DatasetDefaults.sampling_confidence_level,
                DatasetDefaults.sampling_confidence_interval,
                **kwargs,
            )
        if parent is None:
            cols = sampled_df.columns.tolist()
            cols.insert(0, cols.pop(cols.index(target)))
            ADSDataset.__init__(
                self,
                df,
                sampled_df[[*cols]],
                shape,
                name=name,
                description=description,
                type_discovery=type_discovery,
                types=types,
                progress=progress,
                metadata=metadata,
                transformer_pipeline=transformer_pipeline,
                sample_max_rows=sample_max_rows,
            )
        else:
            self.__dict__ = parent.__dict__.copy()
            cols = self.sampled_df.columns.tolist()
            cols.insert(0, cols.pop(cols.index(target)))
            self.sampled_df = parent.sampled_df[[*cols]]
            # if the parent has already been built, just reorder the columns so the
            # plot for the target is displayed first
            if parent.correlation is None:
                self.corr_futures = parent.corr_futures
            else:
                corr_cols = parent.sampled_df.select_dtypes(
                    exclude=["object"]
                ).columns.values.tolist()
                corr_cols.insert(0, corr_cols.pop(corr_cols.index(target)))
                self.correlation = parent.correlation.reindex(corr_cols)[corr_cols]
            self.feature_types = parent.feature_types
            self.feature_dist_html_dict = {}
            if len(parent.feature_dist_html_dict) > 0:
                parent_feature_dist_html_dict = parent.feature_dist_html_dict.copy()
                self.feature_dist_html_dict = {
                    target: parent_feature_dist_html_dict.pop(target)
                }
                self.feature_dist_html_dict.update(parent_feature_dist_html_dict)

        # drop all rows where the target is nan
        target = target.strip().replace(" ", "_")

        #
        # as an optimization, only dropna and regenerate the sample when the target
        # has na values
        #
        if self.df[target].isna().sum():
            #
            # remove rows for which the target is null
            #
            self.df = self.df.dropna(subset=[target])

            #
            # we cannot simply drop null values from the sampled_df after a change
            # to the df - we must rebuild the sample from the new df
            #
            self.sampled_df = helper.generate_sample(
                self.df,
                sampled_df.shape[0],
                helper.DatasetDefaults.sampling_confidence_level,
                helper.DatasetDefaults.sampling_confidence_interval,
            )

            #
            # after regenerating the sample we need to move the target back to the head
            #
            cols = self.sampled_df.columns.tolist()
            cols.insert(0, cols.pop(cols.index(target)))
            self.sampled_df = self.sampled_df[[*cols]]

        if target_type is None:
            target_type = get_target_type(target, sampled_df, **kwargs)
        self.target = TargetVariable(self, target, target_type)

        # remove the target from type discovery conversion
        for step in self.transformer_pipeline.steps:
            if (
                step[0] == "type_discovery"
                and self.target.name in step[1].kw_args["dtypes"]
            ):
                step[1].kw_args["dtypes"].pop(self.target.name)

    @staticmethod
    def from_dataframe(
        df: pd.DataFrame,
        target: str,
        sampled_df: pd.DataFrame = None,
        shape: Tuple[int, int] = None,
        target_type: TypedFeature = None,
        positive_class=None,
        **init_kwargs,
    ):
        from ads.dataset.classification_dataset import (
            BinaryClassificationDataset,
            BinaryTextClassificationDataset,
            MultiClassClassificationDataset,
            MultiClassTextClassificationDataset,
        )
        from ads.dataset.forecasting_dataset import ForecastingDataset
        from ads.dataset.regression_dataset import RegressionDataset

        if sampled_df is None:
            sampled_df = generate_sample(
                df,
                (shape or df.shape)[0],
                DatasetDefaults.sampling_confidence_level,
                DatasetDefaults.sampling_confidence_interval,
                **init_kwargs,
            )

        if target not in df:
            raise ValueError(
                f"{target} column doesn't exist in data frame. Specify a valid one instead."
            )

        if target_type is None:
            target_type = get_target_type(target, sampled_df, **init_kwargs)

        if len(df[target].dropna()) == 0:
            logger.warning(
                "It is not recommended to use an empty column as the target variable."
            )
            raise ValueError(
                "We do not support using empty columns as the chosen target"
            )
        if utils.is_same_class(target_type, ContinuousTypedFeature):
            return RegressionDataset(
                df=df,
                sampled_df=sampled_df,
                target=target,
                target_type=target_type,
                shape=shape,
                **init_kwargs,
            )
        elif utils.is_same_class(
            target_type, DateTimeTypedFeature
        ) or df.index.dtype.name.startswith("datetime"):
            return ForecastingDataset(
                df=df,
                sampled_df=sampled_df,
                target=target,
                target_type=target_type,
                shape=shape,
                **init_kwargs,
            )
        # Adding ordinal typed feature, but ultimately we should rethink how we want to model this type
        elif utils.is_same_class(
            target_type, CategoricalTypedFeature
        ) or utils.is_same_class(target_type, OrdinalTypedFeature):
            if target_type.meta_data["internal"]["unique"] == 2:
                if is_text_data(sampled_df, target):
                    return BinaryTextClassificationDataset(
                        df=df,
                        sampled_df=sampled_df,
                        target=target,
                        shape=shape,
                        target_type=target_type,
                        positive_class=positive_class,
                        **init_kwargs,
                    )
                return BinaryClassificationDataset(
                    df=df,
                    sampled_df=sampled_df,
                    target=target,
                    shape=shape,
                    target_type=target_type,
                    positive_class=positive_class,
                    **init_kwargs,
                )
            else:
                if is_text_data(sampled_df, target):
                    return MultiClassTextClassificationDataset(
                        df=df,
                        sampled_df=sampled_df,
                        target=target,
                        target_type=target_type,
                        shape=shape,
                        **init_kwargs,
                    )
                return MultiClassClassificationDataset(
                    df=df,
                    sampled_df=sampled_df,
                    target=target,
                    target_type=target_type,
                    shape=shape,
                    **init_kwargs,
                )
        elif (
            utils.is_same_class(target_type, DocumentTypedFeature)
            or "text" in target_type["type"]
            or "text" in target
        ):
            raise ValueError(
                f"The column {target} cannot be used as the target column."
            )
        elif (
            utils.is_same_class(target_type, GISTypedFeature)
            or "coord" in target_type["type"]
            or "coord" in target
        ):
            raise ValueError(
                f"The column {target} cannot be used as the target column."
            )
        # This is to catch constant columns that are boolean. Added as a fix for pd.isnull(), and datasets with a
        # binary target, but only data on one instance
        elif target_type and target_type["low_level_type"] == "bool":
            return BinaryClassificationDataset(
                df=df,
                sampled_df=sampled_df,
                target=target,
                shape=shape,
                target_type=target_type,
                positive_class=positive_class,
                **init_kwargs,
            )
        raise ValueError(
            f"Unable to identify problem type. Specify the data type of {target} using 'types'. "
            f"For example, types = {{{target}: 'category'}}"
        )

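    # A minimal usage sketch (illustrative only; the DataFrame and column names
    # below are hypothetical, not part of this module). from_dataframe inspects
    # the discovered type of the target column and returns the matching
    # problem-type subclass, e.g. a BinaryClassificationDataset for a two-class
    # categorical target:
    #
    #   import pandas as pd
    #   df = pd.DataFrame({"feature": [1, 2, 3, 4], "label": ["a", "b", "a", "b"]})
    #   ds = ADSDatasetWithTarget.from_dataframe(df, target="label", positive_class="a")
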
    def rename_columns(self, columns):
        """
        Returns a dataset with columns renamed.
        """
        if isinstance(columns, list):
            assert len(columns) == len(
                self.columns.values
            ), "columns length does not match the dataset"
            columns = dict(zip(self.columns.values, columns))
        assert isinstance(columns, dict)
        new_target = None
        if self.target.name in columns:
            new_target = columns[self.target.name]
        return self.rename(columns=columns, _new_target=new_target)

    def select_best_features(self, score_func=None, k=12):
        """
        Return a new dataset containing only the top k features.

        Parameters
        ----------
        k: int, default 12
            The top 'k' features to select.
        score_func: function
            Scoring function to use to rank the features. This scoring function should take a 2d array X(features)
            and an array like y(target) and return a numeric score for each feature in the same order as X.

        Notes
        -----
        See also https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_regression.html
        and https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.f_classif.html

        Examples
        --------
        >>> ds = DatasetBrowser("sklearn").open("iris")
        >>> ds_small = ds.select_best_features(k=2)
        """
        tf = self._get_best_features_transformer(score_func=score_func, k=k)
        return self._build_new_dataset(
            tf[1].transform(self.df),
            tf[1].transform(self.sampled_df),
            transformers=tf,
        )

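    # A minimal usage sketch, assuming a classification dataset named `ds`; the
    # scoring function comes from scikit-learn (see the links in the docstring above):
    #
    #   from sklearn.feature_selection import f_classif
    #   ds_small = ds.select_best_features(score_func=f_classif, k=2)
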
    def auto_transform(
        self,
        correlation_threshold: float = 0.7,
        frac: float = 1.0,
        sample_size=1.0,
        correlation_methods: Union[str, list] = "pearson",
    ):
        """
        Return a transformed dataset with several optimizations applied automatically.
        The optimizations include:

        - Dropping constant and primary key columns, which have no predictive quality,
        - Imputation, to fill in missing values in noisy data:

            - For continuous variables, fill with mean if less than 40% is missing, else drop,
            - For categorical variables, fill with most frequent if less than 40% is missing, else drop,

        - Dropping strongly co-correlated columns that tend to produce less generalizable models.

        Parameters
        ----------
        correlation_threshold: float, defaults to 0.7. It must be between 0 and 1, inclusive
            The correlation threshold where columns with correlation higher than the threshold will be
            considered as strongly co-correlated and recommended to be taken care of.
        frac: Is superseded by sample_size
        sample_size: float, defaults to 1.0. Float, Range -> (0, 1]
            What fraction of the data should be used in the calculation?
        correlation_methods: Union[list, str], defaults to 'pearson'

            - 'pearson': Use Pearson's Correlation between continuous features,
            - 'cramers v': Use Cramer's V correlations between categorical features,
            - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
            - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

            Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].

        Returns
        -------
        transformed_dataset : ADSDatasetWithTarget

        Examples
        --------
        >>> ds_clean = ds.auto_transform()
        """
        frac = deprecate_default_value(
            frac,
            None,
            1,
            "<code>frac=None</code> is deprecated. Use <code>sample_size=1.0</code> instead.",
            FutureWarning,
        )

        if frac != 1.0:
            deprecate_frac = deprecate_variable(
                frac,
                sample_size,
                "<code>frac</code> is superseded by <code>sample_size</code>.",
                DeprecationWarning,
            )
            if sample_size == 1.0:
                sample_size = deprecate_frac

        if correlation_threshold > 1 or correlation_threshold < 0:
            raise AssertionError("correlation_threshold has to be between 0 and 1.")

        with utils.get_progress_bar(5) as progress:
            df, sampled_df, transformer_pipeline = self._transform(
                progress=progress,
                correlation_threshold=correlation_threshold,
                frac=sample_size,
                correlation_methods=correlation_methods,
            )
            return self._build_new_dataset(
                df,
                sampled_df=sampled_df,
                transformers=transformer_pipeline.steps,
                progress=progress,
            )

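    # A minimal usage sketch (the threshold and sample size are illustrative, not
    # recommended values): tighten the co-correlation threshold, compute the
    # correlations on half of the data, then inspect the resulting transform DAG:
    #
    #   ds_clean = ds.auto_transform(
    #       correlation_threshold=0.9,
    #       correlation_methods=["pearson", "cramers v"],
    #       sample_size=0.5,
    #   )
    #   ds_clean.visualize_transforms()
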
    def visualize_transforms(self):
        """
        Render a representation of the dataset's transform DAG.
        """
        helper.visualize_transformation(
            self.transformer_pipeline,
            text="- rows: {}\\l- columns: {}\\l".format(*self.shape),
        )

    def _suggested_code(self, action, recommendation_type, variable):
        if action == "Drop":
            return ".drop_columns([{}])".format('"' + variable + '"')
        if action == "Do nothing":
            return ""
        if "Drop " in action:
            return ".drop_columns([{}])".format('"' + action.split(" ")[1] + '"')
        if action == "Down-sample":
            return ".down_sample()"
        if action == "Up-sample":
            if importlib.util.find_spec("imblearn") is None:
                return ".up_sample(sampler='default') \\n `pip install imbalanced-learn` to use default up-sampler."
            else:
                return ".up_sample(sampler='default')"
        if recommendation_type == "positive_class" and action != "Do nothing":
            return ".set_positive_class({}, missing_value=False)".format(
                '"' + action + '"'
            )
        if recommendation_type == "imputation":
            fill_val = helper.get_fill_val(
                self.feature_types, variable, action, constant="constant"
            )
            fill_val = (
                fill_val if isinstance(fill_val, Number) else '"' + fill_val + '"'
            )
            return ".fillna({}{}: {}{})".format(
                "{", '"' + variable + '"', fill_val, "}"
            )
        else:
            return ""

    def suggest_recommendations(
        self,
        correlation_methods: Union[str, list] = "pearson",
        print_code: bool = True,
        correlation_threshold: float = 0.7,
        overwrite: bool = None,
        force_recompute: bool = False,
        frac: float = 1.0,
        sample_size: float = 1.0,
        **kwargs,
    ):
        """
        Returns a pandas dataframe with suggestions for dataset optimization. This includes:

        - Identifying constant and primary key columns, which have no predictive quality,
        - Imputation, to fill in missing values in noisy data:

            - For continuous variables, fill with mean if less than 40% is missing, else drop,
            - For categorical variables, fill with most frequent if less than 40% is missing, else drop,

        - Identifying strongly co-correlated columns that tend to produce less generalizable models,
        - Automatically balancing the dataset for classification problems using up or down sampling.

        Parameters
        ----------
        correlation_methods: Union[list, str], defaults to 'pearson'

            - 'pearson': Use Pearson's Correlation between continuous features,
            - 'cramers v': Use Cramer's V correlations between categorical features,
            - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
            - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

            Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].
        print_code: bool, defaults to True
            Print Python code for the suggested actions.
        correlation_threshold: float, defaults to 0.7. It must be between 0 and 1, inclusive
            The correlation threshold where columns with correlation higher than the threshold will be
            considered as strongly co-correlated and recommended to be taken care of.
        frac: Is superseded by sample_size
        sample_size: float, defaults to 1.0. Float, Range -> (0, 1]
            What fraction of the data should be used in the calculation?
        overwrite:
            Is deprecated and replaced by force_recompute.
        force_recompute: bool, defaults to False

            - If False, it calculates the correlation matrix if there is no cached correlation matrix. Otherwise,
              it returns the cached correlation matrix.
            - If True, it calculates the correlation matrix regardless of whether there is a cached result or not.

        Returns
        -------
        suggestion dataframe : pandas.DataFrame

        Examples
        --------
        >>> suggestion_df = ds.suggest_recommendations(correlation_threshold=0.7)
        """
        frac = deprecate_default_value(
            frac,
            None,
            1,
            "<code>frac=None</code> is deprecated. Use <code>sample_size=1.0</code>.",
            FutureWarning,
        )

        if frac != 1.0:
            deprecate_frac = deprecate_variable(
                frac,
                sample_size,
                "<code>frac</code> is superseded by <code>sample_size</code>.",
                DeprecationWarning,
            )
            if sample_size == 1.0:
                sample_size = deprecate_frac

        force_recompute = deprecate_variable(
            overwrite,
            force_recompute,
            "<code>overwrite=None</code> is deprecated. Use <code>force_recompute</code> instead.",
            DeprecationWarning,
        )

        recommended = self._get_recommendations_transformer(
            auto_transform=False,
            correlation_threshold=correlation_threshold,
            correlation_methods=correlation_methods,
            force_recompute=force_recompute,
            frac=sample_size,
            **kwargs,
        ).fit(self.sampled_df)

        if len(recommended.reco_dict_) == 0:
            logger.info("No recommendations.")
            return pd.DataFrame()

        column_names = [
            "Message",
            "Variables",
            "Action",
            "Selected Action",
            "Recommendation Type",
        ]
        df_dict = defaultdict(list)
        for recommendation_type, column_dict in recommended.reco_dict_.items():
            if recommendation_type == "constant_column":
                n_constant = len(column_dict)
                df_dict["Recommendation Type"].extend(
                    [recommendation_type] * n_constant
                )
                df_dict["Variables"].extend(column_dict)
                df_dict["Message"].extend(["Constant Column"] * n_constant)
                df_dict["Action"].extend(["Drop"] * n_constant)
                df_dict["Selected Action"].extend(["Drop"] * n_constant)
                continue
            for column, details_dict in column_dict.items():
                max_length = len(details_dict["Action"])
                for key, value in details_dict.items():
                    if isinstance(value, list):
                        df_dict[key].extend(value)
                    else:
                        df_dict[key].extend([value] * max_length)
                df_dict["Recommendation Type"].extend(
                    [recommendation_type] * max_length
                )
                df_dict["Variables"].extend([column] * max_length)

        suggestions_df = pd.DataFrame.from_dict(df_dict)[column_names]
        suggestions_df["Code"] = suggestions_df.apply(
            lambda x: self._suggested_code(
                x["Action"], x["Recommendation Type"], x["Variables"]
            ),
            axis=1,
        )
        suggestion_df = (
            suggestions_df.drop(columns=["Recommendation Type"])
            .rename(columns={"Selected Action": "Suggested"})
            .set_index(["Message", "Variables", "Suggested", "Action"])
            .fillna("")
        )

        if print_code:
            columns_to_impute = {}
            columns_to_drop = []
            consolidated_code = ""
            suggestion_df_ = suggestion_df.reset_index()
            suggested_code = suggestion_df_.loc[
                suggestion_df_.Suggested == suggestion_df_.Action
            ].Code.unique()
            for code in suggested_code:
                if ".drop_columns" in code:
                    columns_to_drop.append(code.split("[")[1].split("]")[0][1:-1])
                elif ".fillna" in code:
                    impute_pair = code.split("{")[1].split("}")[0]
                    columns_to_impute[impute_pair.split(":")[0].replace('"', "")] = (
                        float(impute_pair.split(":")[1].strip())
                        if impute_pair.split(":")[1].strip().replace(".", "").isdigit()
                        else impute_pair.split(":")[1].strip().replace('"', "")
                    )
                else:
                    consolidated_code += code
            consolidated_code = (
                "No more!" if len(consolidated_code) == 0 else consolidated_code
            )
            logger.info(f"Suggested columns to drop: {columns_to_drop}.")
            logger.info(f"Suggested columns to impute: {columns_to_impute}.")
            logger.info(f"Others: {consolidated_code}.")
        return suggestion_df

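    # A minimal usage sketch: the returned frame is indexed by
    # (Message, Variables, Suggested, Action) with a "Code" column, so the
    # suggested actions can be filtered out of it directly:
    #
    #   suggestion_df = ds.suggest_recommendations(correlation_threshold=0.7)
    #   flat = suggestion_df.reset_index()
    #   suggested_code = flat.loc[flat.Suggested == flat.Action, "Code"].unique()
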
    @runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
    def get_recommendations(
        self,
        correlation_methods: str = "pearson",
        correlation_threshold: float = 0.7,
        frac: float = 1.0,
        sample_size: float = 1.0,
        overwrite: bool = None,
        force_recompute: bool = False,
        display_format: str = "widget",
    ):
        """
        Generate recommendations for dataset optimization. This includes:

        - Identifying constant and primary key columns, which have no predictive quality,
        - Imputation, to fill in missing values in noisy data:

            - For continuous variables, fill with mean if less than 40% is missing, else drop,
            - For categorical variables, fill with most frequent if less than 40% is missing, else drop,

        - Identifying strongly co-correlated columns that tend to produce less generalizable models,
        - Automatically balancing the dataset for classification problems using up or down sampling.

        Parameters
        ----------
        correlation_methods: Union[list, str], defaults to 'pearson'

            - 'pearson': Use Pearson's Correlation between continuous features,
            - 'cramers v': Use Cramer's V correlations between categorical features,
            - 'correlation ratio': Use Correlation Ratio Correlation between categorical and continuous features,
            - 'all': Is equivalent to ['pearson', 'cramers v', 'correlation ratio'].

            Or a list containing any combination of these methods, for example, ['pearson', 'cramers v'].
        correlation_threshold: float, defaults to 0.7. It must be between 0 and 1, inclusive
            The correlation threshold where columns with correlation higher than the threshold will be
            considered as strongly co-correlated and recommended to be taken care of.
        frac: Is superseded by sample_size
        sample_size: float, defaults to 1.0. Float, Range -> (0, 1]
            What fraction of the data should be used in the calculation?
        overwrite:
            Is deprecated and replaced by force_recompute.
        force_recompute: bool, defaults to False

            - If False, it calculates the correlation matrix if there is no cached correlation matrix. Otherwise,
              it returns the cached correlation matrix.
            - If True, it calculates the correlation matrix regardless of whether there is a cached result or not.
        display_format: string, defaults to 'widget'
            Should be either 'widget' or 'table'. If 'widget', a GUI-style interface is popped out;
            if 'table', a table of suggestions is shown.
        """
        frac = deprecate_default_value(
            frac,
            None,
            1,
            "<code>frac=None</code> is superseded by <code>sample_size=1.0</code>.",
            FutureWarning,
        )

        if frac != 1.0:
            deprecate_frac = deprecate_variable(
                frac,
                sample_size,
                "<code>frac</code> is superseded by <code>sample_size</code>.",
                DeprecationWarning,
            )
            if sample_size == 1.0:
                sample_size = deprecate_frac

        force_recompute = deprecate_variable(
            overwrite,
            force_recompute,
            "<code>overwrite=None</code> is deprecated. Use <code>force_recompute</code> instead.",
            DeprecationWarning,
        )

        if display_format == "widget":
            recommended = Recommendation(
                self,
                self._get_recommendations_transformer(
                    auto_transform=False,
                    correlation_threshold=correlation_threshold,
                    correlation_methods=correlation_methods,
                    frac=sample_size,
                    force_recompute=force_recompute,
                ).fit(self.sampled_df),
            )
            if len(recommended.reco_dict) == 0:
                logger.info("No recommendations.")
            return recommended
        elif display_format == "table":
            df_suggestion = self.suggest_recommendations(
                correlation_threshold=correlation_threshold,
                frac=sample_size,
                force_recompute=force_recompute,
            )

            from IPython.display import HTML, display

            display(
                HTML(
                    df_suggestion.to_html()
                    .replace(" `", "<code>")
                    .replace("` ", "</code>")
                    .replace("\\n", "<br>")
                )
            )

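    # A minimal usage sketch: 'widget' returns an interactive Recommendation
    # object for use in a notebook, while 'table' renders the output of
    # suggest_recommendations(); both paths run under the IPython runtime
    # dependency declared on the method:
    #
    #   ds.get_recommendations(display_format="table")
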
    def get_transformed_dataset(self):
        """
        Return the transformed dataset with the recommendations applied.

        This method should be called after applying the recommendations using the
        Recommendation#show_in_notebook() API.
        """
        if hasattr(self, "new_ds"):
            return self.new_ds
        logger.info(
            "Use `get_recommendations()` to view or update recommendations, or `auto_transform()` first."
        )
        logger.warning(
            "`get_transformed_dataset` is deprecated and will be removed in a future release."
        )
        return

    def type_of_target(self):
        """
        Return the target type for the dataset.

        Returns
        -------
        target_type: TypedFeature
            an object of TypedFeature

        Examples
        --------
        >>> ds = ds.set_target('target_class')
        >>> assert(ds.type_of_target() == 'categorical')
        """
        return self.target.type

    def train_test_split(self, test_size=0.1, random_state=utils.random_state):
        """
        Splits the dataset into train and test data.

        Parameters
        ----------
        test_size: Union[float, int], optional, default=0.1
        random_state: Union[int, RandomState], optional, default=None

            - If int, random_state is the seed used by the random number generator;
            - If RandomState instance, random_state is the random number generator;
            - If None, the random number generator is the RandomState instance used by np.random.

        Returns
        -------
        train_data, test_data: tuple
            tuple of ADSData instances

        Examples
        --------
        >>> ds = DatasetFactory.open("data.csv")
        >>> train, test = ds.train_test_split()
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self.df.drop(self.target.name, axis=1),
            self.df[self.target.name],
            test_size=test_size,
            train_size=1 - test_size,
            random_state=random_state,
        )
        train = ADSData.build(
            X=X_train, y=y_train, name="Train Data", dataset_type=self.__class__
        )
        train.transformer_pipeline = self.transformer_pipeline
        test = ADSData.build(
            X=X_test, y=y_test, name="Test Data", dataset_type=self.__class__
        )
        return train, test

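    # A minimal usage sketch, following the docstring example ("data.csv" and the
    # target name are placeholders); the returned ADSData objects expose X and y,
    # and the train split carries the dataset's transformer pipeline:
    #
    #   ds = DatasetFactory.open("data.csv").set_target("label")
    #   train, test = ds.train_test_split(test_size=0.2)
    #   X_train, y_train = train.X, train.y
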
    def train_validation_test_split(
        self, test_size=0.1, validation_size=0.1, random_state=utils.random_state
    ):
        """
        Splits the dataset into train, validation and test data.

        Parameters
        ----------
        test_size: Union[float, int], optional, default=0.1
        validation_size: Union[float, int], optional, default=0.1
        random_state: Union[int, RandomState], optional, default=None

            - If int, random_state is the seed used by the random number generator;
            - If RandomState instance, random_state is the random number generator;
            - If None, the random number generator is the RandomState instance used by np.random.

        Returns
        -------
        train_data, validation_data, test_data: tuple
            tuple of ADSData instances

        Examples
        --------
        >>> ds = DatasetFactory.open("data.csv")
        >>> train, valid, test = ds.train_validation_test_split()
        """
        train, test = self.train_test_split(
            test_size=test_size, random_state=random_state
        )
        X_train, X_valid, y_train, y_valid = train_test_split(
            train.X, train.y, test_size=validation_size, random_state=random_state
        )
        train.X = X_train
        train.y = y_train
        valid = ADSData.build(
            X=X_valid, y=y_valid, name="Validation Data", dataset_type=self.__class__
        )
        return train, valid, test

""" Internal methods """ def __repr__(self): rows, cols = self.shape return f"{self.__class__.__name__}(target: {self.target.name}) {rows:,} rows, {cols:,} columns" def _transform( self, progress=DummyProgressBar(), fix_imbalance=True, correlation_threshold=0.7, frac=None, correlation_methods="pearson", ): progress.update("Building the transformer pipeline") if self.recommendation_transformer is None: transformer_pipeline = TransformerPipeline( steps=[ ( "recommendations", self._get_recommendations_transformer( fix_imbalance=fix_imbalance, correlation_threshold=correlation_threshold, frac=frac, correlation_methods=correlation_methods, ), ), ( "feature_engineering", FeatureEngineeringTransformer( feature_metadata=self.feature_types ), ), ] ) else: # recommendations are already generated using get_recommendations().show_in_notebook() API transformer_pipeline = TransformerPipeline( steps=[ ( "feature_engineering", FeatureEngineeringTransformer( feature_metadata=self.feature_types ), ) ] ) transformer_pipeline.steps = [ ("recommendations", self.recommendation_transformer) ] + transformer_pipeline.steps sampled_df = self.sampled_df.copy() self.recommendation_transformer = None df = self.df.copy() for step in transformer_pipeline.steps: progress.update("Applying transformation for %s" % step[0]) sampled_df = step[1].fit_transform(sampled_df) df = step[1].transform(df, progress=progress, fit_transform=True) return df, sampled_df, transformer_pipeline def _get_best_features(self, score_func=None, k=12): if isinstance(self.target.type, DateTimeTypedFeature): return FeatureImportance._get_feature_ranking( self.sampled_df.copy(), self.target.name, self.type_of_target(), score_func=score_func, k=k, ) else: return FeatureImportance._get_feature_ranking( self.sampled_df.copy(), self.target.name, self.type_of_target(), score_func=score_func, k=k, ) def _get_best_features_transformer(self, score_func=None, k=12): feature_set = self._get_best_features(k=k, score_func=score_func)[ "features" ].tolist() def _select_features(df, feature_names, target): if target in df.columns: feature_names = feature_names + [target] return df[feature_names] return ( "select_{0}_best_features".format(k), FunctionTransformer( func=_select_features, validate=False, kw_args={"feature_names": feature_set, "target": self.target.name}, ).fit(self.sampled_df), ) def _get_recommendations_transformer( self, fix_imbalance=True, auto_transform=True, correlation_threshold=0.7, **kwargs, ): force_recompute = kwargs.pop("force_recompute", False) frac = kwargs.pop("frac", 1) correlation_methods = kwargs.pop("correlation_methods", "pearson") return RecommendationTransformer( feature_metadata=self.feature_types, correlation=self.corr( force_recompute=force_recompute, frac=frac, correlation_methods=correlation_methods, **kwargs, ), target=self.target.name, target_type=self.target.type, is_balanced=self.target.is_balanced(), feature_ranking=self._get_best_features(k=len(self.sampled_df)), fix_imbalance=fix_imbalance, len=self.__len__(), auto_transform=auto_transform, correlation_threshold=correlation_threshold, )