Source code for ads.feature_engineering.accessor.mixin.eda_mixin

#!/usr/bin/env python
# -*- coding: utf-8 -*--

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
This exploratory data analysis (EDA) Mixin is used in the ADS accessor for the Pandas Dataframe.
The series of purpose-driven methods enable the data scientist to complete analysis on the dataframe.

From the accessor we have access to the pandas object the user is interacting with as well as
corresponding lists of feature types per column.
"""

import collections
import pandas as pd
import matplotlib.pyplot as plt
from ads.feature_engineering.accessor.mixin.correlation import (
    cat_vs_cat,
    cont_vs_cont,
    cat_vs_cont,
)
from ads.feature_engineering.accessor.mixin.utils import (
    _continuous_columns,
    _categorical_columns,
    _sienna_light_to_dark_color_palette,
)
from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)


[docs]class EDAMixin:
[docs] def feature_count(self) -> pd.DataFrame: """ Counts the number of columns for each feature type and each primary feature. The column of primary is the number of primary feature types that is assigned to the column. Returns ------- Dataframe with The number of columns for each feature type The number of columns for each primary feature Examples -------- >>> df.ads.feature_type {'PassengerId': ['ordinal', 'category'], 'Survived': ['ordinal'], 'Pclass': ['ordinal'], 'Name': ['category'], 'Sex': ['category']} >>> df.ads.feature_count() Feature Type Count Primary 0 category 3 2 1 ordinal 3 3 """ feature_count = collections.defaultdict(lambda: [0, 0]) for _, feature_types in self.feature_type.items(): feature_count[feature_types[0]][1] += 1 for ft in feature_types: feature_count[ft][0] += 1 return pd.DataFrame( [ (feature_type, count, primary) for feature_type, (count, primary) in feature_count.items() ], columns=["Feature Type", "Count", "Primary"], )
[docs] def feature_stat(self) -> pd.DataFrame: """Summary statistics Dataframe provided. This returns feature stats on each column using FeatureType summary method. Examples -------- >>> df = pd.read_csv('~/advanced-ds/tests/vor_datasets/vor_titanic.csv') >>> df.ads.feature_stat().head() Column Metric Value 0 PassengerId count 891.000 1 PassengerId mean 446.000 2 PassengerId standard deviation 257.354 3 PassengerId sample minimum 1.000 4 PassengerId lower quartile 223.500 Returns ------- :class:`pandas.DataFrame` Dataframe with 3 columns: name, metric, value """ stats = [] for col_name, col in self._obj.items(): for _, row in col.ads.feature_stat().iterrows(): stats.append([col_name, row["Metric"], row["Value"]]) df_stats = pd.DataFrame(stats, columns=["Column", "Metric", "Value"]) df_stats.value = df_stats.Value.round(3) return df_stats
[docs] def feature_plot(self) -> pd.DataFrame: """For every column in the dataframe plot generate a list of summary plots based on the most relevant feature type. Returns ------- :class:`pandas.DataFrame` Dataframe with 2 columns: 1. Column - feature name 2. Plot - plot object """ plots = [] for _, col in self._obj.items(): try: plot = col.ads.feature_plot() except: plot = None plots.append([col.name, plot]) return pd.DataFrame(plots, columns=["Column", "Plot"])
[docs] def pearson(self) -> pd.DataFrame: """Generate a Pearson correlation data frame for all continuous variable pairs. Gives a warning for dropped non-numerical columns. Returns ------- :class:`pandas.DataFrame` Pearson correlation data frame with the following 3 columns: 1. Column 1 (name of the first continuous column) 2. Column 2 (name of the second continuous column) 3. Value (correlation value) Note ____ Pairs will be replicated. For example for variables x and y, we'd have (x,y), (y,x) both with same correlation value. We'll also have (x,x) and (y,y) with value 1.0. """ continuous_cols = _continuous_columns(self._obj.ads.feature_type) return cont_vs_cont(self._obj[continuous_cols])
[docs] @runtime_dependency(module="seaborn", install_from=OptionalDependency.VIZ) def pearson_plot(self) -> plt.Axes: """Generate a heatmap of the Pearson correlation for all continuous variable pairs. Returns ------- Plot object Pearson correlation plot object that can be updated by the customer """ ax = plt.axes() df = ( self.pearson() .pivot_table(index="Column 1", columns="Column 2", values="Value") .rename_axis("") .rename_axis("", axis="columns") ) ax.set_title("Pearson's Correlation") return seaborn.heatmap(df, cmap=_sienna_light_to_dark_color_palette(), ax=ax)
[docs] def cramersv(self) -> pd.DataFrame: """Generate a Cramer's V correlation data frame for all categorical variable pairs. Gives a warning for dropped non-categorical columns. Returns ------- :class:`pandas.DataFrame` Cramer's V correlation data frame with the following 3 columns: 1. Column 1 (name of the first categorical column) 2. Column 2 (name of the second categorical column) 3. Value (correlation value) Note ____ Pairs will be replicated. For example for variables x and y, we would have (x,y), (y,x) both with same correlation value. We will also have (x,x) and (y,y) with value 1.0. """ categorical_cols = _categorical_columns(self._obj.ads.feature_type) return cat_vs_cat(self._obj[categorical_cols])
[docs] @runtime_dependency(module="seaborn", install_from=OptionalDependency.VIZ) def cramersv_plot(self) -> plt.Axes: """Generate a heatmap of the Cramer's V correlation for all categorical variable pairs. Gives a warning for dropped non-categorical columns. Returns ------- Plot object Cramer's V correlation plot object that can be updated by the customer """ ax = plt.axes() df = ( self.cramersv() .pivot_table(index="Column 1", columns="Column 2", values="Value") .rename_axis("") .rename_axis("", axis="columns") ) ax.set_title("Cramer's V") return seaborn.heatmap(df, cmap=_sienna_light_to_dark_color_palette(), ax=ax)
[docs] def correlation_ratio(self) -> pd.DataFrame: """Generate a Correlation Ratio data frame for all categorical-continuous variable pairs. Returns ------- :class:`pandas.DataFrame` Correlation Ratio correlation data frame with the following 3 columns: 1. Column 1 (name of the first categorical/continuous column) 2. Column 2 (name of the second categorical/continuous column) 3. Value (correlation value) Note ____ Pairs will be replicated. For example for variables x and y, we would have (x,y), (y,x) both with same correlation value. We will also have (x,x) and (y,y) with value 1.0. """ categorical_cols = _categorical_columns(self._obj.ads.feature_type) continuous_cols = _continuous_columns(self._obj.ads.feature_type) return cat_vs_cont(self._obj, categorical_cols, continuous_cols)
[docs] @runtime_dependency(module="seaborn", install_from=OptionalDependency.VIZ) def correlation_ratio_plot(self) -> plt.Axes: """Generate a heatmap of the Correlation Ratio correlation for all categorical-continuous variable pairs. Returns ------- Plot object Correlation Ratio correlation plot object that can be updated by the customer """ ax = plt.axes() df = ( self.correlation_ratio() .pivot_table(index="Column 1", columns="Column 2", values="Value") .rename_axis("") .rename_axis("", axis="columns") ) ax.set_title("Correlation Ratio") return seaborn.heatmap(df, cmap=_sienna_light_to_dark_color_palette(), ax=ax)
[docs] def warning(self) -> pd.DataFrame: """Generates a data frame that lists feature specific warnings. Returns ------- :class:`pandas.DataFrame` The list of feature specific warnings. Examples -------- >>> df.ads.warning() Column Feature Type Warning Message Metric Value -------------------------------------------------------------------------------------- 0 Age continuous Zeros Age has 38 zeros Count 38 1 Age continuous Zeros Age has 12.2% zeros Percentage 12.2% """ common_columns = ["Feature Type", "Warning", "Message", "Metric", "Value"] result_df = pd.DataFrame((), columns=["Column"] + common_columns) for col in self._obj.columns: warning_df = self._obj[col].ads.warning() if warning_df is not None: warning_df.insert(0, "Column", col) result_df = result_df.append(warning_df) return result_df.reset_index(drop=True)