Source code for ads.feature_engineering.feature_type.continuous

#!/usr/bin/env python
# -*- coding: utf-8 -*--

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The module that represents a Continuous feature type.

Classes:
    Continuous
        The Continuous feature type.
"""
import matplotlib.pyplot as plt
import pandas as pd
from ads.feature_engineering.feature_type.base import FeatureType
from ads.feature_engineering.utils import (
    _add_missing,
    _set_seaborn_theme,
    SchemeTeal,
    _format_stat,
)
from ads.feature_engineering import schema
from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)


[docs] class Continuous(FeatureType): """ Type representing continuous values. Attributes ---------- description: str The feature type description. name: str The feature type name. warning: FeatureWarning Provides functionality to register warnings and invoke them. validator Provides functionality to register validators and invoke them. Methods -------- feature_stat(x: pd.Series) -> pd.DataFrame Generates feature statistics. feature_plot(x: pd.Series) -> plt.Axes Shows distributions of datasets using box plot. """ description = "Type representing continuous values."
[docs] @staticmethod def feature_stat(x: pd.Series) -> pd.DataFrame: """Generates feature statistics. Feature statistics include (total)count, mean, standard deviation, sample minimum, lower quartile, median, 75%, upper quartile, skew and missing(count). Examples -------- >>> cts = pd.Series([13.32, 3.32, 4.3, 2.45, 6.34, 2.25, 4.43, 3.26, np.NaN, None], name='continuous') >>> cts.ads.feature_type = ['continuous'] >>> cts.ads.feature_stat() Metric Value 0 count 10.000 1 mean 4.959 2 standard deviation 3.620 3 sample minimum 2.250 4 lower quartile 3.058 5 median 3.810 6 upper quartile 4.908 7 sample maximum 13.320 8 skew 2.175 9 missing 2.000 Returns ------- :class:`pandas.DataFrame` Summary statistics of the Series or Dataframe provided. """ df_stat = x.describe() _format_stat(df_stat) df_stat["count"] = len(x) df_stat["skew"] = x.skew() df_stat = _add_missing(x, df_stat).to_frame() df_stat.iloc[:, 0] = df_stat.iloc[:, 0].round(3) return df_stat
[docs] @staticmethod @runtime_dependency(module="seaborn", install_from=OptionalDependency.VIZ) def feature_plot(x: pd.Series) -> plt.Axes: """ Shows distributions of datasets using box plot. Examples -------- >>> cts = pd.Series([13.32, 3.32, 4.3, 2.45, 6.34, 2.25, 4.43, 3.26, np.NaN, None], name='continuous') >>> cts.ads.feature_type = ['continuous'] >>> cts.ads.feture_plot() Returns ------- matplotlib.axes._subplots.AxesSubplot Plot object for the series based on the Continuous feature type. """ col_name = x.name if x.name else "continuous" df = x.to_frame(name=col_name) df = df[pd.to_numeric(df[col_name], errors="coerce").notnull()] if len(df.index): _set_seaborn_theme() return seaborn.boxplot( x=df[col_name], width=0.2, color=SchemeTeal.AREA_DARK )
[docs] @classmethod def feature_domain(cls, x: pd.Series) -> schema.Domain: """ Generate the domain of the data of this feature type. Examples -------- >>> cts = pd.Series([13.32, 3.32, 4.3, 2.45, 6.34, 2.25, 4.43, 3.26, np.NaN, None], name='continuous') >>> cts.ads.feature_type = ['continuous'] >>> cts.ads.feature_domain() constraints: [] stats: count: 10.0 lower quartile: 3.058 mean: 4.959 median: 3.81 missing: 2.0 sample maximum: 13.32 sample minimum: 2.25 skew: 2.175 standard deviation: 3.62 upper quartile: 4.908 values: Continuous Returns ------- ads.feature_engineering.schema.Domain Domain based on the Continuous feature type. """ return schema.Domain(cls.__name__, cls.feature_stat(x).to_dict()[x.name], [])