# Source code for ads.type_discovery.discrete_detector

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from __future__ import print_function, absolute_import, division

import pandas as pd
from sklearn.utils.multiclass import type_of_target

from ads.type_discovery import logger
from ads.type_discovery.abstract_detector import DiscreteDiscoveryDetector
from ads.type_discovery.typed_feature import (
    OrdinalTypedFeature,
    CategoricalTypedFeature,
)
from ads.common import utils



# [docs]
class DiscreteDetector(DiscreteDiscoveryDetector):
    """Detector for discrete columns: categorical or ordinal features.

    ``discover`` strips nulls, asks ``_get_categorical_or_ordinal`` for a
    verdict and builds the matching typed feature, or returns ``False`` when
    the column does not look discrete.
    """

    # A "multiclass" column with at most this many distinct values is always
    # reported as discrete: ordinal when its dtype name is listed by
    # utils.numeric_pandas_dtypes(), categorical otherwise.
    _max_categorical_values = 100

    def _get_categorical_or_ordinal(self, name, series):
        """Classify ``series`` as ``"categorical"``, ``"ordinal"`` or neither.

        Categoricals are unordered discrete types; ordinals are ordered
        discrete (integer-valued) types.

        Parameters
        ----------
        name : str
            Column name. Not used by the classification itself.
        series : pandas.Series
            Column values. Nulls are dropped again internally before the
            values are inspected.

        Returns
        -------
        str or bool
            ``"categorical"``, ``"ordinal"``, or ``False`` when neither
            applies.
        """
        discrete_dtype_names = ("category", "bool")

        low_level_type_name = series.dtype.name
        if low_level_type_name in discrete_dtype_names:
            return "categorical"

        # After removing nulls, the rebuilt Series might already be inferred
        # as categorical/bool (the dtype is re-inferred from a plain list).
        nulls_removed = pd.Series(list(series.loc[~series.isna()]))
        if nulls_removed.dtype.name in discrete_dtype_names:
            return "categorical"

        count_distinct = series.nunique()

        # Only the first 2000 non-null values are handed to type_of_target.
        tot = type_of_target(list(nulls_removed.head(2000)))

        if tot == "binary":
            return "categorical"
        if tot != "multiclass":
            return False

        if count_distinct <= DiscreteDetector._max_categorical_values:
            if low_level_type_name in utils.numeric_pandas_dtypes():
                return "ordinal"
            return "categorical"

        # Too many distinct values for a plain categorical: only non-negative,
        # integer-valued numeric columns are still accepted, as ordinals.
        is_int_dtype = low_level_type_name.startswith("int")
        if not (is_int_dtype or low_level_type_name.startswith("float")):
            return False

        # By summing all the values and summing all the values cast to int we
        # can tell whether every (non-negative) value is a whole number:
        # the cast truncates, so the two sums match only if nothing was cut.
        if nulls_removed.min() >= 0 and (
            is_int_dtype
            or nulls_removed.sum() == nulls_removed.astype("int64").sum()
        ):
            return "ordinal"

        return False

    def discover(self, name, series):
        """Build a discrete typed feature for ``series`` when one applies.

        Parameters
        ----------
        name : str
            Column name, used for logging and passed to the feature builder.
        series : pandas.Series
            Column values. Nulls are removed only for the type guess; the
            original ``series`` is what gets passed to ``build``.

        Returns
        -------
        object or bool
            The result of ``CategoricalTypedFeature.build`` or
            ``OrdinalTypedFeature.build``, or ``False`` when the column is
            neither categorical nor ordinal.
        """
        guessed_type = self._get_categorical_or_ordinal(
            name, series.loc[~series.isnull()]
        )

        if guessed_type == "categorical":
            logger.debug("column [{}]/[{}] categorical".format(name, series.dtype))
            return CategoricalTypedFeature.build(name, series)

        if guessed_type == "ordinal":
            logger.debug("column [{}]/[{}] ordinal".format(name, series.dtype))
            return OrdinalTypedFeature.build(name, series)

        return False




if __name__ == "__main__":
    # Ad-hoc demo: print what the detector returns for a few sample columns.
    dd = DiscreteDetector()

    # (column name, raw values, explicit dtype or None for pandas inference)
    sample_columns = (
        ("str-categorical", ["a", "a", "a", "b", "c", "a"], "category"),
        ("bool-categorical", [True, False, True, True, True, None, True], None),
        ("continuous", [None, 3.14, 12.0, 1, 2, 3, None], None),
        ("int-1-categorical", [1, 1, 1, 5, 5, 5, 5, 5, 5, 5, 9], None),
        ("int-2-categorical", [1, 1, 1, 5, 9], None),
        ("real-3-categorical", [1.0, 2.0, 3.0, 1.0, 4.0, 5.0], None),
        ("bool-categorical", [True, False, True, True, True, None, True], None),
    )

    for column_name, raw_values, explicit_dtype in sample_columns:
        column = pd.Series(raw_values, dtype=explicit_dtype)
        print(dd.discover(column_name, column))