Source code for ads.feature_engineering.utils

#!/usr/bin/env python

# Copyright (c) 2021, 2025 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The module that represents utility functions.

Functions:
    is_boolean(value: Any) -> bool
        Checks if value type is boolean.
"""

import re
from functools import lru_cache
from typing import Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from ads.common.card_identifier import card_identify
from ads.common.decorator.runtime_dependency import (
    OptionalDependency,
    runtime_dependency,
)
from ads.feature_engineering.dataset.zip_code_data import zip_code_dict



[docs]
class SchemeNeutral(str):
    BACKGROUND_LIGHT = "#F5F4F2"
    BACKGROUND_DARK = "#E4E1DD"
    AREA_LIGHT = "#BCB6B1"
    AREA_DARK = "#9E9892"
    LINE_LIGHT = "#665F5B"
    LINE_DARK = "#47423E"




[docs]
class SchemeTeal(str):
    BACKGROUND_LIGHT = "#F0f6f5"
    BACKGROUND_DARK = "#D6E5E5"
    AREA_LIGHT = "#9ABFBF"
    AREA_DARK = "#76A2A0"
    LINE_LIGHT = "#3E686C"
    LINE_DARK = "#2B484B"



def _tag_to_snake(name: str) -> str:
    """Conversts string to snake representation.
    1. Converts the string to the lower case.
    2. Converts all spaces to underscore.

    Parameters
    ----------
    name: string
        The name to convert.

    Returns
    -------
    str: The name converted to the snake representation.
    """
    _name = name.strip().lower()
    if len(_name.split()) > 1:
        _name = "_".join(_name.split())
    return _name


def _add_missing(x, df):
    """
    Adds count of missing values.
    """
    n_missing = pd.isnull(x.replace(r"", np.nan)).sum()
    if n_missing > 0:
        df.loc["missing"] = n_missing
    return df


def _count_unique_missing(x):
    """
    Returns the total count, unique count and count of missing values of a series.
    """
    df_stat = pd.Series(
        {"count": len(x), "unique": len(x.replace(r"", np.nan).dropna().unique())},
        name=x.name,
    ).to_frame()
    return _add_missing(x, df_stat)



[docs]
def is_boolean(value: Any) -> bool:
    """Checks if value type is boolean.

    Parameters
    ----------
    value: Any
        The value to check.

    Returns
    -------
    bool: True if value is boolean, False otherwise.
    """
    bool_values = ("yes", "y", "true", "t", "1", "no", "n", "false", "f", "0", "")
    return isinstance(value, bool) or str(value).lower() in bool_values




[docs]
def assign_issuer(cardnumber):
    if pd.isnull(cardnumber):
        return "missing"
    else:
        return card_identify().identify_issue_network(cardnumber)




[docs]
def random_color_func(
    z,
    word=None,
    font_size=None,
    position=None,
    orientation=None,
    font_path=None,
    random_state=None,
):
    """
    Returns random color function use for color_func in creating WordCloud
    """
    h = 179
    s = 23
    l = int(100.0 * float(random_state.randint(60, 120)) / 255.0)
    return f"hsl({h}, {s}%, {l}%)"



def _is_float(s: str):
    """
    Checks if given string can convert to float
    """
    return re.match(r"^-?\d+(?:\.\d+)$", s) != None


def _str_lat_long_to_point(s):
    """
    Converts input data into formated geometry point
    Return formated geometry point string or np.nan if input string is not valid
    """
    if isinstance(s, str):
        coords = s.split(",")
        if len(coords) == 2:
            lat, long = coords[0].lstrip(), coords[1].lstrip()
            if lat.startswith("("):
                lat = lat[1:]
            if long.endswith(")"):
                long = long[:-1]
            if _is_float(lat) and _is_float(long):
                return "POINT(" + long + " " + lat + ")"
    return np.nan


@runtime_dependency(module="geopandas", install_from=OptionalDependency.GEO)
def _plot_gis_scatter(df: pd.DataFrame, lon: str, lat: str):
    """
    Parameters
    ----------
    df : pd.DataFrame
        dataframe contains gis data
    lon: str
        column name mapping to longitude
    lat: str
        column name mapping to latitude
    Returns
    -------
        matplotlib.axes._subplots.AxesSubplot present location of given data on world map
    """
    if len(df.index):
        if lon in df.columns and lat in df.columns:
            fig, ax = plt.subplots(facecolor=SchemeNeutral.BACKGROUND_LIGHT)
            gdf = geopandas.GeoDataFrame(
                df, geometry=geopandas.points_from_xy(df[lon], df[lat])
            )
            world = geopandas.read_file(
                geopandas.datasets.get_path("naturalearth_lowres")
            )
            world = world[world["name"] == "United States of America"]
            ax1 = world.plot(
                ax=ax, color=SchemeNeutral.AREA_LIGHT, linewidth=0.5, edgecolor="white"
            )
            gdf.plot(ax=ax1, color=SchemeTeal.LINE_LIGHT, markersize=10)
            plt.axis("off")
            return ax1


def _to_lat_long(x: pd.Series, zipcode: pd.DataFrame):
    """
    Parameters
    ----------
    x: pd.Series
        pandas series contains zip code data
    zipcode : pd.DataFrame
        dataframe contains gis data of zip code
    Returns
    -------
    pd.DataFrame
        dataframe with 2 columes including latitude and longitude data
    """
    lats = []
    longs = []
    for s in x:
        if s in zipcode.index:
            lats.append(np.float64(zipcode.loc[s]["latitude"]))
            longs.append(np.float64(zipcode.loc[s]["longitude"]))
    df = pd.DataFrame(list(zip(lats, longs)), columns=["latitude", "longitude"])
    return df


@runtime_dependency(module="seaborn", install_from=OptionalDependency.VIZ)
def _set_seaborn_theme():
    """
    Sets seaborn figure & axes facecolor
    """
    plt.figure(facecolor=SchemeNeutral.BACKGROUND_LIGHT)
    seaborn.set(
        rc={
            "axes.facecolor": SchemeNeutral.BACKGROUND_LIGHT,
            "figure.facecolor": SchemeNeutral.BACKGROUND_LIGHT,
        }
    )


def _format_stat(stat: pd.Series):
    """
    Formats statistics row index.
    """
    stat.rename(
        {
            "std": "standard deviation",
            "min": "sample minimum",
            "25%": "lower quartile",
            "50%": "median",
            "75%": "upper quartile",
            "max": "sample maximum",
        },
        inplace=True,
    )


@lru_cache(maxsize=1)
def _zip_code():
    """Returns dataframe contain zip code, latitude and longitude data."""
    return pd.DataFrame.from_dict(
        zip_code_dict, orient="index", columns=["longitude", "latitude"]
    )