# Source code for ads.feature_engineering.utils

#!/usr/bin/env python
# -*- coding: utf-8 -*--

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The module that represents utility functions.

Functions:
    is_boolean(value: Any) -> bool
        Checks if value type is boolean.
"""
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from ads.common.card_identifier import card_identify
from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)
from ads.feature_engineering.dataset.zip_code_data import zip_code_dict
from functools import lru_cache
from typing import Any


class SchemeNeutral(str):
    """Neutral color palette.

    Each constant is a hex color string; every role (background, area, line)
    comes in a light and a dark variant.
    """

    BACKGROUND_LIGHT = "#F5F4F2"
    BACKGROUND_DARK = "#E4E1DD"
    AREA_LIGHT = "#BCB6B1"
    AREA_DARK = "#9E9892"
    LINE_LIGHT = "#665F5B"
    LINE_DARK = "#47423E"
class SchemeTeal(str):
    """Teal color palette.

    Each constant is a hex color string; every role (background, area, line)
    comes in a light and a dark variant.
    """

    BACKGROUND_LIGHT = "#F0f6f5"
    BACKGROUND_DARK = "#D6E5E5"
    AREA_LIGHT = "#9ABFBF"
    AREA_DARK = "#76A2A0"
    LINE_LIGHT = "#3E686C"
    LINE_DARK = "#2B484B"
def _tag_to_snake(name: str) -> str:
    """Converts a string to its snake case representation.

    1. Strips surrounding whitespace and converts the string to lower case.
    2. Replaces every run of whitespace with a single underscore.

    Parameters
    ----------
    name: str
        The name to convert.

    Returns
    -------
    str
        The name converted to the snake case representation.
    """
    # ``str.split()`` without arguments splits on whitespace runs and drops
    # empty pieces, so a single word (or an empty string) comes back unchanged.
    return "_".join(name.lower().split())


def _add_missing(x: pd.Series, df: pd.DataFrame) -> pd.DataFrame:
    """Adds the count of missing values of ``x`` to ``df``.

    Empty strings are treated as missing, in addition to null values.

    Parameters
    ----------
    x: pd.Series
        The series to inspect.
    df: pd.DataFrame
        The statistics frame to extend. It is modified in place: a row
        labelled ``"missing"`` is added only when at least one value is
        missing.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object that was passed in.
    """
    # ``np.nan`` rather than ``np.NaN``: the latter alias was removed in NumPy 2.0.
    n_missing = pd.isnull(x.replace("", np.nan)).sum()
    if n_missing > 0:
        df.loc["missing"] = n_missing
    return df


def _count_unique_missing(x: pd.Series) -> pd.DataFrame:
    """Returns the total count, unique count and missing count of a series.

    Empty strings are treated as missing values: they are excluded from the
    unique count and included in the missing count.

    Parameters
    ----------
    x: pd.Series
        The series to summarize.

    Returns
    -------
    pd.DataFrame
        A single-column frame (the column is named after ``x``) with the rows
        ``"count"`` and ``"unique"``, plus ``"missing"`` when any value is
        missing.
    """
    non_missing = x.replace("", np.nan).dropna()
    df_stat = pd.Series(
        {"count": len(x), "unique": len(non_missing.unique())},
        name=x.name,
    ).to_frame()
    return _add_missing(x, df_stat)
def is_boolean(value: Any) -> bool:
    """Checks if value type is boolean.

    A value is considered boolean when it is a ``bool`` instance or when its
    string form, compared case-insensitively, is one of the accepted tokens:
    ``yes``, ``y``, ``true``, ``t``, ``1``, ``no``, ``n``, ``false``, ``f``,
    ``0`` or the empty string.

    Parameters
    ----------
    value: Any
        The value to check.

    Returns
    -------
    bool
        True if value is boolean, False otherwise.
    """
    bool_values = ("yes", "y", "true", "t", "1", "no", "n", "false", "f", "0", "")
    return isinstance(value, bool) or str(value).lower() in bool_values
def assign_issuer(cardnumber):
    """Returns the issuer network for a card number.

    Parameters
    ----------
    cardnumber
        The card number to look up. May be null (``None`` / ``NaN``).

    Returns
    -------
    The string ``"missing"`` when ``cardnumber`` is null according to
    ``pd.isnull``; otherwise whatever
    ``card_identify().identify_issue_network(cardnumber)`` returns.
    """
    if pd.isnull(cardnumber):
        return "missing"
    return card_identify().identify_issue_network(cardnumber)
def random_color_func(
    z,
    word=None,
    font_size=None,
    position=None,
    orientation=None,
    font_path=None,
    random_state=None,
):
    """
    Returns random color function use for color_func in creating WordCloud

    The hue (179) and saturation (23%) are fixed; only the lightness is
    random. It is drawn as ``random_state.randint(60, 120)`` on a 0-255 scale
    and converted to a truncated integer percentage.

    Parameters
    ----------
    z, word, font_size, position, orientation, font_path
        Accepted for compatibility with the ``color_func`` call signature;
        not used by this function.
    random_state
        Object exposing ``randint(low, high)``. When ``None`` a fresh
        ``random.Random()`` is used so the documented default does not fail.

    Returns
    -------
    str
        A color in the form ``"hsl(179, 23%, <lightness>%)"``.
    """
    if random_state is None:
        # Function-scope import: the module does not import ``random`` itself.
        import random

        random_state = random.Random()

    hue = 179
    saturation = 23
    lightness = int(100.0 * float(random_state.randint(60, 120)) / 255.0)
    return "hsl({}, {}%, {}%)".format(hue, saturation, lightness)
def _is_float(s: str) -> bool:
    """
    Checks if given string can convert to float

    Accepts an optional leading minus sign, one or more digits and an
    optional fractional part (``"12"``, ``"-3.5"``). Exponent notation,
    a leading plus sign and surrounding whitespace are not accepted.
    """
    # The fractional group is optional so that integer strings such as "12"
    # are accepted; ``fullmatch`` also rejects a trailing newline.
    return re.fullmatch(r"-?\d+(?:\.\d+)?", s) is not None


def _str_lat_long_to_point(s):
    """
    Converts input data into formated geometry point

    The input is expected to look like ``"lat, long"``, optionally wrapped in
    parentheses. Whitespace around each coordinate is ignored.

    Return formated geometry point string (``"POINT(<long> <lat>)"``) or
    np.nan if input string is not valid
    """
    if not isinstance(s, str):
        return np.nan

    coords = s.split(",")
    if len(coords) != 2:
        return np.nan

    lat, long = coords[0].strip(), coords[1].strip()
    if lat.startswith("("):
        lat = lat[1:].strip()
    if long.endswith(")"):
        long = long[:-1].strip()

    if _is_float(lat) and _is_float(long):
        # The point is written longitude first, then latitude.
        return "POINT(" + long + " " + lat + ")"
    return np.nan


@runtime_dependency(module="geopandas", install_from=OptionalDependency.GEO)
def _plot_gis_scatter(df: pd.DataFrame, lon: str, lat: str):
    """
    Plots the locations in ``df`` on top of the outline of the United States.

    Parameters
    ----------
    df : pd.DataFrame
        dataframe contains gis data
    lon: str
        column name mapping to longitude
    lat: str
        column name mapping to latitude

    Returns
    -------
    matplotlib.axes._subplots.AxesSubplot
        The axes showing the given locations, or ``None`` when ``df`` is empty
        or does not contain both the ``lon`` and ``lat`` columns.
    """
    # ``geopandas`` is not imported by this module; presumably the
    # ``runtime_dependency`` decorator makes it available - TODO confirm.
    if not len(df.index):
        return None
    if lon not in df.columns or lat not in df.columns:
        return None

    fig, ax = plt.subplots(facecolor=SchemeNeutral.BACKGROUND_LIGHT)
    gdf = geopandas.GeoDataFrame(
        df, geometry=geopandas.points_from_xy(df[lon], df[lat])
    )
    # NOTE(review): ``geopandas.datasets`` was deprecated in GeoPandas 0.14 and
    # removed in 1.0, so this lookup only works with older GeoPandas releases.
    world = geopandas.read_file(geopandas.datasets.get_path("naturalearth_lowres"))
    # Only the United States outline is drawn as the base map.
    world = world[world["name"] == "United States of America"]
    ax1 = world.plot(
        ax=ax, color=SchemeNeutral.AREA_LIGHT, linewidth=0.5, edgecolor="white"
    )
    gdf.plot(ax=ax1, color=SchemeTeal.LINE_LIGHT, markersize=10)
    plt.axis("off")
    return ax1


def _to_lat_long(x: pd.Series, zipcode: pd.DataFrame):
    """
    Looks up the latitude and longitude of every known zip code in ``x``.

    Parameters
    ----------
    x: pd.Series
        pandas series contains zip code data
    zipcode : pd.DataFrame
        dataframe contains gis data of zip code; it is indexed by zip code and
        has ``latitude`` and ``longitude`` columns

    Returns
    -------
    pd.DataFrame
        dataframe with 2 columns including latitude and longitude data.
        Values of ``x`` that are not in ``zipcode.index`` are skipped, so the
        result can have fewer rows than ``x``.
    """
    lats = []
    longs = []
    known_codes = zipcode.index
    for code in x:
        if code in known_codes:
            # Fetch the row once and read both coordinates from it.
            row = zipcode.loc[code]
            lats.append(np.float64(row["latitude"]))
            longs.append(np.float64(row["longitude"]))
    return pd.DataFrame(list(zip(lats, longs)), columns=["latitude", "longitude"])


@runtime_dependency(module="seaborn", install_from=OptionalDependency.VIZ)
def _set_seaborn_theme():
    """
    Sets seaborn figure & axes facecolor

    Creates a new matplotlib figure with the light neutral background and
    applies the same color to seaborn's axes and figure facecolor settings.
    """
    # ``seaborn`` is not imported by this module; presumably the
    # ``runtime_dependency`` decorator makes it available - TODO confirm.
    background = SchemeNeutral.BACKGROUND_LIGHT
    plt.figure(facecolor=background)
    seaborn.set(
        rc={
            "axes.facecolor": background,
            "figure.facecolor": background,
        }
    )


def _format_stat(stat: pd.Series) -> None:
    """
    Formats statistics row index.

    Renames the short statistic labels (``std``, ``min``, ``25%``, ``50%``,
    ``75%``, ``max``) to descriptive names. ``stat`` is modified in place and
    nothing is returned.
    """
    stat.rename(
        {
            "std": "standard deviation",
            "min": "sample minimum",
            "25%": "lower quartile",
            "50%": "median",
            "75%": "upper quartile",
            "max": "sample maximum",
        },
        inplace=True,
    )


@lru_cache(maxsize=1)
def _zip_code():
    """Returns dataframe contain zip code, latitude and longitude data.

    The frame is built from ``zip_code_dict`` with the dictionary keys as the
    index and the columns ``longitude`` and ``latitude`` (in that order). The
    result is cached, so every call returns the same DataFrame object; callers
    should not modify it in place.
    """
    return pd.DataFrame.from_dict(
        zip_code_dict, orient="index", columns=["longitude", "latitude"]
    )