Source code for ads.feature_engineering.feature_type.gis

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The module that represents a GIS feature type.

Classes:
    GIS
        The GIS feature type.
"""
import matplotlib.pyplot as plt
import pandas as pd
import re
from ads.feature_engineering.feature_type.base import FeatureType
from ads.feature_engineering.utils import (
    _count_unique_missing,
    _str_lat_long_to_point,
    SchemeNeutral,
    SchemeTeal,
)
from ads.feature_engineering import schema
from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)

# Matches a "<latitude>,<longitude>" pair such as "-18.2193965, -93.587285".
# Each number may carry a leading minus sign and must contain a decimal point.
# The opening and closing parentheses are each optional. No range check is
# performed on the numeric values.
PATTERN = re.compile(r"^[(]?(\-?\d+\.\d+?),\s*(\-?\d+\.\d+?)[)]?$", re.VERBOSE)


def _is_lat_long(value) -> bool:
    """Returns True if ``value`` is not null and its string form matches ``PATTERN``."""
    if pd.isnull(value):
        return False
    return PATTERN.match(str(value)) is not None


def default_handler(data: pd.Series, *args, **kwargs) -> pd.Series:
    """Processes given data and indicates if the data matches requirements.

    Parameters
    ----------
    data: :class:`pandas.Series`
        The data to process.
    *args
        Accepted for interface compatibility; not used.
    **kwargs
        Accepted for interface compatibility; not used.

    Returns
    -------
    :class:`pandas.Series`
        The logical list indicating if the data matches requirements.
        The result always has a boolean dtype, including for empty input.
    """
    # ``Series.apply`` on an empty Series keeps the input dtype (e.g. object),
    # so cast explicitly to guarantee a boolean mask for every input.
    return data.apply(_is_lat_long).astype(bool)
class GIS(FeatureType):
    """
    Type representing geographic information.

    Attributes
    ----------
    description: str
        The feature type description.
    name: str
        The feature type name.
    warning: FeatureWarning
        Provides functionality to register warnings and invoke them.
    validator
        Provides functionality to register validators and invoke them.

    Methods
    -------
    feature_stat(x: pd.Series) -> pd.DataFrame
        Generates feature statistics.
    feature_plot(x: pd.Series) -> plt.Axes
        Shows the location of given address on map based on longitude and latitude.
    feature_domain(x: pd.Series) -> schema.Domain
        Generates the domain of the data of this feature type.

    Example
    -------
    >>> from ads.feature_engineering.feature_type.gis import GIS
    >>> import pandas as pd
    >>> s = pd.Series(["-18.2193965, -93.587285",
    ...                "-21.0255305, -122.478584",
    ...                "85.103913, 19.405744",
    ...                "82.913736, 178.225672",
    ...                "62.9795085,-66.989705",
    ...                "54.5604395,95.235090",
    ...                "24.2811855,-162.380403",
    ...                "-1.818319,-80.681214",
    ...                None,
    ...                "(51.816119, 175.979008)",
    ...                "(54.3392995,-11.801615)"],
    ...               name='gis')
    >>> s.ads.feature_type = ['gis']
    >>> GIS.validator.is_gis(s)
    0      True
    1      True
    2      True
    3      True
    4      True
    5      True
    6      True
    7      True
    8     False
    9      True
    10     True
    Name: gis, dtype: bool
    """

    description = "Type representing geographic information."

    @staticmethod
    def feature_stat(x: pd.Series) -> pd.DataFrame:
        """Generates feature statistics.

        Feature statistics include (total)count, unique(count) and missing(count).

        Parameters
        ----------
        x: :class:`pandas.Series`
            The series to summarize.

        Examples
        --------
        >>> gis = pd.Series([
        ...     "69.196241,-125.017615",
        ...     "5.2272595,-143.465712",
        ...     "-33.9855425,-153.445155",
        ...     "43.340610,86.460554",
        ...     "24.2811855,-162.380403",
        ...     "2.7849025,-7.328156",
        ...     "45.033805,157.490179",
        ...     "-1.818319,-80.681214",
        ...     "-44.510428,-169.269477",
        ...     "-56.3344375,-166.407038",
        ...     "", np.nan, None
        ...   ],
        ...   name='gis'
        ... )
        >>> gis.ads.feature_type = ['gis']
        >>> gis.ads.feature_stat()
                gis
        count   13
        unique  10
        missing 3

        Returns
        -------
        :class:`pandas.DataFrame`
            Summary statistics of the Series provided.
        """
        return _count_unique_missing(x)

    @staticmethod
    @runtime_dependency(module="geopandas", install_from=OptionalDependency.GEO)
    def feature_plot(x: pd.Series) -> plt.Axes:
        """
        Shows the location of given address on map based on longitude and latitude.

        Parameters
        ----------
        x: :class:`pandas.Series`
            The series holding "<latitude>,<longitude>" strings to plot.

        Examples
        --------
        >>> gis = pd.Series([
        ...     "69.196241,-125.017615",
        ...     "5.2272595,-143.465712",
        ...     "-33.9855425,-153.445155",
        ...     "43.340610,86.460554",
        ...     "24.2811855,-162.380403",
        ...     "2.7849025,-7.328156",
        ...     "45.033805,157.490179",
        ...     "-1.818319,-80.681214",
        ...     "-44.510428,-169.269477",
        ...     "-56.3344375,-166.407038",
        ...     "", np.nan, None
        ...   ],
        ...   name='gis'
        ... )
        >>> gis.ads.feature_type = ['gis']
        >>> gis.ads.feature_plot()

        Returns
        -------
        matplotlib.axes._subplots.AxesSubplot
            Plot object for the series based on the GIS feature type, or ``None``
            when the series contains no valid coordinates.
        """
        # Mask the series directly rather than through a temporary "validation"
        # column: a series that is itself named "validation" would otherwise have
        # its data overwritten by the mask. ``.eq(True)`` guarantees a boolean mask
        # even when the handler returns an object-dtype (e.g. empty) result.
        is_valid = default_handler(x).eq(True)
        points = (
            x[is_valid]
            .apply(_str_lat_long_to_point)
            .to_frame(name="Coordinates")
            .dropna()
        )

        # Nothing to draw: make the "no plot" outcome explicit.
        if points.empty:
            return None

        points["Coordinates"] = geopandas.GeoSeries.from_wkt(points["Coordinates"])
        gdf = geopandas.GeoDataFrame(points, geometry="Coordinates")
        _, ax = plt.subplots(facecolor=SchemeNeutral.BACKGROUND_LIGHT)
        # NOTE(review): ``geopandas.datasets`` was deprecated in GeoPandas 0.14 and
        # removed in 1.0 -- confirm the pinned GeoPandas version still ships it.
        world = geopandas.read_file(
            geopandas.datasets.get_path("naturalearth_lowres")
        )
        # Draw the world outline first, then the points on top of it.
        world.plot(
            ax=ax, color=SchemeNeutral.AREA_LIGHT, linewidth=0.5, edgecolor="white"
        )
        gdf.plot(ax=ax, color=SchemeTeal.LINE_LIGHT, markersize=10)
        plt.axis("off")
        return ax

    @classmethod
    def feature_domain(cls, x: pd.Series) -> schema.Domain:
        """
        Generate the domain of the data of this feature type.

        Parameters
        ----------
        x: :class:`pandas.Series`
            The series to describe.

        Examples
        --------
        >>> gis = pd.Series([
        ...     "69.196241,-125.017615",
        ...     "5.2272595,-143.465712",
        ...     "-33.9855425,-153.445155",
        ...     "43.340610,86.460554",
        ...     "24.2811855,-162.380403",
        ...     "2.7849025,-7.328156",
        ...     "45.033805,157.490179",
        ...     "-1.818319,-80.681214",
        ...     "-44.510428,-169.269477",
        ...     "-56.3344375,-166.407038",
        ...     "", np.nan, None
        ...   ],
        ...   name='gis'
        ... )
        >>> gis.ads.feature_type = ['gis']
        >>> gis.ads.feature_domain()
        constraints: []
        stats:
            count: 13
            missing: 3
            unique: 10
        values: GIS

        Returns
        -------
        ads.feature_engineering.schema.Domain
            Domain based on the GIS feature type.
        """
        # The statistics frame is keyed by the series name; the domain carries the
        # class name as its value description and no constraints.
        return schema.Domain(
            cls.__name__,
            cls.feature_stat(x).to_dict()[x.name],
            [],
        )
# Register ``default_handler`` under the name "is_gis" so it can be invoked as
# ``GIS.validator.is_gis(series)``.
GIS.validator.register("is_gis", default_handler)