# Source code for ads.feature_engineering.feature_type.ip_address

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2021 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The module that represents an IpAddress feature type.

Classes:
    IpAddress
        The IpAddress feature type.
"""
import pandas as pd
import re
from ads.feature_engineering.feature_type.base import FeatureType
from ads.feature_engineering.utils import _count_unique_missing
from ads.feature_engineering import schema

# Dotted-quad IPv4 address: four decimal octets (0-255) separated by dots.
# The pattern carries no anchors of its own, so whole-string validation must
# go through ``fullmatch`` rather than ``match``/``search``.
PATTERNV4 = re.compile(
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
    re.IGNORECASE,
)

# IPv6 address, optionally surrounded by whitespace. Supports a single "::"
# compression (the leading negative lookahead rejects a second one) and a
# trailing embedded dotted-quad IPv4 part (e.g. "::ffff:192.168.0.1").
# The embedded octets accept 0-255; the upper bound was previously written as
# ``25[0-4]``, which wrongly rejected 255.
# Like PATTERNV4 this pattern is unanchored and must be used with ``fullmatch``.
PATTERNV6 = re.compile(
    r"\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)(?:\.(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]?\d)){3})\s*",
    re.VERBOSE | re.IGNORECASE | re.DOTALL,
)


def default_handler(data: pd.Series, *args, **kwargs) -> pd.Series:
    """Processes given data and indicates if the data matches requirements.

    A value is considered an IP address when its string form matches, in its
    entirety, either the IPv4 pattern (``PATTERNV4``) or the IPv6 pattern
    (``PATTERNV6``). Missing values (``None``/``NaN``) are never valid.

    The whole string has to match: values such as ``"1.2.3.999"`` or
    ``"192.168.0.1abc"``, which merely *start* with a valid address, are
    rejected. As a consequence an IPv4 value followed by whitespace is
    rejected too; the IPv6 pattern itself tolerates surrounding whitespace.

    Parameters
    ----------
    data: :class:`pandas.Series`
        The data to process.
    *args
        Ignored; accepted for compatibility with the validator call signature.
    **kwargs
        Ignored; accepted for compatibility with the validator call signature.

    Returns
    -------
    :class:`pandas.Series`
        The logical list indicating if the data matches requirements.
    """

    def _is_ip_address(value) -> bool:
        # Missing values cannot be addresses; bail out before str() turns
        # them into "nan"/"None".
        if pd.isnull(value):
            return False
        text = str(value)
        # fullmatch (not match) so that trailing garbage invalidates the value.
        return (
            PATTERNV4.fullmatch(text) is not None
            or PATTERNV6.fullmatch(text) is not None
        )

    return data.apply(_is_ip_address)
class IpAddress(FeatureType):
    """
    Type representing IP Address.

    Attributes
    ----------
    description: str
        The feature type description.
    name: str
        The feature type name.
    warning: FeatureWarning
        Provides functionality to register warnings and invoke them.
    validator
        Provides functionality to register validators and invoke them.

    Methods
    --------
    feature_stat(x: pd.Series) -> pd.DataFrame
        Generates feature statistics.
    feature_domain(x: pd.Series) -> schema.Domain
        Generates the domain of the data of this feature type.

    Example
    -------
    >>> from ads.feature_engineering.feature_type.ip_address import IpAddress
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(['192.168.0.1', '2001:db8::', '', np.nan, None], name='ip_address')
    >>> s.ads.feature_type = ['ip_address']
    >>> IpAddress.validator.is_ip_address(s)
    0     True
    1     True
    2    False
    3    False
    4    False
    Name: ip_address, dtype: bool
    """

    description = "Type representing IP Address."

    @staticmethod
    def feature_stat(x: pd.Series) -> pd.DataFrame:
        """Generates feature statistics.

        Feature statistics include (total)count, unique(count) and
        missing(count). The computation is delegated to
        ``_count_unique_missing``.

        Parameters
        ----------
        x: :class:`pandas.Series`
            The series to summarize.

        Examples
        --------
        >>> s = pd.Series(['2002:db8::', '192.168.0.1', '2001:db8::', '2002:db8::', np.nan, None], name='ip_address')
        >>> s.ads.feature_type = ['ip_address']
        >>> s.ads.feature_stat()
            Metric  Value
        0   count   6
        1   unique  3
        2   missing 2

        Returns
        -------
        :class:`pandas.DataFrame`
            Summary statistics of the Series provided.
        """
        summary = _count_unique_missing(x)
        return summary

    @classmethod
    def feature_domain(cls, x: pd.Series) -> schema.Domain:
        """
        Generate the domain of the data of this feature type.

        The domain is built from the class name, the statistics produced by
        :meth:`feature_stat` for ``x`` (looked up under the series name), and
        an empty list of constraints.

        Parameters
        ----------
        x: :class:`pandas.Series`
            The series whose domain is generated.

        Examples
        --------
        >>> s = pd.Series(['2002:db8::', '192.168.0.1', '2001:db8::', '2002:db8::', np.nan, None], name='ip_address')
        >>> s.ads.feature_type = ['ip_address']
        >>> s.ads.feature_domain()
        constraints: []
        stats:
            count: 6
            missing: 2
            unique: 3
        values: IpAddress

        Returns
        -------
        ads.feature_engineering.schema.Domain
            Domain based on the IpAddress feature type.
        """
        # The statistics frame is keyed by the series name once converted
        # to a dict.
        stats_by_column = cls.feature_stat(x).to_dict()
        series_stats = stats_by_column[x.name]
        no_constraints = []
        return schema.Domain(cls.__name__, series_stats, no_constraints)
# Register ``default_handler`` under the name "is_ip_address" so that it can be
# invoked as ``IpAddress.validator.is_ip_address(series)``.
IpAddress.validator.register("is_ip_address", default_handler)