Source code for ads.type_discovery.datetime_detector

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/


from __future__ import print_function, absolute_import, division

import pandas as pd

from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)
from ads.type_discovery import logger
from ads.type_discovery.abstract_detector import AbstractTypeDiscoveryDetector
from ads.type_discovery.typed_feature import DateTimeTypedFeature


class DateTimeDetector(AbstractTypeDiscoveryDetector):
    """Detect columns that hold date/time values.

    Three kinds of column are recognised:

    * columns whose dtype name already starts with ``datetime64``;
    * integer columns whose label contains the word ``timestamp``, read as
      epoch seconds or epoch nanoseconds;
    * ``object`` columns whose sampled values all parse as dates.
    """

    @runtime_dependency(module="datefinder", install_from=OptionalDependency.DATA)
    def _is_date_time(self, name, values, low_level_type_name):
        """Return a converter for a datetime column, or ``None`` otherwise.

        Parameters
        ----------
        name
            Column label. It is converted with ``str()`` before the
            case-insensitive search for ``"timestamp"``, so non-string
            labels are accepted.
        values : pandas.Series
            Sample of the column used to probe whether conversion works.
        low_level_type_name : str
            Name of the column's dtype (for example ``"int64"``).

        Returns
        -------
        callable or None
            A one-argument function that converts a series to datetimes,
            or ``None`` when the sample does not look like date/time data.
        """
        # Already a datetime column: nothing to convert.
        if low_level_type_name.startswith("datetime64"):
            return lambda x: x

        # Integer column labelled as a timestamp: treat the values as epoch
        # offsets. `str(name)` keeps integer column labels from raising
        # AttributeError on `.lower()`.
        if low_level_type_name.startswith("int") and "timestamp" in str(name).lower():
            # Longest rendering of at most 10 characters -> seconds,
            # anything longer -> nanoseconds.
            unit = "s" if values.astype("str").str.len().max() <= 10 else "ns"
            try:
                pd.to_datetime(values, unit=unit)
            except Exception:
                # Best-effort probe: a failed conversion only means this
                # rule does not apply. `Exception` (rather than a bare
                # `except`) lets KeyboardInterrupt/SystemExit propagate.
                pass
            else:
                return lambda x: pd.to_datetime(x, unit=unit)

        if values.dtype == "object":
            # NOTE(review): `datefinder` is not imported in this file;
            # presumably the `runtime_dependency` decorator makes it
            # available at call time - confirm.
            # NOTE(review): an empty sample appears to pass this check
            # vacuously (`all()` of nothing is True) - confirm intended.
            try:
                # The whole sample must be parseable by pandas ...
                pd.to_datetime(values, infer_datetime_format=True)
                # ... and every single value must contain at least one date
                # according to datefinder. The generator lets `all` stop at
                # the first value that has none.
                every_value_has_date = all(
                    bool(list(datefinder.find_dates(str(x)))) for x in values
                )
                if every_value_has_date:
                    return lambda x: pd.to_datetime(x, infer_datetime_format=True)
            except Exception:
                # Same best-effort contract as above.
                pass

        return None

    def discover(self, name, series):
        """Build a datetime typed feature for ``series`` if it qualifies.

        Only the first 500 non-null values are probed; the resulting
        converter is then applied to the complete series.

        Parameters
        ----------
        name
            Column label.
        series : pandas.Series
            Column to inspect.

        Returns
        -------
        object or bool
            The result of ``DateTimeTypedFeature.build(name, converted)``
            when the column is detected as date/time, otherwise ``False``.
        """
        candidates = series.loc[~series.isnull()]
        fn = self._is_date_time(name, candidates.head(500), series.dtype.name)
        if not fn:
            return False

        logger.debug("column [{}]/[{}] datetime".format(name, series.dtype))
        return DateTimeTypedFeature.build(name, fn(series))
if __name__ == "__main__":
    # Ad-hoc demo: run the detector over a handful of sample columns and
    # print whatever `discover` returns for each of them, in order.
    detector = DateTimeDetector()

    epoch_seconds = [
        978300760,
        978302109,
        978301968,
        978300275,
        978824291,
        978302268,
        978302039,
    ]
    hourly_range = pd.date_range(start="1/1/2018", end="1/08/2018", freq="H")

    samples = [
        ("date-range", pd.Series(hourly_range)),
        ("dates", pd.Series(["12/12/12", "12/12/13", None, "12/12/14"])),
        (
            "dates-with-other-values",
            pd.Series(["12/12/12", "Monday", None, "12/12/14"]),
        ),
        ("timestamp s", pd.Series(epoch_seconds)),
        ("timestamp ns", pd.Series([1490195805433502912])),
    ]

    for label, column in samples:
        print(detector.discover(label, column))