# Source code for ads.type_discovery.document_detector

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from __future__ import print_function, absolute_import, division

import re

import pandas as pd

from ads.type_discovery import logger
from ads.type_discovery.abstract_detector import AbstractTypeDiscoveryDetector
from ads.type_discovery.typed_feature import DocumentTypedFeature, AddressTypedFeature

class DocumentDetector(AbstractTypeDiscoveryDetector):
    """Detect whether an object-dtype column holds free-text documents.

    The first non-null value decides which heuristic is used: CJK text is
    judged by its character count alone, any other text is judged by the
    mean whitespace-token count of the longer-than-average rows.  Columns
    with few words and a high comma density are reported as addresses.
    """

    # Minimum length (in characters) of a CJK string to count as a document.
    _min_cjk_chars_for_document = 100
    # The mean token count of the sampled rows must exceed this value.
    _min_words = 10
    # A string must contain more than this many tags to count as HTML.
    _min_html_tags = 5
    # Non-greedy match of anything enclosed in angle brackets.
    _html_pattern = re.compile(r"<.*?>")

    # Inclusive code-point ranges treated as CJK.
    _unicode_ranges = [
        {"from": ord(u"\u3300"), "to": ord(u"\u33ff")},  # compatibility ideographs
        {"from": ord(u"\ufe30"), "to": ord(u"\ufe4f")},  # compatibility ideographs
        {"from": ord(u"\uf900"), "to": ord(u"\ufaff")},  # compatibility ideographs
        {
            "from": ord(u"\U0002F800"),
            "to": ord(u"\U0002fa1f"),
        },  # compatibility ideographs
        {"from": ord(u"\u3040"), "to": ord(u"\u309f")},  # Japanese Hiragana
        {"from": ord(u"\u30a0"), "to": ord(u"\u30ff")},  # Japanese Katakana
        {"from": ord(u"\u2e80"), "to": ord(u"\u2eff")},  # cjk radicals supplement
        {"from": ord(u"\u4e00"), "to": ord(u"\u9fff")},
        {"from": ord(u"\u3400"), "to": ord(u"\u4dbf")},
        {"from": ord(u"\U00020000"), "to": ord(u"\U0002a6df")},
        {"from": ord(u"\U0002a700"), "to": ord(u"\U0002b73f")},
        {"from": ord(u"\U0002b740"), "to": ord(u"\U0002b81f")},
        {
            "from": ord(u"\U0002b820"),
            "to": ord(u"\U0002ceaf"),
        },  # included as of Unicode 8.0
    ]

    def _is_cjk_char(self, char):
        """Return True if ``char`` lies inside any range of ``_unicode_ranges``."""
        code_point = ord(char)
        return any(
            bounds["from"] <= code_point <= bounds["to"]
            for bounds in DocumentDetector._unicode_ranges
        )

    def cjk_string(self, document):
        """Return True if at least 20% of the characters of ``document`` are CJK.

        An empty string is reported as not CJK instead of dividing by zero.
        """
        if not document:
            return False
        cjk_char_count = sum(1 for c in document if self._is_cjk_char(c))
        return cjk_char_count / len(document) >= 0.2

    def html_document(self, document):
        """Return True if ``document`` has more than ``_min_html_tags`` tag-like matches."""
        tag_count = sum(
            1 for _ in re.finditer(DocumentDetector._html_pattern, document)
        )
        return tag_count > DocumentDetector._min_html_tags

    def discover(self, name, series):
        """Classify ``series`` as a document column, an address column, or neither.

        Very basic detection of a document.  If the first non-null value is
        CJK then only its length is used; otherwise the rows are split on
        whitespace to confirm that there are word-like strings.

        Parameters
        ----------
        name
            Column name, passed through to the typed feature and the log output.
        series : pandas.Series
            Column to inspect.  Only ``object`` dtype columns are considered.

        Returns
        -------
        The value built by ``DocumentTypedFeature.build`` or
        ``AddressTypedFeature.build`` when the column matches, ``False``
        otherwise.
        """
        if series.dtype != "object":
            return False

        null_series = series.loc[~series.isnull()]
        if null_series.empty:
            # Nothing but nulls: there is no first document to inspect.
            return False

        first_non_null_document = null_series.iloc[0]
        if not isinstance(first_non_null_document, str):
            return False

        if self.cjk_string(first_non_null_document):
            if (
                len(first_non_null_document)
                < DocumentDetector._min_cjk_chars_for_document
            ):
                return False
            tf = DocumentTypedFeature.build(
                name,
                series,
                is_cjk=True,
                is_html=self.html_document(first_non_null_document),
            )
            logger.debug(
                "type discovery on CJK column [{}]/[{}] found to be a document".format(
                    name, series.dtype
                )
            )
            return tf

        # find rows with above average length
        lengths = null_series.str.len()
        above_avg_series = null_series.loc[lengths >= lengths.mean()]

        # take a sample. max 500 docs
        above_avg_series_sample = above_avg_series.sample(
            n=min(500, len(above_avg_series))
        )

        # the sampled rows must average more than _min_words tokens
        mean_number_of_words = above_avg_series_sample.str.split().str.len().mean()
        if not mean_number_of_words > DocumentDetector._min_words:
            return False

        if (
            mean_number_of_words < 15
            and above_avg_series_sample.str.count(",").mean() / mean_number_of_words
            > 0.1
        ):
            # many commas probably means address type
            logger.debug(
                "type discovery on column [{}]/[{}] looks like an address type".format(
                    name, series.dtype
                )
            )
            return AddressTypedFeature.build(name, series)

        logger.debug(
            "type discovery on non-CJK column [{}]/[{}] found to be a document".format(
                name, series.dtype
            )
        )
        # the HTML check is made on the longer documents, all of which must pass
        is_html = all(self.html_document(doc) for doc in above_avg_series_sample)
        return DocumentTypedFeature.build(
            name, series, is_cjk=False, is_html=is_html
        )
if __name__ == "__main__":
    # Running the module as a script only constructs a detector.
    dd = DocumentDetector()