ads.feature_engineering.adsstring package

Subpackages

Submodules

ads.feature_engineering.adsstring.common_regex_mixin module

class ads.feature_engineering.adsstring.common_regex_mixin.CommonRegex(text='')

Bases: object

class regex(obj, regex)

Bases: object

regexes = {'address': re.compile('\\d{1,5} [\\w\\s]{1,30}(?:street|st|crescent|avenue|ave|road|rd|highway|hwy|square|sq|trail|trl|drive|dr|court|ct|park|parkway|pkwy|circle|cir|boulevard|blvd)\\W?(?=\\s|$)', re.IGNORECASE), 'address_with_zip': re.compile('\\d{1,5} [\\w\\s]{1,30}(?:street|st(?:\\s|\\.)+|avenue|ave(?:\\s|\\.)+|road|rd(?:\\s|\\.)+|highway|hwy(?:\\s|\\.)+|square|sq(?:\\s|\\.)+|trail|trl(?:\\s|\\.)+|drive|dr(?:\\s|\\.)+|court|ct(?:\\s|\\.), re.IGNORECASE), 'credit_card': re.compile('((?:(?:\\d{4}[- ]?){3}\\d{4}|\\d{15,16}))(?![\\d])'), 'date': re.compile('(?:(?<!\\:)(?<!\\:\\d)[0-3]?\\d(?:st|nd|rd|th)?\\s+(?:of\\s+)?(?:jan\\.?|january|feb\\.?|february|mar\\.?|march|apr\\.?|april|may|jun\\.?|june|jul\\.?|july|aug\\.?|august|sep\\.?|september|oct\\.?|oc, re.IGNORECASE), 'email': re.compile("([a-z0-9!#$%&\\'*+\\/=?^_`{|.}~-]+@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?)", re.IGNORECASE), 'ip': re.compile('(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', re.IGNORECASE), 'ipv6': re.compile('\\s*(?!.*::.*::)(?:(?!:)|:(?=:))(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)){6}(?:[0-9a-f]{0,4}(?:(?<=::)|(?<!::):)[0-9a-f]{0,4}(?:(?<=::)|(?<!:)|(?<=:)(?<!::):)|(?:25[0-4]|2[0-4]\\d|1\\d\\d|[1-9]?\\d)(?:\\, re.IGNORECASE|re.DOTALL|re.VERBOSE), 'link': re.compile('(?i)((?:https?://|www\\d{0,3}[.])?[a-z0-9.\\-]+[.](?:(?:international)|(?:construction)|(?:contractors)|(?:enterprises)|(?:photography)|(?:immobilien)|(?:management)|(?:technology)|(?:directory)|(?:e, re.IGNORECASE), 'phone_number_US': re.compile('((?:(?<![\\d-])(?:\\+?\\d{1,3}[-.\\s*]?)?(?:\\(?\\d{3}\\)?[-.\\s*]?)?\\d{3}[-.\\s*]?\\d{4}(?![\\d-]))|(?:(?<![\\d-])(?:(?:\\(\\+?\\d{2}\\))|(?:\\+?\\d{2}))\\s*\\d{2}\\s*\\d{3}\\s*\\d{4}(?![\\d-])))'), 'phone_number_US_with_ext': 
re.compile('((?:(?:\\+?1\\s*(?:[.-]\\s*)?)?(?:\\(\\s*(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9])\\s*\\)|(?:[2-9]1[02-9]|[2-9][02-8]1|[2-9][02-8][02-9]))\\s*(?:[.-]\\s*)?)?(?:[2-9]1[02-9]|[2-9][02-9]1|[2-9][0, re.IGNORECASE), 'po_box': re.compile('P\\.? ?O\\.? Box \\d+', re.IGNORECASE), 'price': re.compile('[$]\\s?[+-]?[0-9]{1,3}(?:(?:,?[0-9]{3}))*(?:\\.[0-9]{1,2})?'), 'ssn': re.compile('(?!666|000|9\\d{2})\\d{3}[- ](?!00)\\d{2}[- ](?!0{4})\\d{4}'), 'time': re.compile('\\d{1,2}:\\d{2} ?(?:[ap]\\.?m\\.?)?|\\d[ap]\\.?m\\.?', re.IGNORECASE), 'zip_code': re.compile('\\b\\d{5}(?:[-\\s]\\d{4})?\\b')}
class ads.feature_engineering.adsstring.common_regex_mixin.CommonRegexMixin

Bases: object

property address
property credit_card
property date
property email
property ip
property phone_number_US
property price
redact(fields: Union[List[str], Dict[str, str]]) → str

Remove personal information from a string. For example, “Jane’s phone number is 123-456-7890” is turned into “Jane’s phone number is [PHONE_NUMBER_US].”

Parameters:

fields (list(str) | dict) – either a list of fields to redact, e.g. [‘email’, ‘phone_number_US’], in which case each match is replaced with the capitalized field name in square brackets, e.g. [EMAIL] or [PHONE_NUMBER_US], or a dictionary where each key is a field to redact and its value is the replacement text, e.g. {‘email’: ‘HIDDEN_EMAIL’}.

Returns:

redacted string

Return type:

str

redact_map = {'address': '[ADDRESS]', 'address_with_zip': '[ADDRESS_WITH_ZIP]', 'credit_card': '[CREDIT_CARD]', 'date': '[DATE]', 'email': '[EMAIL]', 'ip': '[IP]', 'ipv6': '[IPV6]', 'link': '[LINK]', 'phone_number_US': '[PHONE_NUMBER_US]', 'phone_number_US_with_ext': '[PHONE_NUMBER_US_WITH_EXT]', 'po_box': '[PO_BOX]', 'price': '[PRICE]', 'ssn': '[SSN]', 'time': '[TIME]', 'zip_code': '[ZIP_CODE]'}
property ssn
property time
property zip_code

ads.feature_engineering.adsstring.oci_language module

ads.feature_engineering.adsstring.string module

Module contents