Source code for ads.feature_engineering.feature_type.adsstring.parsers.spacy_parser

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import functools
import itertools
import os
from collections import Counter
from typing import List, Sequence, Tuple

import pandas as pd

from ads.feature_engineering.feature_type.adsstring.parsers.base import Parser
from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)


# This module is only used when the user has installed spacy. Note that spacy
# has dependency conflicts with oci-cli.
class SpacyParser(Parser):  # pragma: no cover
    @runtime_dependency(module="spacy", install_from=OptionalDependency.TEXT)
    def __init__(self):
        # The ``runtime_dependency`` decorator imports ``spacy`` at call time
        # and makes it available inside this method.
        try:
            # For ADS conda packs, the spacy model files are located under
            # $CONDA_PREFIX/spacy.
            self.nlp = spacy.load(f"{os.environ['CONDA_PREFIX']}/spacy")
        except OSError:
            try:
                # If not inside a conda pack, try loading the model first;
                # if that fails, download it to the default path.
                self.nlp = spacy.load("en_core_web_sm")
            except OSError:
                from spacy.cli import download

                download("en_core_web_sm")
                self.nlp = spacy.load("en_core_web_sm")

    @property
    @functools.lru_cache()
    def _parsed_nlp_obj(self):
        return self.nlp(self.string)

    @property
    @functools.lru_cache()
    def pos(self) -> pd.DataFrame:
        pos = (
            (token.text, token.pos_)
            for token in self._parsed_nlp_obj
            if not token.is_space
        )
        return pd.DataFrame(data=pos, columns=["Word", "Label"])

    def _pos_tokens(self, pos: str) -> List[str]:
        return _f7(
            [
                token.text
                for token in self._parsed_nlp_obj
                if (
                    not token.is_stop
                    and not token.is_punct
                    and not token.is_space
                    and token.pos_ == pos
                )
            ]
        )

    @property
    @functools.lru_cache()
    def noun(self) -> List[str]:
        return self._pos_tokens("NOUN")

    @property
    @functools.lru_cache()
    def adjective(self) -> List[str]:
        return self._pos_tokens("ADJ")

    @property
    @functools.lru_cache()
    def adverb(self) -> List[str]:
        return self._pos_tokens("ADV")

    @property
    @functools.lru_cache()
    def verb(self) -> List[str]:
        return self._pos_tokens("VERB")

    @property
    @functools.lru_cache()
    def noun_phrase(self) -> List[str]:
        np = []
        for sentence in self.sentence:
            for chunk in self.nlp(sentence).noun_chunks:
                if chunk.text.strip():
                    np.append(chunk.text.strip())
        return np

    @property
    @functools.lru_cache()
    def entity_extract(self) -> pd.DataFrame:
        return pd.DataFrame(data=self._entity_extract(), columns=["Entity", "Label"])

    @functools.lru_cache()
    def _entity_extract(self) -> List[Tuple[str, str]]:
        entities = []
        for sentence in self.sentence:
            entities.extend(
                [(ent.text, ent.label_) for ent in self.nlp(sentence).ents]
            )
        return _f7(entities)

    def _entity_tokens(self, entity_label: str) -> List[str]:
        return [
            token for (token, label) in self._entity_extract() if label == entity_label
        ]

    @property
    @functools.lru_cache()
    def entity_people(self) -> List[str]:
        # People, including fictional.
        return self._entity_tokens("PERSON")

    @property
    @functools.lru_cache()
    def entity_location(self) -> List[str]:
        # Locations: countries, cities, states, mountain ranges, bodies of
        # water, buildings, airports, highways, bridges, etc.
        return (
            self._entity_tokens("GPE")
            + self._entity_tokens("LOC")
            + self._entity_tokens("FAC")
        )

    @property
    @functools.lru_cache()
    def entity_organization(self) -> List[str]:
        # Companies, agencies, institutions.
        return self._entity_tokens("ORG")

    @property
    @functools.lru_cache()
    def entity_artwork(self) -> List[str]:
        # Titles of books, songs, etc.
        return self._entity_tokens("WORK_OF_ART")

    @property
    @functools.lru_cache()
    def entity_product(self) -> List[str]:
        # Product names, etc.
        return self._entity_tokens("PRODUCT")

    @property
    @functools.lru_cache()
    def sentence(self) -> List[str]:
        return [sent.text.strip() for sent in self._parsed_nlp_obj.sents]

    @property
    @functools.lru_cache()
    def token(self) -> List[str]:
        return [token.text for token in self._parsed_nlp_obj]

    def _words_by_sentence(self) -> List[List[str]]:
        x = []
        for sentence in self.sentence:
            x.append([token.text for token in self.nlp(sentence) if token.is_alpha])
        return x

    @property
    @functools.lru_cache()
    def word(self) -> List[str]:
        return list(itertools.chain(*self._words_by_sentence()))

    def _ngrams(self, tokens: List[str], n: int) -> List[List[str]]:
        return [tokens[i : i + n] for i in range(len(tokens) - n + 1)]

    @property
    @functools.lru_cache()
    def bigram(self) -> pd.DataFrame:
        bigram = list(
            itertools.chain(
                *[
                    self._ngrams(sentence_words, 2)
                    for sentence_words in self._words_by_sentence()
                ]
            )
        )
        return pd.DataFrame(data=bigram, columns=["Word 1", "Word 2"])

    @property
    @functools.lru_cache()
    def trigram(self) -> pd.DataFrame:
        trigram = list(
            itertools.chain(
                *[
                    self._ngrams(sentence_words, 3)
                    for sentence_words in self._words_by_sentence()
                ]
            )
        )
        return pd.DataFrame(data=trigram, columns=["Word 1", "Word 2", "Word 3"])

    @property
    @functools.lru_cache()
    def lemma(self) -> List[str]:
        return [token.lemma_ for token in self._parsed_nlp_obj]

    @property
    @functools.lru_cache()
    def word_count(self) -> pd.DataFrame:
        word_count = Counter([word.lower() for word in self.word]).most_common()
        df = pd.DataFrame(data=word_count, columns=["Word", "Count"])
        return df.sort_values(
            ["Count", "Word"], ascending=[False, True], ignore_index=True
        )
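
# Helper used by ``_pos_tokens`` and ``_entity_extract`` above to drop
# duplicates while keeping first-occurrence order (the classic "f7"
# order-preserving de-dup recipe, hence the name):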
def _f7(seq: Sequence) -> List:
    """Order-preserving de-duplication of a sequence."""
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]
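
# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). SpacyParser is
# a mixin: ``_parsed_nlp_obj`` reads ``self.string``, so the host class must
# provide that attribute. In ADS this parser backs ADSString's spacy NLP
# backend; the ``_Demo`` class below is hypothetical and only shows the
# expected interface.
#
#     class _Demo(SpacyParser):
#         def __init__(self, text: str):
#             super().__init__()  # loads the spacy model
#             self.string = text
#
#     doc = _Demo("Barack Obama visited Paris last week.")
#     doc.sentence       # list of sentence strings
#     doc.noun           # de-duplicated nouns, first-occurrence order (_f7)
#     doc.entity_people  # e.g. ["Barack Obama"], depending on the model
#     doc.bigram         # DataFrame with columns ["Word 1", "Word 2"]
# ---------------------------------------------------------------------------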