# Source code for ads.feature_engineering.feature_type.adsstring.parsers.spacy_parser

#!/usr/bin/env python
# -*- coding: utf-8 -*--

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import itertools
from collections import Counter
from typing import List, Sequence, Tuple
import os
import functools
import pandas as pd

from ads.feature_engineering.feature_type.adsstring.parsers.base import Parser
from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)


# This module is only used when the user has installed spaCy, because spaCy conflicts with oci-cli.

# [docs]
class SpacyParser(Parser):  # pragma: no cover
    """Parser backend that derives linguistic features with a spaCy pipeline.

    The text to analyse is read from ``self.string``; that attribute is not
    set in this class and must be provided by the object this parser is
    combined with.

    Most features are exposed as properties wrapped in
    ``functools.lru_cache``, so each one is computed once per cache key.
    The cache is keyed on ``self``, which means instances must be hashable.

    NOTE(review): ``spacy`` is never imported at module level here;
    presumably the ``runtime_dependency`` decorator on ``__init__`` makes the
    name available -- confirm against that decorator.
    """

    @runtime_dependency(module="spacy", install_from=OptionalDependency.TEXT)
    def __init__(self):
        """Load the spaCy pipeline into ``self.nlp``.

        Lookup order:

        1. ``$CONDA_PREFIX/spacy`` (where ADS conda packs keep the model),
        2. the installed ``en_core_web_sm`` package,
        3. ``en_core_web_sm`` again, after downloading it.
        """
        try:
            # for ADS conda packs, spacy files are located under $CONDA_PREFIX/spacy
            self.nlp = spacy.load(f"{os.environ['CONDA_PREFIX']}/spacy")
        except (KeyError, OSError):
            # KeyError: CONDA_PREFIX is not set (not running inside conda).
            # OSError: the directory does not hold a loadable pipeline.
            try:
                # if not inside a conda pack, try loading first;
                # if that fails, download to the default path
                self.nlp = spacy.load("en_core_web_sm")
            except Exception:
                # Deliberately broad (best effort), but unlike a bare
                # ``except`` it lets KeyboardInterrupt/SystemExit through.
                from spacy.cli import download

                download("en_core_web_sm")
                self.nlp = spacy.load("en_core_web_sm")

    @property
    @functools.lru_cache()
    def _parsed_nlp_obj(self):
        """Result of running the pipeline on the full ``self.string``."""
        return self.nlp(self.string)

    @property
    @functools.lru_cache()
    def pos(self) -> pd.DataFrame:
        """Part-of-speech table with columns ``Word`` and ``Label``.

        One row per token of the parsed text, whitespace tokens excluded.
        """
        rows = [
            (token.text, token.pos_)
            for token in self._parsed_nlp_obj
            if not token.is_space
        ]
        return pd.DataFrame(data=rows, columns=["Word", "Label"])

    def _pos_tokens(self, pos: str) -> List[str]:
        """Return the unique token texts whose ``pos_`` equals ``pos``.

        Stop words, punctuation and whitespace tokens are skipped; the order
        of first appearance is preserved.
        """
        return _f7(
            [
                token.text
                for token in self._parsed_nlp_obj
                if (
                    not token.is_stop
                    and not token.is_punct
                    and not token.is_space
                    and token.pos_ == pos
                )
            ]
        )

    @property
    @functools.lru_cache()
    def noun(self) -> List[str]:
        """Unique tokens tagged ``NOUN``."""
        return self._pos_tokens("NOUN")

    @property
    @functools.lru_cache()
    def adjective(self) -> List[str]:
        """Unique tokens tagged ``ADJ``."""
        return self._pos_tokens("ADJ")

    @property
    @functools.lru_cache()
    def adverb(self) -> List[str]:
        """Unique tokens tagged ``ADV``."""
        return self._pos_tokens("ADV")

    @property
    @functools.lru_cache()
    def verb(self) -> List[str]:
        """Unique tokens tagged ``VERB``."""
        return self._pos_tokens("VERB")

    @property
    @functools.lru_cache()
    def noun_phrase(self) -> List[str]:
        """Non-empty, stripped noun chunks, gathered sentence by sentence.

        Each sentence is parsed on its own; duplicates are kept.
        """
        phrases = []
        for sentence in self.sentence:
            for chunk in self.nlp(sentence).noun_chunks:
                text = chunk.text.strip()
                if text:
                    phrases.append(text)

        return phrases

    @property
    @functools.lru_cache()
    def entity_extract(self) -> pd.DataFrame:
        """Named-entity table with columns ``Entity`` and ``Label``."""
        return pd.DataFrame(data=self._entity_extract(), columns=["Entity", "Label"])

    @functools.lru_cache()
    def _entity_extract(self) -> List[Tuple[str, str]]:
        """Return unique ``(entity text, entity label)`` pairs.

        Each sentence is parsed on its own; pairs keep the order in which
        they are first seen.
        """
        entities = []
        for sentence in self.sentence:
            entities.extend((ent.text, ent.label_) for ent in self.nlp(sentence).ents)

        return _f7(entities)

    def _entity_tokens(self, entity_label: str) -> List[str]:
        """Return the entity texts whose label equals ``entity_label``."""
        return [
            token for (token, label) in self._entity_extract() if label == entity_label
        ]

    @property
    @functools.lru_cache()
    def entity_people(self) -> List[str]:
        """Entities labelled ``PERSON`` (people, including fictional)."""
        return self._entity_tokens("PERSON")

    @property
    @functools.lru_cache()
    def entity_location(self) -> List[str]:
        """Entities labelled ``GPE``, ``LOC`` or ``FAC``, in that order.

        Location entities, i.e. countries, cities, states, mountain ranges,
        bodies of water, buildings, airports, highways, bridges.
        """
        return (
            self._entity_tokens("GPE")
            + self._entity_tokens("LOC")
            + self._entity_tokens("FAC")
        )

    @property
    @functools.lru_cache()
    def entity_organization(self) -> List[str]:
        """Entities labelled ``ORG`` (companies, agencies, institutions)."""
        return self._entity_tokens("ORG")

    @property
    @functools.lru_cache()
    def entity_artwork(self) -> List[str]:
        """Entities labelled ``WORK_OF_ART`` (titles of books, songs, etc.)."""
        return self._entity_tokens("WORK_OF_ART")

    @property
    @functools.lru_cache()
    def entity_product(self) -> List[str]:
        """Entities labelled ``PRODUCT`` (product names, etc.)."""
        return self._entity_tokens("PRODUCT")

    @property
    @functools.lru_cache()
    def sentence(self) -> List[str]:
        """Stripped text of every sentence found in the parsed text."""
        return [sent.text.strip() for sent in self._parsed_nlp_obj.sents]

    @property
    @functools.lru_cache()
    def token(self) -> List[str]:
        """Text of every token in the parsed text, in order."""
        return [token.text for token in self._parsed_nlp_obj]

    @functools.lru_cache()
    def _words_by_sentence(self) -> List[List[str]]:
        """Return, for each sentence, the list of its alphabetic tokens.

        Each sentence is parsed on its own. The result is memoised (like
        ``_entity_extract``) so that ``word``, ``bigram`` and ``trigram`` do
        not each re-run the pipeline; callers must not mutate the returned
        lists.
        """
        return [
            [token.text for token in self.nlp(sentence) if token.is_alpha]
            for sentence in self.sentence
        ]

    @property
    @functools.lru_cache()
    def word(self) -> List[str]:
        """All alphabetic tokens, flattened across sentences."""
        return list(itertools.chain.from_iterable(self._words_by_sentence()))

    def _ngrams(self, tokens, n):
        """Return every run of ``n`` consecutive items of ``tokens``.

        Yields an empty list when ``tokens`` has fewer than ``n`` items.
        """
        return [tokens[i : i + n] for i in range(len(tokens) - n + 1)]

    @property
    @functools.lru_cache()
    def bigram(self) -> pd.DataFrame:
        """Word pairs with columns ``Word 1`` and ``Word 2``.

        N-grams are built within each sentence, never across a sentence
        boundary.
        """
        bigram = list(
            itertools.chain.from_iterable(
                self._ngrams(sentence_words, 2)
                for sentence_words in self._words_by_sentence()
            )
        )
        return pd.DataFrame(data=bigram, columns=["Word 1", "Word 2"])

    @property
    @functools.lru_cache()
    def trigram(self) -> pd.DataFrame:
        """Word triples with columns ``Word 1``, ``Word 2`` and ``Word 3``.

        N-grams are built within each sentence, never across a sentence
        boundary.
        """
        trigram = list(
            itertools.chain.from_iterable(
                self._ngrams(sentence_words, 3)
                for sentence_words in self._words_by_sentence()
            )
        )
        return pd.DataFrame(data=trigram, columns=["Word 1", "Word 2", "Word 3"])

    @property
    @functools.lru_cache()
    def lemma(self) -> List[str]:
        """``lemma_`` of every token in the parsed text, in order."""
        return [token.lemma_ for token in self._parsed_nlp_obj]

    @property
    @functools.lru_cache()
    def word_count(self) -> pd.DataFrame:
        """Frequency table with columns ``Word`` and ``Count``.

        Words are lower-cased before counting. Rows are ordered by
        descending count, ties broken alphabetically by word.
        """
        word_count = Counter(word.lower() for word in self.word).most_common()
        df = pd.DataFrame(data=word_count, columns=["Word", "Count"])
        return df.sort_values(
            ["Count", "Word"], ascending=[False, True], ignore_index=True
        )



def _f7(seq: Sequence) -> List:
    """order preserving de-duplicate sequence"""
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]