Source code for ads.data_labeling.visualizer.text_visualizer

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The module that helps to visualize NER Text Dataset.

Methods
-------
    render(items: List[LabeledTextItem], options: Dict = None) -> str
        Renders NER dataset to Html format.

Examples
--------
>>> record1 = LabeledTextItem("London is the capital of the United Kingdom", [NERItem('city', 0, 6), NERItem("country", 29, 14)])
>>> record2 = LabeledTextItem("Houston area contractor seeking a Sheet Metal Superintendent.", [NERItem("city", 0, 7)])
>>> result = render(items = [record1, record2], options={"default_color":"#DDEECC", "colors": {"city":"#DDEECC", "country":"#FFAAAA"}})
>>> display(HTML(result))
"""

import html
import logging
from dataclasses import asdict, dataclass
from string import Template
from typing import Dict, List, Optional

import pandas as pd
from cerberus import Validator

from ads.data_labeling.constants import AnnotationType
from ads.data_labeling.ner import NERItem


# Module-level logger; used by `_df_to_ner_items` to warn about missing content.
logger = logging.getLogger(__name__)

# Cerberus schema applied to the render options dictionary in
# `RenderOptions._validate`. Both keys are optional and may be None.
HTML_OPTIONS_SCHEMA = {
    "default_color": {"nullable": True, "required": False, "type": "string"},
    "colors": {
        "nullable": True,
        "required": False,
        "type": "dict",
    },
}

# Fallback highlight color used by `RenderOptions.from_dict` when the options
# do not provide `default_color`.
DEFAULT_COLOR = "#cedddd"
# Tuple length that `_df_to_ner_items` checks for before converting the
# annotations with `NERItem.from_spacy`.
LEN_OF_SPACY_ITEM = 3


@dataclass
class LabeledTextItem:
    """Data class representing a labeled NER text record.

    Attributes
    ----------
    txt: str
        The labeled sentence.
    ents: List[NERItem]
        The list of entities. The list is sorted in place by entity offset
        when the instance is validated.
    """

    txt: str
    ents: List[NERItem]

    def _validate(self):
        """Validates the instance.

        Every entity is treated as the half-open span
        ``[offset, offset + length)`` of ``txt``. Entities that merely touch
        (one ends exactly where the next one starts) are valid; only spans
        that share at least one character are rejected.

        Returns
        -------
        None
            Nothing.

        Raises
        ------
        ValueError
            If txt is empty, if the entities are not provided as a list, or
            if an entity reaches beyond the end of the text.
        AssertionError
            In case of overlapped entities.
        """
        if not self.txt:
            raise ValueError(
                "The parameter `txt` is required and must not be an empty string."
            )
        if not isinstance(self.ents, list):
            raise ValueError(
                "Invalid format for the entities. The entities must be a List[NERItem]."
            )

        text_length = len(self.txt)
        for entity in self.ents:
            if entity.offset + entity.length > text_length:
                raise ValueError(
                    f"At least one of the entities (offset: {entity.offset}, "
                    f"length: {entity.length}) exceeds the length of the text "
                    f"({text_length})."
                )

        # Sorting by offset lets the overlap check compare neighbours only and
        # gives the renderer the entities in reading order.
        self.ents.sort(key=lambda x: x.offset)
        for current, following in zip(self.ents, self.ents[1:]):
            # The end of a span is exclusive, so `end == next offset` is fine.
            if current.offset + current.length > following.offset:
                raise AssertionError(
                    "The entity data contains overlapping tokens. The first token has "
                    f"an offset of {current.offset} and a length of {current.length}. "
                    f"The second token has an offset of {following.offset} and a "
                    f"length of {following.length}."
                )

    def __post_init__(self):
        """Runs the validation right after the dataclass is initialized."""
        self._validate()
@dataclass
class RenderOptions:
    """Data class representing render options.

    Attributes
    ----------
    default_color: str
        The color used for labels that have no entry in `colors`.
    colors: Optional[dict]
        The mapping of label name to color.
    """

    default_color: str
    colors: Optional[dict]

    @staticmethod
    def _validate(options: dict) -> None:
        """Validates whether the options passed in fit the defined schema.

        Parameters
        ----------
        options: dict
            Render options in dictionary format. Empty or missing options
            are accepted without validation.

        Returns
        -------
        None
            Nothing.

        Raises
        ------
        ValueError
            If the options do not match `HTML_OPTIONS_SCHEMA`.
        """
        if not options:
            return None
        validator = Validator(HTML_OPTIONS_SCHEMA)
        if not validator.validate(options):
            raise ValueError(validator.errors)

    @classmethod
    def from_dict(cls, options: dict) -> "RenderOptions":
        """Constructs an instance of RenderOptions from a dictionary.

        Parameters
        ----------
        options: dict
            Render options in dictionary format.

        Returns
        -------
        RenderOptions
            The instance of RenderOptions.

        Raises
        ------
        ValueError
            If the options do not match `HTML_OPTIONS_SCHEMA`.
        """
        if not options:
            return cls(default_color=DEFAULT_COLOR, colors={})

        cls._validate(options)
        # The schema allows both keys to be None, so fall back with `or`
        # instead of relying on the `dict.get` default, which only covers a
        # missing key.
        return cls(
            default_color=options.get("default_color") or DEFAULT_COLOR,
            colors=options.get("colors") or {},
        )

    def to_dict(self):
        """Converts RenderOptions instance to dictionary format.

        Returns
        -------
        dict
            The render options in dictionary format.
        """
        return asdict(self)

    def __repr__(self) -> str:
        """Returns the string form of the options dictionary."""
        # `__repr__` must return a str; returning the dict itself makes
        # `repr(obj)` raise TypeError.
        return str(self.to_dict())
class TextLabeledDataFormatter:
    """The TextLabeledDataFormatter class to render NER items into Html format."""

    _ITEM_TEMPLATE = (
        '<span style="background-color: $color; padding: 5px; margin: 0px 5px; border-radius: 5px;">'
        '<span style="margin-right: 5px;">$entity</span>'
        '<span style="text-transform: uppercase; font-weight: bold; font-size:0.8em;">$label</span>'
        "</span>"
    )
    _ROW_TEMPLATE = '<div key=$key style="margin-top:10px; line-height:2em">$row</div>'

    @staticmethod
    def render(items: List[LabeledTextItem], options: Dict = None) -> str:
        """Renders NER dataset to Html format.

        The record text, the entity labels and the colors are HTML-escaped
        before they are placed into the markup, so characters such as ``<``
        or ``&`` in the data are displayed literally instead of being
        interpreted as HTML.

        Parameters
        ----------
        items: List[LabeledTextItem]
            Items to render.
        options: Optional[dict]
            Render options.

        Returns
        -------
        str
            Html representation of rendered NER dataset.

        Raises
        ------
        ValueError
            If items not provided.
        TypeError
            If items provided in a wrong format.
        """
        if not items:
            raise ValueError("The parameter `items` is required.")
        if not isinstance(items, list) or not all(
            isinstance(x, LabeledTextItem) for x in items
        ):
            raise TypeError(
                "Wrong format for the items. Items should be a `List[LabeledTextItem]`."
            )

        render_options = RenderOptions.from_dict(options)
        item_template = Template(TextLabeledDataFormatter._ITEM_TEMPLATE)
        row_template = Template(TextLabeledDataFormatter._ROW_TEMPLATE)

        rows = []
        for item_index, item in enumerate(items):
            cursor = 0
            chunks = []
            # Walk the entities in order, emitting the plain text in front of
            # each entity followed by the highlighted entity itself.
            for ent in item.ents:
                start = ent.offset
                end = start + ent.length
                color = render_options.colors.get(
                    ent.label, render_options.default_color
                )
                chunks.append(html.escape(item.txt[cursor:start]))
                chunks.append(
                    item_template.substitute(
                        {
                            "color": html.escape(str(color)),
                            "entity": html.escape(item.txt[start:end]),
                            "label": html.escape(str(ent.label)),
                        }
                    )
                )
                cursor = end
            # Remaining text after the last entity.
            chunks.append(html.escape(item.txt[cursor:]))
            rows.append(
                row_template.substitute(
                    {"key": str(item_index), "row": "".join(chunks)}
                )
            )

        return "".join(rows)
def _df_to_ner_items(
    df: pd.DataFrame,
    content_column: str = "Content",
    annotations_column: str = "Annotations",
) -> List[LabeledTextItem]:
    """Converts pandas dataframe into a list of LabeledTextItem.

    Rows whose content is missing (None/NaN) or empty are skipped. Rows with
    content but without annotations produce an item with no entities.

    Parameters
    ----------
    df: pd.DataFrame
        The Pandas dataframe to convert.
    content_column: Optional[str]
        The column name with the content data.
    annotations_column: Optional[str]
        The column name for the annotations list.

    Returns
    -------
    List[LabeledTextItem]
        The list of LabeledTextItem objects.

    Raises
    ------
    TypeError
        If input data not a pandas dataframe.
    ValueError
        If input data has wrong format.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("The parameter `df` must be a Pandas dataframe.")

    column_names = list(df.columns)
    for required_column in (content_column, annotations_column):
        if required_column not in column_names:
            raise ValueError(
                f"The parameter `df` is invalid. It must have a column named `{required_column}`."
            )

    content_missing = df[content_column].isnull()
    if content_missing.values.any():
        logger.warning(
            "The parameter `df` includes records where the text content is not "
            "materialized. These records will be ignored. Use `materialize=True` "
            "to load the content."
        )

    result = []
    # Iterate the two columns directly: going through `df.T.to_dict()` would
    # collapse rows that share an index label.
    rows = zip(df[content_column], df[annotations_column], content_missing)
    for content, annotations, is_missing in rows:
        if annotations and not isinstance(annotations, list):
            raise ValueError(
                f"The parameter `df` is invalid. The column {annotations_column} "
                "must be of type `List[NERItem]`."
            )
        # NaN is truthy, so the null mask is needed in addition to the
        # emptiness check to really ignore non-materialized records.
        if is_missing or not content:
            continue

        annotations = annotations or []
        # Only the first annotation is inspected to decide whether the row
        # holds tuples that need converting via `NERItem.from_spacy`.
        if (
            annotations
            and isinstance(annotations[0], tuple)
            and len(annotations[0]) == LEN_OF_SPACY_ITEM
        ):
            ents = [NERItem.from_spacy(ent) for ent in annotations]
        else:
            ents = annotations
        result.append(LabeledTextItem(txt=content, ents=ents))

    return result
def render(items: List[LabeledTextItem], options: Dict = None) -> str:
    """Renders NER dataset to Html format.

    Module-level shortcut that delegates to
    `TextLabeledDataFormatter.render`.

    Parameters
    ----------
    items: List[LabeledTextItem]
        The list of NER items to render.
    options: dict, optional
        The options for rendering.

    Returns
    -------
    str
        Html string.

    Examples
    --------
    >>> record = LabeledTextItem("London is the capital of the United Kingdom", [NERItem('city', 0, 6), NERItem("country", 29, 14)])
    >>> result = render(items = [record], options={"default_color":"#DDEECC", "colors": {"city":"#DDEECC", "country":"#FFAAAA"}})
    >>> display(HTML(result))
    """
    rendered_html = TextLabeledDataFormatter.render(items=items, options=options)
    return rendered_html