Source code for ads.data_labeling.ner
#!/usr/bin/env python
# -*- coding: utf-8; -*-
# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
from dataclasses import dataclass, field
from typing import List
[docs]
class WrongEntityFormatLabelNotString(ValueError):
    """Error signalling that an entity label is not a ``str``."""

    def __init__(self):
        message = "Wrong entity format. Label is not a String."
        super().__init__(message)
[docs]
class WrongEntityFormatOffsetNotInteger(ValueError):
    """Error signalling that an entity offset is not an integer value."""

    def __init__(self):
        message = "Wrong entity format. Offset is not an Integer."
        super().__init__(message)
[docs]
class WrongEntityFormatLengthNotInteger(ValueError):
    """Error signalling that an entity length is not an integer value."""

    def __init__(self):
        message = "Wrong entity format. Length is not an Integer."
        super().__init__(message)
[docs]
class WrongEntityFormatOffsetIsNegative(ValueError):
    """Error signalling that an entity offset is below zero."""

    def __init__(self):
        message = "Wrong entity format. Offset is not a Nonnegative."
        super().__init__(message)
[docs]
class WrongEntityFormatLengthIsNegative(ValueError):
    """Error signalling that an entity length is below zero."""

    def __init__(self):
        message = "Wrong entity format. Length is not a Nonnegative."
        super().__init__(message)
[docs]
class WrongEntityFormatLabelIsEmpty(ValueError):
    """Error signalling that an entity label is the empty string."""

    def __init__(self):
        message = "Wrong entity format. Label is empty."
        super().__init__(message)
[docs]
@dataclass
class NERItem:
    """NERItem class which is a representation of a token span.

    The instance is validated on construction. Integer-valued floats
    (for example ``3.0``) are accepted for ``offset`` and ``length`` and
    are stored as plain ``int`` values.

    Attributes
    ----------
    label: str
        Entity name.
    offset: int
        The token span's entity start index position in the text.
    length: int
        Length of the token span.
    """

    label: str = ""
    offset: int = 0
    length: int = 0

    def _validate(self):
        """Validates the instance.

        Raises
        ------
        WrongEntityFormatLabelNotString
            If the label is not a string.
        WrongEntityFormatOffsetNotInteger
            If the offset is neither an int nor an integer-valued float.
        WrongEntityFormatLengthNotInteger
            If the length is neither an int nor an integer-valued float.
        WrongEntityFormatOffsetIsNegative
            If the offset is negative.
        WrongEntityFormatLengthIsNegative
            If the length is negative.
        WrongEntityFormatLabelIsEmpty
            If the label is an empty string.
        """
        if not isinstance(self.label, str):
            raise WrongEntityFormatLabelNotString()
        if not (
            isinstance(self.offset, int)
            or (isinstance(self.offset, float) and self.offset.is_integer())
        ):
            raise WrongEntityFormatOffsetNotInteger()
        if not (
            isinstance(self.length, int)
            or (isinstance(self.length, float) and self.length.is_integer())
        ):
            raise WrongEntityFormatLengthNotInteger()
        if self.offset < 0:
            raise WrongEntityFormatOffsetIsNegative()
        if self.length < 0:
            raise WrongEntityFormatLengthIsNegative()
        if self.label == "":
            raise WrongEntityFormatLabelIsEmpty()

    def __post_init__(self):
        self._validate()
        # Validation tolerates integer-valued floats; convert them to real
        # ints so the fields match their declared type and to_spacy() emits
        # integer span boundaries instead of floats.
        if isinstance(self.offset, float):
            self.offset = int(self.offset)
        if isinstance(self.length, float):
            self.length = int(self.length)

    def to_spacy(self) -> tuple:
        """Converts one NERItem to the spacy format.

        Returns
        -------
        Tuple
            NERItem in the spacy format: ``(start, end, label)`` where
            ``start`` is the offset and ``end`` is ``offset + length``.
        """
        return (self.offset, self.offset + self.length, self.label)

    @classmethod
    def from_spacy(cls, token) -> "NERItem":
        """Creates a NERItem from a span in the spacy format.

        Parameters
        ----------
        token
            A three-element sequence ``(start, end, label)``.

        Returns
        -------
        NERItem
            Item with ``offset`` equal to ``start`` and ``length`` equal to
            ``end - start``.
        """
        offset, end, label = token
        return cls(label, offset, end - offset)
[docs]
@dataclass
class NERItems:
    """Container holding a list of NERItem objects.

    Attributes
    ----------
    items: List[NERItem]
        The wrapped NERItem objects; defaults to a new empty list.
    """

    items: List[NERItem] = field(default_factory=list)

    def __getitem__(self, index: int) -> NERItem:
        # Indexing is delegated directly to the underlying list.
        selected = self.items[index]
        return selected

    def to_spacy(self) -> List[tuple]:
        """Converts every contained NERItem to the spacy format.

        Returns
        -------
        List[tuple]
            The result of ``to_spacy()`` for each item, in list order.
        """
        converted = []
        for entry in self.items:
            converted.append(entry.to_spacy())
        return converted