Source code for ads.data_labeling.parser.export_record_parser

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import os
from abc import abstractmethod
from typing import Dict, List, Union

from ads.data_labeling.boundingbox import BoundingBoxItem
from ads.data_labeling.constants import AnnotationType, Formats
from ads.data_labeling.interface.parser import Parser
from ads.data_labeling.ner import NERItem
from ads.data_labeling.record import Record

DATASET_RECORD_INVALID_MESSAGE = "The dataset record file is invalid. "



[docs]
class EntityType:
    """Names of the entity types that may appear in a dataset record.

    Each constant is compared against the ``entityType`` field of the first
    entity of a record when a parser validates that record.
    """

    # Expected by the single label and multi label parsers.
    GENERIC = "GENERIC"
    # Expected by the NER parser.
    TEXTSELECTION = "TEXTSELECTION"
    # Expected by the bounding box parser.
    IMAGEOBJECTSELECTION = "IMAGEOBJECTSELECTION"




[docs]
class RecordParser(Parser):
    """Base class of the parsers which extract the labels from a dataset record.

    Subclasses implement ``_extract_annotations`` for one annotation type;
    ``parse`` wraps the extracted annotations together with the file path
    into a ``Record``.

    Examples
    --------
    >>> from ads.data_labeling.parser.export_record_parser import SingleLabelRecordParser
    >>> from ads.data_labeling.parser.export_record_parser import MultiLabelRecordParser
    >>> from ads.data_labeling.parser.export_record_parser import NERRecordParser
    >>> from ads.data_labeling.parser.export_record_parser import BoundingBoxRecordParser
    >>> import fsspec
    >>> import json
    >>> from ads.common import auth as authutil
    >>> labels = []
    >>> with fsspec.open("/path/to/records_file.jsonl", **authutil.api_keys()) as f:
    ...     for line in f:
    ...         bounding_box_labels = BoundingBoxRecordParser("source_data_path").parse(json.loads(line))
    ...         labels.append(bounding_box_labels)
    """

    def __init__(
        self, dataset_source_path: str, format: str = None, categories: List[str] = None
    ) -> None:
        """Initiates a RecordParser instance.

        Parameters
        ----------
        dataset_source_path: str
            Dataset source path.
        format: (str, optional). Defaults to None.
            Output format of annotations.
        categories: (List[str], optional). Defaults to None.
            The list of object categories in proper order for model training.
            Example: ['cat','dog','horse']
        """
        self.dataset_source_path = dataset_source_path
        self.format = format
        self.categories = categories

    def parse(self, record: Dict) -> "Record":
        """Extracts the annotations from the record content.
        Constructs and returns a Record instance containing the file path and the labels.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Returns
        -------
        Record
            Record instance which contains the file path as well as the annotations.
        """
        return Record(
            # Plain string concatenation: no path separator is inserted between
            # the dataset source path and the record path.
            path=self.dataset_source_path + record["sourceDetails"]["path"],
            annotation=self._extract_annotations(
                record=record, format=self.format, categories=self.categories
            ),
        )

    @abstractmethod
    def _extract_annotations(
        self,
        record: Dict,
        **kwargs,
    ) -> Union[str, List[str], List[BoundingBoxItem], List[NERItem]]:
        """Extracts annotations from the record content. Each Parser class
        needs to implement this function.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.
        kwargs: Dict
            format: str
                Output format of annotations. Can be "spacy" or "yolo".
            categories: List[str]
                The list of object categories; ``parse`` always passes it.

        Returns
        -------
        Union[str, List[str], List[BoundingBoxItem], List[NERItem]]
            Label(s).
        """
        pass

    @staticmethod
    def _first_dict(items) -> Union[Dict, None]:
        """Returns the first element of ``items`` when ``items`` is a non-empty
        list whose first element is a dict, otherwise None."""
        if isinstance(items, list) and items and isinstance(items[0], dict):
            return items[0]
        return None

    def _validate(self, record: Dict) -> None:
        """Validates the record to ensure it contains certain fields.

        The record must hold a non-empty ``annotations`` list whose first item
        holds a non-empty ``entities`` list, and the first entity must contain
        the ``entityType`` and ``labels`` keys.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        # Empty lists are rejected here so that a malformed record surfaces as
        # the documented ValueError rather than an IndexError on `[0]`.
        annotation = self._first_dict(record.get("annotations"))
        entity = (
            self._first_dict(annotation.get("entities"))
            if annotation is not None
            else None
        )
        if entity is None or "entityType" not in entity or "labels" not in entity:
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record is in the wrong format. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )




[docs]
class SingleLabelRecordParser(RecordParser):
    """SingleLabelRecordParser class which parses the label of Single label data."""

    def _extract_annotations(self, record: Dict, **kwargs) -> Union[str, None]:
        """Extract the label of the single label annotation class.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.
        kwargs: Dict
            Accepted for interface compatibility; not used by this parser.

        Returns
        -------
        Union[str, None]
            A label or None for the unlabeled record.

        Raises
        ------
        ValueError
            If the record contains annotations in an incorrect format.
        """
        # A record without the "annotations" key is treated as unlabeled.
        if "annotations" not in record:
            return None

        self._validate(record)
        return record["annotations"][0]["entities"][0]["labels"][0]["label_name"]

    def _validate(self, record: Dict) -> None:
        """Validates the format of the single label record.

        On top of the base validation, the first entity must be of the
        ``GENERIC`` type and carry exactly one label.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        super()._validate(record)
        entity = record["annotations"][0]["entities"][0]
        if entity["entityType"] != EntityType.GENERIC:
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record contains the invalid entity type:  "
                f"`{entity['entityType']}`. The entity "
                f"type of the Single Label annotation must be `{EntityType.GENERIC}`. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )
        if len(entity["labels"]) != 1:
            # The number reported is the number of labels, so the message
            # names labels (it previously said "records").
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record contains an invalid number of labels: "
                f"{len(entity['labels'])}. "
                "The Single Label annotation expects only one label for each record. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )




[docs]
class MultiLabelRecordParser(RecordParser):
    """Parser which reads every label attached to a Multi label record."""

    def _extract_annotations(self, record: Dict, **kwargs) -> Union[List[str], None]:
        """Collect the label names of a Multi label record.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Returns
        -------
        Union[List[str], None]
            The label names of the first entity, or None when the record
            has no "annotations" key (unlabeled record).
        """
        if "annotations" not in record:
            return None

        self._validate(record)
        names = []
        for label in record["annotations"][0]["entities"][0]["labels"]:
            names.append(label["label_name"])
        return names

    def _validate(self, record: Dict) -> None:
        """Check that the record is a well-formed Multi label record.

        Runs the base validation, then requires the first entity to be of
        the ``GENERIC`` type and to hold at least one label.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        super()._validate(record)

        first_entity = record["annotations"][0]["entities"][0]
        if first_entity["entityType"] != EntityType.GENERIC:
            raise ValueError(
                "At least one of the dataset records contains the invalid entity type: "
                f"`{first_entity['entityType']}`. "
                f"The entity type of the Multi Label annotation must be `{EntityType.GENERIC}`."
            )

        label_count = len(first_entity["labels"])
        if label_count == 0:
            raise ValueError(
                "At least one of the dataset records contains invalid number of labels: "
                f"`{label_count}`. "
                "The Multi Label annotation expects at least one label for each record."
            )




[docs]
class NERRecordParser(RecordParser):
    """NERRecordParser class which parses the label of NER label data."""

    def _extract_annotations(
        self, record: Dict, **kwargs
    ) -> Union[List[NERItem], None]:
        """Extracts the labels of the NER annotation class.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.
        kwargs: Dict
            format: str
                Output format of annotations. Can be "spacy" or None.
                When None, it outputs List[NERItem]. When "spacy", it
                outputs List[Tuple].

        Returns
        -------
        Union[List[NERItem], List[Tuple], None]
            The list of NERItem objects or list of tuples in spacy format.
            None for a record without the "annotations" key.

        Raises
        ------
        ValueError
            If the record contains annotations in an incorrect format.
        """
        # A record without the "annotations" key is treated as unlabeled.
        if "annotations" not in record:
            return None

        self._validate(record)

        # The requested format does not change between entities, so decide
        # once whether the items must be converted. The comparison is
        # case-insensitive and ignores non-string or empty values.
        output_format = kwargs.get("format", None)
        as_spacy = (
            isinstance(output_format, str)
            and output_format.lower() == Formats.SPACY
        )

        items = []
        for entity in record["annotations"][0]["entities"]:
            # Only the first label of each entity is used.
            item = NERItem(
                label=entity["labels"][0]["label_name"],
                offset=entity["textSpan"]["offset"],
                length=entity["textSpan"]["length"],
            )
            items.append(item.to_spacy() if as_spacy else item)
        return items

    def _validate(self, record: Dict) -> None:
        """Validates the format of the NER label record.

        On top of the base validation, the first entity must be of the
        ``TEXTSELECTION`` type and the record path must end with ``.txt``
        (case-insensitive).

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        super()._validate(record)
        entity_type = record["annotations"][0]["entities"][0]["entityType"]
        if entity_type != EntityType.TEXTSELECTION:
            # The message names the Entity Extraction annotation; it was
            # previously copied from the Single Label parser.
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record contains the invalid entity type:  "
                f"`{entity_type}`. The entity type "
                f"of the Entity Extraction annotation must be `{EntityType.TEXTSELECTION}`. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )
        source_path = record["sourceDetails"]["path"]
        if os.path.splitext(source_path)[1].lower() != ".txt":
            raise ValueError(
                f"The file ({source_path}) must be a text file and have a '.txt' file extension."
            )




[docs]
class BoundingBoxRecordParser(RecordParser):
    """BoundingBoxRecordParser class which parses the label of BoundingBox label data."""

    @staticmethod
    def _vertex_to_point(vertex: Dict) -> tuple:
        """Converts a vertex with "x" and "y" entries into an (x, y) tuple of floats."""
        return (float(vertex["x"]), float(vertex["y"]))

    def _extract_annotations(
        self, record: Dict, **kwargs
    ) -> Union[List[BoundingBoxItem], None]:
        """Extracts the labels of the Object Detection annotation class.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.
        kwargs: Dict
            format: str
                Output format of annotations. Can be None or "yolo".
                When None, it outputs List[BoundingBoxItem]. When "yolo", it
                outputs List[List[Tuple]].
            categories: Optional List[str]
                 The list of object categories in proper order for model training.
                 Only used when bounding box annotations are in YOLO format.

        Returns
        -------
        Union[List[BoundingBoxItem], List[List[Tuple]], None]
            The list of BoundingBoxItem objects or list of tuples in YOLO format.
            None for a record without the "annotations" key.

        Raises
        ------
        ValueError
            If the record contains annotations in an incorrect format.
        """
        # A record without the "annotations" key is treated as unlabeled.
        if "annotations" not in record:
            return None

        self._validate(record)

        # The requested format does not change between entities, so decide
        # once whether the items must be converted. The comparison is
        # case-insensitive and ignores non-string or empty values.
        output_format = kwargs.get("format", None)
        categories = kwargs.get("categories", None)
        as_yolo = (
            isinstance(output_format, str)
            and output_format.lower() == Formats.YOLO
        )

        items = []
        for entity in record["annotations"][0]["entities"]:
            labels = [label["label_name"] for label in entity["labels"]]
            coords = entity["boundingPolygon"]["normalizedVertices"]
            # The first four vertices are read in this order:
            # top-left, bottom-left, bottom-right, top-right.
            top_left, bottom_left, bottom_right, top_right = (
                self._vertex_to_point(coords[index]) for index in range(4)
            )
            item = BoundingBoxItem(
                labels=labels,
                bottom_left=bottom_left,
                top_left=top_left,
                top_right=top_right,
                bottom_right=bottom_right,
            )
            items.append(item.to_yolo(categories=categories) if as_yolo else item)

        return items

    def _validate(self, record: Dict) -> None:
        """Validates the format of the image label record.

        On top of the base validation, the first entity must be of the
        ``IMAGEOBJECTSELECTION`` type and the record path must end with
        ``.jpg``, ``.jpeg`` or ``.png`` (case-insensitive).

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        super()._validate(record)
        entity_type = record["annotations"][0]["entities"][0]["entityType"]
        if entity_type != EntityType.IMAGEOBJECTSELECTION:
            # The message names the Object Detection annotation; it was
            # previously copied from the Single Label parser.
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record contains the invalid entity type:  "
                f"`{entity_type}`. The entity type "
                f"of the Object Detection annotation must be `{EntityType.IMAGEOBJECTSELECTION}`. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )
        source_path = record["sourceDetails"]["path"]
        if os.path.splitext(source_path)[1].lower() not in (".jpg", ".png", ".jpeg"):
            raise ValueError(
                f"The file ({source_path}) must be a jpg, jpeg or png file."
            )




[docs]
class RecordParserFactory:
    """Factory which maps annotation types to RecordParser classes and lets
    callers register additional parsers.

    Parsers registered by default:
        * SingleLabelRecordParser
        * MultiLabelRecordParser
        * NERRecordParser
        * BoundingBoxRecordParser
    """

    # Registry of parser classes keyed by annotation type.
    _parsers = {
        AnnotationType.SINGLE_LABEL: SingleLabelRecordParser,
        AnnotationType.MULTI_LABEL: MultiLabelRecordParser,
        AnnotationType.ENTITY_EXTRACTION: NERRecordParser,
        AnnotationType.BOUNDING_BOX: BoundingBoxRecordParser,
    }

    @staticmethod
    def parser(
        annotation_type: str,
        dataset_source_path: str,
        format: str = None,
        categories: List[str] = None,
    ) -> "RecordParser":
        """Builds the parser registered for the given annotation type.

        Parameters
        ----------
        annotation_type: str
            Annotation type which can be SINGLE_LABEL, MULTI_LABEL, ENTITY_EXTRACTION
            and BOUNDING_BOX.
        dataset_source_path: str
            Dataset source path.
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" for dataset
            Entity Extraction type or "yolo" for Object Detection type.
            When None, it outputs List[NERItem] or List[BoundingBoxItem].
            When "spacy", it outputs List[Tuple].
            When "yolo", it outputs List[List[Tuple]].
        categories: (List[str], optional). Defaults to None.
            The list of object categories in proper order for model training.
            Example: ['cat','dog','horse']

        Returns
        -------
        RecordParser
            RecordParser corresponding to the annotation type.

        Raises
        ------
        ValueError
            If annotation_type is not supported.
        """
        # The lookup always goes through RecordParserFactory itself.
        registry = RecordParserFactory._parsers
        if annotation_type not in registry:
            raise ValueError(
                f"The {annotation_type} is not supported. Choose from "
                f"`{AnnotationType.SINGLE_LABEL}`, `{AnnotationType.MULTI_LABEL}`, "
                f"`{AnnotationType.ENTITY_EXTRACTION}` and `{AnnotationType.BOUNDING_BOX}`."
            )

        parser_class = registry[annotation_type]
        return parser_class(
            dataset_source_path=dataset_source_path,
            format=format,
            categories=categories,
        )

    @classmethod
    def register(cls, annotation_type: str, parser) -> None:
        """Adds a parser to the registry, replacing any parser already stored
        under the same annotation type.

        Parameters
        ----------
        annotation_type: str
            Annotation type which can be SINGLE_LABEL, MULTI_LABEL, ENTITY_EXTRACTION
            and BOUNDING_BOX.
        parser: RecordParser
            A new Parser class to be registered.

        Returns
        -------
        None
            Nothing.
        """
        cls._parsers[annotation_type] = parser