Source code for ads.data_labeling.parser.export_record_parser

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import os
from abc import abstractmethod
from typing import Dict, List, Union

from ads.data_labeling.boundingbox import BoundingBoxItem
from ads.data_labeling.constants import AnnotationType, Formats
from ads.data_labeling.interface.parser import Parser
from ads.data_labeling.ner import NERItem
from ads.data_labeling.record import Record

# Shared prefix for validation error messages raised in this module. The
# trailing space lets the follow-up sentence be concatenated directly.
DATASET_RECORD_INVALID_MESSAGE = "The dataset record file is invalid. "


class EntityType:
    """Entity type class for supporting multiple types of entities.

    The attributes are plain string constants that the record parsers in
    this module compare against the ``entityType`` field of an entity.
    """

    # Entity type expected by the single-label and multi-label parsers.
    GENERIC = "GENERIC"
    # Entity type expected by the NER (text selection) parser.
    TEXTSELECTION = "TEXTSELECTION"
    # Entity type expected by the bounding box (image object) parser.
    IMAGEOBJECTSELECTION = "IMAGEOBJECTSELECTION"
class RecordParser(Parser):
    """RecordParser class which parses the labels from the record.

    Examples
    --------
    >>> from ads.data_labeling.parser.export_record_parser import SingleLabelRecordParser
    >>> from ads.data_labeling.parser.export_record_parser import MultiLabelRecordParser
    >>> from ads.data_labeling.parser.export_record_parser import NERRecordParser
    >>> from ads.data_labeling.parser.export_record_parser import BoundingBoxRecordParser
    >>> import fsspec
    >>> import json
    >>> from ads.common import auth as authutil
    >>> labels = []
    >>> with fsspec.open("/path/to/records_file.jsonl", **authutil.api_keys()) as f:
    ...     for line in f:
    ...         bounding_box_labels = BoundingBoxRecordParser("source_data_path").parse(json.loads(line))
    ...         labels.append(bounding_box_labels)
    """

    def __init__(
        self, dataset_source_path: str, format: str = None, categories: List[str] = None
    ) -> None:
        """Initiates a RecordParser instance.

        Parameters
        ----------
        dataset_source_path: str
            Dataset source path.
        format: (str, optional). Defaults to None.
            Output format of annotations.
        categories: (List[str], optional). Defaults to None.
            The list of object categories in proper order for model training.
            Example: ['cat','dog','horse']
        """
        self.dataset_source_path = dataset_source_path
        self.format = format
        self.categories = categories

    def parse(self, record: Dict) -> "Record":
        """Extracts the annotations from the record content.
        Constructs and returns a Record instance containing the file path and the labels.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Returns
        -------
        Record
            Record instance which contains the file path as well as the annotations.
        """
        return Record(
            # The path is built by plain string concatenation, so the source
            # path must already end with the expected separator.
            path=self.dataset_source_path + record["sourceDetails"]["path"],
            annotation=self._extract_annotations(
                record=record, format=self.format, categories=self.categories
            ),
        )

    @abstractmethod
    def _extract_annotations(
        self,
        record: Dict,
        **kwargs,
    ) -> Union[str, List[str], List[BoundingBoxItem], List[NERItem]]:
        """Extracts annotations from the record content.
        Each Parser class needs to implement this function.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.
        kwargs: Dict
            format: str
                Output format of annotations. Can be "spacy" or "yolo".
            categories: List[str]
                The list of object categories passed through from the parser.

        Returns
        -------
        Union[str, List[str], List[BoundingBoxItem], List[NERItem]]
            Label(s).
        """
        pass

    def _validate(self, record: Dict) -> None:
        """Validates the record to ensure it contains certain fields.

        The record must contain a non-empty ``annotations`` list whose first
        item has a non-empty ``entities`` list, and the first entity must
        contain both the ``entityType`` and ``labels`` fields.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        try:
            annotations = record["annotations"]
            entities = annotations[0]["entities"]
            first_entity = entities[0]
            is_valid = (
                isinstance(annotations, list)
                and isinstance(entities, list)
                and "entityType" in first_entity
                and "labels" in first_entity
            )
        except (KeyError, IndexError, TypeError):
            # Missing keys, empty lists and non-subscriptable values all mean
            # the record is malformed; report them uniformly as ValueError
            # below instead of leaking a low-level lookup error.
            is_valid = False

        if not is_valid:
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record is in the wrong format. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )
class SingleLabelRecordParser(RecordParser):
    """SingleLabelRecordParser class which parses the label of Single label data."""

    def _extract_annotations(self, record: Dict, **kwargs) -> Union[str, None]:
        """Extract the labels of the single label annotation class.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Returns
        -------
        Union[str, None]
            A label or None for the unlabeled record.

        Raises
        ------
        ValueError
            If the record has annotations but their format is incorrect.
        """
        # A record without the "annotations" key is treated as unlabeled.
        if "annotations" not in record:
            return None
        self._validate(record)
        return record["annotations"][0]["entities"][0]["labels"][0]["label_name"]

    def _validate(self, record: Dict) -> None:
        """Validates the format of the single label record.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        super()._validate(record)
        entity = record["annotations"][0]["entities"][0]
        entity_type = entity["entityType"]
        if entity_type != EntityType.GENERIC:
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record contains the invalid entity type:  "
                f"`{entity_type}`. The entity "
                f"type of the Single Label annotation must be `{EntityType.GENERIC}`. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )
        label_count = len(entity["labels"])
        if label_count != 1:
            # The value reported here is the number of labels on the entity.
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record contains an invalid number of labels: "
                f"{label_count}. "
                "The Single Label annotation expects only one label for each record. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )
class MultiLabelRecordParser(RecordParser):
    """MultiLabelRecordParser class which parses the label of Multiple label data."""

    def _extract_annotations(self, record: Dict, **kwargs) -> Union[List[str], None]:
        """Extract labels of the Multi label annotation class.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Returns
        -------
        Union[List[str], None]
            List of labels or None for the unlabeled record.
        """
        # A record without the "annotations" key is treated as unlabeled.
        if "annotations" not in record:
            return None
        self._validate(record)
        first_entity = record["annotations"][0]["entities"][0]
        return [entry["label_name"] for entry in first_entity["labels"]]

    def _validate(self, record: Dict) -> None:
        """Validates the format of the multi label record.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        super()._validate(record)
        first_entity = record["annotations"][0]["entities"][0]
        entity_type = first_entity["entityType"]
        if entity_type != EntityType.GENERIC:
            raise ValueError(
                f"At least one of the dataset records contains the invalid entity type: "
                f"`{entity_type}`. "
                f"The entity type of the Multi Label annotation must be `{EntityType.GENERIC}`."
            )
        label_count = len(first_entity["labels"])
        if label_count < 1:
            raise ValueError(
                f"At least one of the dataset records contains invalid number of labels: "
                f"`{label_count}`. "
                f"The Multi Label annotation expects at least one label for each record."
            )
class NERRecordParser(RecordParser):
    """NERRecordParser class which parses the label of NER label data."""

    def _extract_annotations(
        self, record: Dict, **kwargs
    ) -> Union[List[NERItem], None]:
        """Extracts the labels of the NER annotation class.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.
        kwargs: Dict
            format: str
                Output format of annotations. Can be "spacy" or None.
                When None, it outputs List[NERItem]. When "spacy", it outputs List[Tuple].

        Returns
        -------
        Union[List[NERItem], List[Tuple], None]
            The list of NERItem objects or list of tuples in spacy format.
            None for the unlabeled record.

        Raises
        ------
        ValueError
            If the record has annotations but their format is incorrect.
        """
        # A record without the "annotations" key is treated as unlabeled.
        if "annotations" not in record:
            return None
        self._validate(record)

        # The requested format is the same for every entity, so decide once.
        output_format = kwargs.get("format", None)
        as_spacy = (
            isinstance(output_format, str)
            and output_format.lower() == Formats.SPACY
        )

        items = []
        for entity in record["annotations"][0]["entities"]:
            # Only the first label of each entity is used.
            label = entity["labels"][0]["label_name"]
            offset = entity["textSpan"]["offset"]
            length = entity["textSpan"]["length"]
            item = NERItem(label=label, offset=offset, length=length)
            items.append(item.to_spacy() if as_spacy else item)
        return items

    def _validate(self, record: Dict) -> None:
        """Validates the format of the NER label record.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        super()._validate(record)
        entity_type = record["annotations"][0]["entities"][0]["entityType"]
        if entity_type != EntityType.TEXTSELECTION:
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record contains the invalid entity type:  "
                f"`{entity_type}`. The entity type "
                f"of the Entity Extraction annotation must be `{EntityType.TEXTSELECTION}`. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )
        source_path = record["sourceDetails"]["path"]
        if os.path.splitext(source_path)[1].lower() != ".txt":
            raise ValueError(
                f"The file ({source_path}) must be a text file and have a '.txt' file extension."
            )
class BoundingBoxRecordParser(RecordParser):
    """BoundingBoxRecordParser class which parses the label of BoundingBox label data."""

    def _extract_annotations(
        self, record: Dict, **kwargs
    ) -> Union[List[BoundingBoxItem], None]:
        """Extracts the labels of the Object Detection annotation class.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.
        kwargs: Dict
            format: str
                Output format of annotations. Can be None or "yolo".
                When None, it outputs List[BoundingBoxItem].
                When "yolo", it outputs List[List[Tuple]].
            categories: Optional List[str]
                The list of object categories in proper order for model training.
                Only used when bounding box annotations are in YOLO format.

        Returns
        -------
        Union[List[BoundingBoxItem], List[List[Tuple]], None]
            The list of BoundingBoxItem objects or list of tuples in YOLO format.
            None for the unlabeled record.

        Raises
        ------
        ValueError
            If the record has annotations but their format is incorrect.
        """
        # A record without the "annotations" key is treated as unlabeled.
        if "annotations" not in record:
            return None
        self._validate(record)

        # The requested format is the same for every entity, so decide once.
        output_format = kwargs.get("format", None)
        categories = kwargs.get("categories", None)
        as_yolo = (
            isinstance(output_format, str)
            and output_format.lower() == Formats.YOLO
        )

        items = []
        for entity in record["annotations"][0]["entities"]:
            labels = [label["label_name"] for label in entity["labels"]]
            coords = entity["boundingPolygon"]["normalizedVertices"]
            # The first four vertices are read in the order:
            # top-left, bottom-left, bottom-right, top-right.
            top_left = (float(coords[0]["x"]), float(coords[0]["y"]))
            bottom_left = (float(coords[1]["x"]), float(coords[1]["y"]))
            bottom_right = (float(coords[2]["x"]), float(coords[2]["y"]))
            top_right = (float(coords[3]["x"]), float(coords[3]["y"]))
            item = BoundingBoxItem(
                labels=labels,
                bottom_left=bottom_left,
                top_left=top_left,
                top_right=top_right,
                bottom_right=bottom_right,
            )
            items.append(item.to_yolo(categories=categories) if as_yolo else item)
        return items

    def _validate(self, record: Dict) -> None:
        """Validates the format of the image label record.

        Parameters
        ----------
        record: Dict
            Content of the record from the record file.

        Raises
        ------
        ValueError
            If record format is incorrect.
        """
        super()._validate(record)
        entity_type = record["annotations"][0]["entities"][0]["entityType"]
        if entity_type != EntityType.IMAGEOBJECTSELECTION:
            raise ValueError(
                f"{DATASET_RECORD_INVALID_MESSAGE}"
                "At least one record contains the invalid entity type:  "
                f"`{entity_type}`. The entity type "
                f"of the Object Detection annotation must be `{EntityType.IMAGEOBJECTSELECTION}`. "
                "Use the `DataLabeling.export()` method to create a new dataset record file."
            )
        source_path = record["sourceDetails"]["path"]
        if os.path.splitext(source_path)[1].lower() not in (".jpg", ".png", ".jpeg"):
            raise ValueError(
                f"The file ({source_path}) must be a jpg, jpeg or png file."
            )
class RecordParserFactory:
    """RecordParserFactory class which contains a list of registered parsers
    and allows to register new RecordParsers.

    Current parsers include:
        * SingleLabelRecordParser
        * MultiLabelRecordParser
        * NERRecordParser
        * BoundingBoxRecordParser
    """

    # Registry mapping an annotation type to the parser class handling it.
    _parsers = {
        AnnotationType.SINGLE_LABEL: SingleLabelRecordParser,
        AnnotationType.MULTI_LABEL: MultiLabelRecordParser,
        AnnotationType.ENTITY_EXTRACTION: NERRecordParser,
        AnnotationType.BOUNDING_BOX: BoundingBoxRecordParser,
    }

    @staticmethod
    def parser(
        annotation_type: str,
        dataset_source_path: str,
        format: str = None,
        categories: List[str] = None,
    ) -> "RecordParser":
        """Gets the parser based on the annotation_type.

        Parameters
        ----------
        annotation_type: str
            Annotation type which can be SINGLE_LABEL, MULTI_LABEL, ENTITY_EXTRACTION
            and BOUNDING_BOX.
        dataset_source_path: str
            Dataset source path.
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" for dataset
            Entity Extraction type or "yolo" for Object Detection type.
            When None, it outputs List[NERItem] or List[BoundingBoxItem].
            When "spacy", it outputs List[Tuple].
            When "yolo", it outputs List[List[Tuple]].
        categories: (List[str], optional). Defaults to None.
            The list of object categories in proper order for model training.
            Example: ['cat','dog','horse']

        Returns
        -------
        RecordParser
            RecordParser corresponding to the annotation type.

        Raises
        ------
        ValueError
            If annotation_type is not supported.
        """
        registry = RecordParserFactory._parsers
        if annotation_type in registry:
            parser_cls = registry[annotation_type]
            return parser_cls(
                dataset_source_path=dataset_source_path,
                format=format,
                categories=categories,
            )
        raise ValueError(
            f"The {annotation_type} is not supported. Choose from "
            f"`{AnnotationType.SINGLE_LABEL}`, `{AnnotationType.MULTI_LABEL}`, "
            f"`{AnnotationType.ENTITY_EXTRACTION}` and `{AnnotationType.BOUNDING_BOX}`."
        )

    @classmethod
    def register(cls, annotation_type: str, parser) -> None:
        """Registers a new parser.

        An existing entry for the same annotation type is replaced.

        Parameters
        ----------
        annotation_type: str
            Annotation type which can be SINGLE_LABEL, MULTI_LABEL, ENTITY_EXTRACTION
            and BOUNDING_BOX.
        parser: RecordParser
            A new Parser class to be registered.

        Returns
        -------
        None
            Nothing.
        """
        cls._parsers[annotation_type] = parser