Source code for ads.data_labeling.reader.record_reader

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from typing import Any, Dict, Generator, List, Tuple, Union

from ads.config import JOB_RUN_COMPARTMENT_OCID, NB_SESSION_COMPARTMENT_OCID
from ads.data_labeling.interface.loader import Loader
from ads.data_labeling.interface.parser import Parser
from ads.data_labeling.interface.reader import Reader
from ads.data_labeling.loader.file_loader import FileLoaderFactory
from ads.data_labeling.parser.dls_record_parser import DLSRecordParserFactory
from ads.data_labeling.parser.export_record_parser import RecordParserFactory
from ads.data_labeling.reader.dls_record_reader import DLSRecordReader
from ads.data_labeling.reader.export_record_reader import ExportRecordReader


class RecordReader:
    """Record reader composed of a reader, a parser and an optional loader.

    The reader reads the content of the record file, the parser parses the
    label of each record, and the loader loads the content of the file that
    the record points to.

    Examples
    --------
    >>> import os
    >>> import oci
    >>> from ads.data_labeling import RecordReader
    >>> from ads.common import auth as authutil
    >>> file_path = "/path/to/your_record.jsonl"
    >>> dataset_type = "IMAGE"
    >>> annotation_type = "BOUNDING_BOX"
    >>> record_reader = RecordReader.from_export_file(file_path, dataset_type, annotation_type, "image_file_path", authutil.api_keys())
    >>> next(record_reader.read())
    """

    def __init__(
        self,
        reader: Reader,
        parser: Parser,
        loader: Loader = None,
        include_unlabeled: bool = False,
        encoding: str = "utf-8",
        materialize: bool = False,
    ) -> None:
        """Initiates a RecordReader instance.

        Parameters
        ----------
        reader: Reader
            Reader instance to read content from the record file.
        parser: Parser
            Parser instance to parse the labels from record file.
        loader: Loader. Defaults to None.
            Loader instance to load the content from the file path in the
            record. Required only when `materialize` is True.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.
        encoding: (str, optional). Defaults to "utf-8".
            Encoding for text files. Used only to extract the content of
            the text dataset contents.
        materialize: (bool, optional). Defaults to False.
            Whether to materialize the content using loader.

        Raises
        ------
        ValueError
            If the record reader or the record parser is not specified.
            If the loader is not specified when `materialize` is True.
        """
        if not reader:
            raise ValueError("The record reader must be specified.")
        if not parser:
            raise ValueError("The record parser must be specified.")
        if materialize and not loader:
            raise ValueError("The content loader must be specified.")

        self.reader = reader
        self.parser = parser
        self.loader = loader
        self.materialize = materialize
        self.include_unlabeled = include_unlabeled
        self.encoding = encoding

    @classmethod
    def from_export_file(
        cls,
        path: str,
        dataset_type: str,
        annotation_type: str,
        dataset_source_path: str,
        auth: Dict = None,
        include_unlabeled: bool = False,
        encoding: str = "utf-8",
        materialize: bool = False,
        format: str = None,
        categories: List[str] = None,
        includes_metadata=False,
    ) -> "RecordReader":
        """Constructs a RecordReader instance from an exported record file.

        Parameters
        ----------
        path: str
            Record file path.
        dataset_type: str
            Dataset type. Currently supports TEXT, IMAGE and DOCUMENT.
        annotation_type: str
            Annotation Type. Currently TEXT supports SINGLE_LABEL,
            MULTI_LABEL, ENTITY_EXTRACTION. IMAGE supports SINGLE_LABEL,
            MULTI_LABEL and BOUNDING_BOX. DOCUMENT supports SINGLE_LABEL
            and MULTI_LABEL.
        dataset_source_path: str
            Dataset source path.
        auth: (dict, optional). Defaults to None.
            The default authentication is set using `ads.set_auth` API.
            If you need to override the default, use the
            `ads.common.auth.api_keys` or
            `ads.common.auth.resource_principal` to create appropriate
            authentication signer and kwargs required to instantiate
            IdentityClient object.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.
        encoding: (str, optional). Defaults to "utf-8".
            Encoding for text files. Used only to extract the content of
            the text dataset contents.
        materialize: (bool, optional). Defaults to False.
            Whether to materialize the content by loader.
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" for dataset
            Entity Extraction type or "yolo" for Object Detection type.
            When None, it outputs List[NERItem] or List[BoundingBoxItem].
            When "spacy", it outputs List[Tuple].
            When "yolo", it outputs List[List[Tuple]].
        categories: (List[str], optional). Defaults to None.
            The list of object categories in proper order for model
            training. Example: ['cat','dog','horse']
        includes_metadata: (bool, optional). Defaults to False.
            Determines whether the export file includes metadata or not.

        Returns
        -------
        RecordReader
            A RecordReader instance.
        """
        # The record file itself is always opened as UTF-8; the caller's
        # `encoding` is forwarded only to the RecordReader, where it is used
        # when loading record content.
        # NOTE(review): confirm the hard-coded "utf-8" here is intentional.
        reader = ExportRecordReader(
            path=path,
            auth=auth,
            encoding="utf-8",
            includes_metadata=includes_metadata,
        )
        parser = RecordParserFactory.parser(
            annotation_type=annotation_type,
            dataset_source_path=dataset_source_path,
            format=format,
            categories=categories,
        )
        loader = FileLoaderFactory.loader(
            dataset_type=dataset_type,
            auth=auth,
        )
        return cls(
            reader=reader,
            parser=parser,
            loader=loader,
            materialize=materialize,
            include_unlabeled=include_unlabeled,
            encoding=encoding,
        )

    @classmethod
    def from_DLS(
        cls,
        dataset_id: str,
        dataset_type: str,
        annotation_type: str,
        dataset_source_path: str,
        compartment_id: str = None,
        auth: Dict = None,
        include_unlabeled: bool = False,
        encoding: str = "utf-8",
        materialize: bool = False,
        format: str = None,
        categories: List[str] = None,
    ) -> "RecordReader":
        """Constructs a RecordReader instance from a DLS dataset.

        Parameters
        ----------
        dataset_id: str
            The dataset OCID.
        dataset_type: str
            Dataset type. Currently supports TEXT, IMAGE and DOCUMENT.
        annotation_type: str
            Annotation Type. Currently TEXT supports SINGLE_LABEL,
            MULTI_LABEL, ENTITY_EXTRACTION. IMAGE supports SINGLE_LABEL,
            MULTI_LABEL and BOUNDING_BOX. DOCUMENT supports SINGLE_LABEL
            and MULTI_LABEL.
        dataset_source_path: str
            Dataset source path.
        compartment_id: (str, optional). Defaults to None.
            The compartment OCID of the dataset. When None, falls back to
            `NB_SESSION_COMPARTMENT_OCID` and then
            `JOB_RUN_COMPARTMENT_OCID`.
        auth: (dict, optional). Defaults to None.
            The default authentication is set using `ads.set_auth` API.
            If you need to override the default, use the
            `ads.common.auth.api_keys` or
            `ads.common.auth.resource_principal` to create appropriate
            authentication signer and kwargs required to instantiate
            IdentityClient object.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.
        encoding: (str, optional). Defaults to 'utf-8'.
            Encoding for files.
        materialize: (bool, optional). Defaults to False.
            Whether the content of the dataset file should be loaded or it
            should return the file path to the content. By default the
            content will not be loaded.
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" for dataset
            Entity Extraction type or "yolo" for Object Detection type.
            When None, it outputs List[NERItem] or List[BoundingBoxItem].
            When "spacy", it outputs List[Tuple].
            When "yolo", it outputs List[List[Tuple]].
        categories: (List[str], optional). Defaults to None.
            The list of object categories in proper order for model
            training. Example: ['cat','dog','horse']

        Returns
        -------
        RecordReader
            The RecordReader instance.

        Raises
        ------
        ValueError
            If `compartment_id` is not provided and cannot be taken from
            the configured defaults.
        """
        if compartment_id is None:
            compartment_id = NB_SESSION_COMPARTMENT_OCID or JOB_RUN_COMPARTMENT_OCID

        if not compartment_id:
            raise ValueError("The `compartment_id` must be provided.")

        reader = DLSRecordReader(
            compartment_id=compartment_id,
            dataset_id=dataset_id,
            auth=auth,
        )
        parser = DLSRecordParserFactory.parser(
            annotation_type=annotation_type,
            dataset_source_path=dataset_source_path,
            format=format,
            categories=categories,
        )
        loader = FileLoaderFactory.loader(
            dataset_type=dataset_type,
            auth=auth,
        )
        return cls(
            reader=reader,
            parser=parser,
            loader=loader,
            materialize=materialize,
            include_unlabeled=include_unlabeled,
            encoding=encoding,
        )

    def read(self) -> Generator[Tuple[str, Union[List, str]], Any, Any]:
        """Reads the records one by one.

        Unlabeled records are skipped unless `include_unlabeled` is set.
        When `materialize` is set, the record content is loaded through the
        loader using the configured encoding before the record is yielded.

        Yields
        ------
        Generator[Tuple[str, Union[List, str]], Any, Any]
            File path, content and labels in a tuple.
        """
        for item in self.reader.read():
            # A falsy item ends the whole stream; it is not merely skipped.
            if not item:
                return

            record = self.parser.parse(item)
            if not (record.annotation or self.include_unlabeled):
                continue

            if self.materialize:
                record.content = self.loader.load(
                    record.path, encoding=self.encoding
                )
            yield record.to_tuple()