# Source code for ads.data_labeling.reader.record_reader

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from typing import Any, Dict, Generator, List, Tuple, Union

from ads.config import JOB_RUN_COMPARTMENT_OCID, NB_SESSION_COMPARTMENT_OCID
from ads.data_labeling.interface.loader import Loader
from ads.data_labeling.interface.parser import Parser
from ads.data_labeling.interface.reader import Reader
from ads.data_labeling.loader.file_loader import FileLoaderFactory
from ads.data_labeling.parser.dls_record_parser import DLSRecordParserFactory
from ads.data_labeling.parser.export_record_parser import RecordParserFactory
from ads.data_labeling.reader.dls_record_reader import DLSRecordReader
from ads.data_labeling.reader.export_record_reader import ExportRecordReader


class RecordReader:
    """Record Reader Class consists of a reader, a parser and a loader. The reader
    reads the content from the record file. The parser parses the label for each
    record. And the loader loads the content of the file path in that record.

    Examples
    --------
    >>> import os
    >>> import oci
    >>> from ads.data_labeling import RecordReader
    >>> from ads.common import auth as authutil
    >>> file_path = "/path/to/your_record.jsonl"
    >>> dataset_type = "IMAGE"
    >>> annotation_type = "BOUNDING_BOX"
    >>> record_reader = RecordReader.from_export_file(file_path, dataset_type, annotation_type, "image_file_path", authutil.api_keys())
    >>> next(record_reader.read())
    """

    def __init__(
        self,
        reader: Reader,
        parser: Parser,
        loader: Loader = None,
        include_unlabeled: bool = False,
        encoding: str = "utf-8",
        materialize: bool = False,
    ) -> None:
        """Initiates a RecordReader instance.

        Parameters
        ----------
        reader: Reader
            Reader instance to read content from the record file.
        parser: Parser
            Parser instance to parse the labels from record file.
        loader: Loader. Defaults to None.
            Loader instance to load the content from the file path in the record.
            Required only when `materialize` is True.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.
        encoding: (str, optional). Defaults to "utf-8".
            Encoding for text files. Used only to extract the content of the text dataset contents.
        materialize: (bool, optional). Defaults to False.
            Whether to materialize the content using loader.

        Raises
        ------
        ValueError
            If the record reader or the record parser is not specified.
            If the loader is not specified when `materialize` is True.
        """
        if not reader:
            raise ValueError("The record reader must be specified.")
        if not parser:
            raise ValueError("The record parser must be specified.")
        # A loader is only mandatory when the content has to be loaded.
        if materialize and not loader:
            raise ValueError("The content loader must be specified.")

        self.reader = reader
        self.parser = parser
        self.loader = loader
        self.materialize = materialize
        self.include_unlabeled = include_unlabeled
        self.encoding = encoding

    @classmethod
    def from_export_file(
        cls,
        path: str,
        dataset_type: str,
        annotation_type: str,
        dataset_source_path: str,
        auth: Dict = None,
        include_unlabeled: bool = False,
        encoding: str = "utf-8",
        materialize: bool = False,
        format: str = None,
        categories: List[str] = None,
        includes_metadata=False,
    ) -> "RecordReader":
        """Constructs a RecordReader instance from an exported record file.

        Parameters
        ----------
        path: str
            Record file path.
        dataset_type: str
            Dataset type. Currently supports TEXT, IMAGE and DOCUMENT.
        annotation_type: str
            Annotation Type. Currently TEXT supports SINGLE_LABEL, MULTI_LABEL,
            ENTITY_EXTRACTION. IMAGE supports SINGLE_LABEL, MULTI_LABEL and BOUNDING_BOX.
            DOCUMENT supports SINGLE_LABEL and MULTI_LABEL.
        dataset_source_path: str
            Dataset source path.
        auth: (dict, optional). Defaults to None.
            The default authentication is set using `ads.set_auth` API. If you need to override the
            default, use the `ads.common.auth.api_keys` or `ads.common.auth.resource_principal` to create appropriate
            authentication signer and kwargs required to instantiate IdentityClient object.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.
        encoding: (str, optional). Defaults to "utf-8".
            Encoding for text files. Used only to extract the content of the text dataset contents.
        materialize: (bool, optional). Defaults to False.
            Whether to materialize the content by loader.
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" for dataset
            Entity Extraction type or "yolo" for Object Detection type.
            When None, it outputs List[NERItem] or List[BoundingBoxItem].
            When "spacy", it outputs List[Tuple].
            When "yolo", it outputs List[List[Tuple]].
        categories: (List[str], optional). Defaults to None.
            The list of object categories in proper order for model training.
            Example: ['cat','dog','horse']
        includes_metadata: (bool, optional). Defaults to False.
            Determines whether the export file includes metadata or not.

        Returns
        -------
        RecordReader
            A RecordReader instance.
        """
        # The record file itself is always read as UTF-8; the `encoding`
        # argument is forwarded only to the instance (and from there to the
        # content loader), matching the parameter description above.
        reader = ExportRecordReader(
            path=path, auth=auth, encoding="utf-8", includes_metadata=includes_metadata
        )
        parser = RecordParserFactory.parser(
            annotation_type=annotation_type,
            dataset_source_path=dataset_source_path,
            format=format,
            categories=categories,
        )
        loader = FileLoaderFactory.loader(
            dataset_type=dataset_type,
            auth=auth,
        )
        return cls(
            reader=reader,
            parser=parser,
            loader=loader,
            materialize=materialize,
            include_unlabeled=include_unlabeled,
            encoding=encoding,
        )

    @classmethod
    def from_DLS(
        cls,
        dataset_id: str,
        dataset_type: str,
        annotation_type: str,
        dataset_source_path: str,
        compartment_id: str = None,
        auth: Dict = None,
        include_unlabeled: bool = False,
        encoding: str = "utf-8",
        materialize: bool = False,
        format: str = None,
        categories: List[str] = None,
    ) -> "RecordReader":
        """Constructs a RecordReader instance from a Data Labeling Service dataset.

        Parameters
        ----------
        dataset_id: str
            The dataset OCID.
        dataset_type: str
            Dataset type. Currently supports TEXT, IMAGE and DOCUMENT.
        annotation_type: str
            Annotation Type. Currently TEXT supports SINGLE_LABEL, MULTI_LABEL,
            ENTITY_EXTRACTION. IMAGE supports SINGLE_LABEL, MULTI_LABEL and BOUNDING_BOX.
            DOCUMENT supports SINGLE_LABEL and MULTI_LABEL.
        dataset_source_path: str
            Dataset source path.
        compartment_id: (str, optional). Defaults to None.
            The compartment OCID of the dataset. When None, falls back to
            `NB_SESSION_COMPARTMENT_OCID`, then `JOB_RUN_COMPARTMENT_OCID`.
        auth: (dict, optional). Defaults to None.
            The default authentication is set using `ads.set_auth` API. If you need to override the
            default, use the `ads.common.auth.api_keys` or `ads.common.auth.resource_principal` to create appropriate
            authentication signer and kwargs required to instantiate IdentityClient object.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.
        encoding: (str, optional). Defaults to 'utf-8'.
            Encoding for files.
        materialize: (bool, optional). Defaults to False.
            Whether the content of the dataset file should be loaded or it should return the file path to the content.
            By default the content will not be loaded.
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" for dataset
            Entity Extraction type or "yolo" for Object Detection type.
            When None, it outputs List[NERItem] or List[BoundingBoxItem].
            When "spacy", it outputs List[Tuple].
            When "yolo", it outputs List[List[Tuple]].
        categories: (List[str], optional). Defaults to None.
            The list of object categories in proper order for model training.
            Example: ['cat','dog','horse']

        Returns
        -------
        RecordReader
            The RecordReader instance.

        Raises
        ------
        ValueError
            If `compartment_id` is not provided and cannot be resolved from
            the configured compartment OCIDs.
        """
        # Only an omitted (None) compartment triggers the environment fallback;
        # any other falsy value is rejected by the check below.
        if compartment_id is None:
            compartment_id = NB_SESSION_COMPARTMENT_OCID or JOB_RUN_COMPARTMENT_OCID

        if not compartment_id:
            raise ValueError("The `compartment_id` must be provided.")

        reader = DLSRecordReader(
            compartment_id=compartment_id, dataset_id=dataset_id, auth=auth
        )
        parser = DLSRecordParserFactory.parser(
            annotation_type=annotation_type,
            dataset_source_path=dataset_source_path,
            format=format,
            categories=categories,
        )
        loader = FileLoaderFactory.loader(
            dataset_type=dataset_type,
            auth=auth,
        )
        return cls(
            reader=reader,
            parser=parser,
            loader=loader,
            materialize=materialize,
            include_unlabeled=include_unlabeled,
            encoding=encoding,
        )

    def read(self) -> Generator[Tuple[str, Union[List, str]], Any, Any]:
        """Reads the records one by one.

        Each item produced by the reader is parsed into a record. Records
        without an annotation are skipped unless `include_unlabeled` is set.
        When `materialize` is set, the record content is loaded from the
        record path using the configured encoding before the record is yielded.

        Yields
        ------
        Generator[Tuple[str, Union[List, str]], Any, Any]
            File path, content and labels in a tuple.
        """
        for item in self.reader.read():
            if not item:
                # An empty item ends the whole iteration, not just this record.
                return
            record = self.parser.parse(item)
            if not (record.annotation or self.include_unlabeled):
                continue
            if self.materialize:
                record.content = self.loader.load(
                    record.path, encoding=self.encoding
                )
            yield record.to_tuple()