#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

"""
The module containing classes to read labeled datasets.
Supports reading labeled datasets either from an export file or directly from the cloud.

Classes
-------
    LabeledDatasetReader
        The LabeledDatasetReader class to read labeled datasets.
    ExportReader
        The ExportReader class to read labeled datasets from an export file.
    DLSDatasetReader
        The DLSDatasetReader class to read labeled datasets from the cloud.

Examples
--------
    >>> from ads.common import auth as authutil
    >>> from ads.data_labeling import LabeledDatasetReader
    >>> ds_reader = LabeledDatasetReader.from_export(
    ...    path="oci://bucket_name@namespace/dataset_metadata.jsonl",
    ...    auth=authutil.api_keys(),
    ...    materialize=True
    ... )
    >>> ds_reader.info()
        ------------------------------------------------------------------------
        annotation_type                                             SINGLE_LABEL
        compartment_id                                          TEST_COMPARTMENT
        dataset_id                                                  TEST_DATASET
        dataset_name                                           test_dataset_name
        dataset_type                                                        TEXT
        labels                                                     ['yes', 'no']
        records_path                                             path/to/records
        source_path                                              path/to/dataset

    >>> ds_reader.read()
                                 Path            Content            Annotations
        -----------------------------------------------------------------------
        0   path/to/the/content/file1       file content                    yes
        1   path/to/the/content/file2       file content                     no
        2   path/to/the/content/file3       file content                     no

    >>> next(ds_reader.read(iterator=True))
        ("path/to/the/content/file1", "file content", "yes")

    >>> next(ds_reader.read(iterator=True, chunksize=2))
        [("path/to/the/content/file1", "file content", "yes"),
        ("path/to/the/content/file2", "file content", "no")]

    >>> next(ds_reader.read(chunksize=2))
                                Path            Content            Annotations
        ----------------------------------------------------------------------
        0   path/to/the/content/file1       file content                    yes
        1   path/to/the/content/file2       file content                     no

    >>> ds_reader = LabeledDatasetReader.from_DLS(
    ...    dataset_id="dataset_OCID",
    ...    compartment_id="compartment_OCID",
    ...    auth=authutil.api_keys(),
    ...    materialize=True
    ... )
"""

from functools import lru_cache
from typing import Any, Dict, Generator, Tuple, Union

import pandas as pd
from ads.common import auth as authutil
from ads.common.serializer import Serializable
from ads.data_labeling.interface.reader import Reader
from ads.data_labeling.reader.metadata_reader import Metadata, MetadataReader
from ads.data_labeling.reader.record_reader import RecordReader
from ads.data_labeling.constants import (
    FORMATS_TO_ANNOTATION_TYPE,
    ANNOTATION_TYPE_TO_FORMATS,
)
from ads.config import NB_SESSION_COMPARTMENT_OCID, JOB_RUN_COMPARTMENT_OCID

_LABELED_DF_COLUMNS = ["Path", "Content", "Annotations"]


class LabeledDatasetReader:
    """The labeled dataset reader class.

    Methods
    -------
    info(self) -> Metadata
        Gets labeled dataset metadata.
    read(self, iterator: bool = False) -> Union[Generator[Any, Any, Any], pd.DataFrame]
        Reads labeled dataset.
    from_export(cls, path: str, auth: Dict = None, encoding="utf-8", materialize: bool = False) -> "LabeledDatasetReader"
        Constructs a LabeledDatasetReader instance from an export file.
    from_DLS(cls, dataset_id: str, compartment_id: str = None, auth: Dict = None, encoding="utf-8", materialize: bool = False) -> "LabeledDatasetReader"
        Constructs a LabeledDatasetReader instance from the Data Labeling Service.

    Examples
    --------
    >>> from ads.common import auth as authutil
    >>> from ads.data_labeling import LabeledDatasetReader
    >>> ds_reader = LabeledDatasetReader.from_export(
    ...    path="oci://bucket_name@namespace/dataset_metadata.jsonl",
    ...    auth=authutil.api_keys(),
    ...    materialize=True
    ... )
    >>> ds_reader = LabeledDatasetReader.from_DLS(
    ...    dataset_id="dataset_OCID",
    ...    compartment_id="compartment_OCID",
    ...    auth=authutil.api_keys(),
    ...    materialize=True
    ... )
    >>> ds_reader.info()
        ------------------------------------------------------------------------
        annotation_type                                             SINGLE_LABEL
        compartment_id                                          TEST_COMPARTMENT
        dataset_id                                                  TEST_DATASET
        dataset_name                                           test_dataset_name
        dataset_type                                                        TEXT
        labels                                                     ['yes', 'no']
        records_path                                             path/to/records
        source_path                                              path/to/dataset
    >>> ds_reader.read()
                                Path            Content            Annotations
        -----------------------------------------------------------------------
        0   path/to/the/content/file1       file content                    yes
        1   path/to/the/content/file2       file content                     no
        2   path/to/the/content/file3       file content                     no
    >>> next(ds_reader.read(iterator=True))
        ("path/to/the/content/file1", "file content", "yes")
    >>> next(ds_reader.read(iterator=True, chunksize=2))
        [("path/to/the/content/file1", "file content", "yes"),
        ("path/to/the/content/file2", "file content", "no")]
    >>> next(ds_reader.read(chunksize=2))
                                Path            Content            Annotations
        ----------------------------------------------------------------------
        0   path/to/the/content/file1       file content                    yes
        1   path/to/the/content/file2       file content                     no
    """

    def __init__(self, reader: Reader):
        """Initializes the labeled dataset reader instance.

        Parameters
        ----------
        reader: Reader
            The Reader instance which reads and extracts the labeled dataset.
        """
        self._reader = reader
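    # Note: `LabeledDatasetReader` delegates all I/O to the injected `Reader`
    # implementation, so any object satisfying the `Reader` interface can be
    # passed to the constructor directly. A minimal sketch (`MyCustomReader`
    # is a hypothetical class, not part of this module):
    #
    #     ds_reader = LabeledDatasetReader(reader=MyCustomReader())
    #     df = ds_reader.read()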
    @classmethod
    def from_export(
        cls,
        path: str,
        auth: dict = None,
        encoding: str = "utf-8",
        materialize: bool = False,
        include_unlabeled: bool = False,
    ) -> "LabeledDatasetReader":
        """Constructs a LabeledDatasetReader instance from an export file.

        Parameters
        ----------
        path: str
            The metadata file path. Can be either a local or an object storage path.
        auth: (dict, optional). Defaults to None.
            The default authentication is set using the `ads.set_auth` API.
            To override the default, use `ads.common.auth.api_keys` or
            `ads.common.auth.resource_principal` to create an appropriate
            authentication signer and the kwargs required to instantiate an
            IdentityClient object.
        encoding: (str, optional). Defaults to "utf-8".
            Encoding for files.
        materialize: (bool, optional). Defaults to False.
            Whether the content of the dataset files should be loaded, or
            whether only the file paths to the content should be returned.
            By default the content is not loaded.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.

        Returns
        -------
        LabeledDatasetReader
            The LabeledDatasetReader instance.
        """
        auth = auth or authutil.default_signer()
        return cls(
            reader=ExportReader(
                path=path,
                auth=auth,
                encoding=encoding,
                materialize=materialize,
                include_unlabeled=include_unlabeled,
            )
        )
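    # Example (a sketch; the bucket path below is a placeholder): reading an
    # exported dataset while relying on the default signer configured via
    # `ads.set_auth`, so no explicit `auth` argument is needed:
    #
    #     ds_reader = LabeledDatasetReader.from_export(
    #         path="oci://bucket_name@namespace/dataset_metadata.jsonl",
    #         materialize=True,
    #     )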
    @classmethod
    def from_DLS(
        cls,
        dataset_id: str,
        compartment_id: str = None,
        auth: dict = None,
        encoding: str = "utf-8",
        materialize: bool = False,
        include_unlabeled: bool = False,
    ) -> "LabeledDatasetReader":
        """Constructs a LabeledDatasetReader instance from the Data Labeling Service (DLS).

        Parameters
        ----------
        dataset_id: str
            The dataset OCID.
        compartment_id: (str, optional). Defaults to the compartment OCID from the environment variables.
            The compartment OCID of the dataset.
        auth: (dict, optional). Defaults to None.
            The default authentication is set using the `ads.set_auth` API.
            To override the default, use `ads.common.auth.api_keys` or
            `ads.common.auth.resource_principal` to create an appropriate
            authentication signer and the kwargs required to instantiate an
            IdentityClient object.
        encoding: (str, optional). Defaults to "utf-8".
            Encoding for files.
        materialize: (bool, optional). Defaults to False.
            Whether the content of the dataset files should be loaded, or
            whether only the file paths to the content should be returned.
            By default the content is not loaded.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.

        Returns
        -------
        LabeledDatasetReader
            The LabeledDatasetReader instance.

        Raises
        ------
        ValueError
            If `compartment_id` is not provided and cannot be determined
            from the environment variables.
        """
        if compartment_id is None:
            compartment_id = NB_SESSION_COMPARTMENT_OCID or JOB_RUN_COMPARTMENT_OCID
        if not compartment_id:
            raise ValueError("The `compartment_id` must be provided.")

        return cls(
            reader=DLSDatasetReader(
                compartment_id=compartment_id,
                dataset_id=dataset_id,
                auth=auth or authutil.default_signer(),
                encoding=encoding,
                materialize=materialize,
                include_unlabeled=include_unlabeled,
            )
        )
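    # Example (a sketch; the OCID below is a placeholder): inside an OCI
    # notebook session or job run, `compartment_id` can be omitted because it
    # falls back to NB_SESSION_COMPARTMENT_OCID or JOB_RUN_COMPARTMENT_OCID:
    #
    #     ds_reader = LabeledDatasetReader.from_DLS(dataset_id="dataset_OCID")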
    def info(self) -> Serializable:
        """Gets the labeled dataset metadata.

        Returns
        -------
        Metadata
            The labeled dataset metadata.
        """
        return self._reader.info()
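    # Example: the returned `Metadata` exposes the dataset attributes used
    # throughout this module, e.g.:
    #
    #     metadata = ds_reader.info()
    #     metadata.labels           # e.g. ['yes', 'no']
    #     metadata.annotation_type  # e.g. SINGLE_LABEL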
    def read(
        self, iterator: bool = False, format: str = None, chunksize: int = None
    ) -> Union[Generator[Any, Any, Any], pd.DataFrame]:
        """Reads the labeled dataset records.

        Parameters
        ----------
        iterator: (bool, optional). Defaults to False.
            True if the result should be represented as a Generator.
            False if the result should be represented as a Pandas DataFrame.
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" or "yolo".
        chunksize: (int, optional). Defaults to None.
            The number of records to read in one iteration.
            The result will be returned as a generator.

        Returns
        -------
        Union[
            Generator[Tuple[str, str, Any], Any, Any],
            Generator[List[Tuple[str, str, Any]], Any, Any],
            Generator[pd.DataFrame, Any, Any],
            pd.DataFrame
        ]
            `pd.DataFrame` if neither `iterator` nor `chunksize` is specified.
            `Generator[pd.DataFrame]` if `iterator` is False and `chunksize` is specified.
            `Generator[List[Tuple[str, str, Any]]]` if `iterator` is True and `chunksize` is specified.
            `Generator[Tuple[str, str, Any]]` if `iterator` is True and `chunksize` is not specified.
        """
        if chunksize:
            return self._bulk_read(
                iterator=iterator, format=format, chunksize=chunksize
            )

        if iterator:
            return self._reader.read(format=format)

        return pd.DataFrame(
            self._reader.read(format=format), columns=_LABELED_DF_COLUMNS
        )
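    # Example (a sketch): the four return modes of `read`, assuming
    # `ds_reader` was built with one of the factory methods above:
    #
    #     df = ds_reader.read()                    # pd.DataFrame
    #     records = ds_reader.read(iterator=True)  # one tuple per record
    #     frames = ds_reader.read(chunksize=100)   # one DataFrame per chunk
    #     batches = ds_reader.read(iterator=True, chunksize=100)  # list of tuples per chunk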
    def _bulk_read(
        self, iterator: bool = False, format: str = None, chunksize: int = None
    ) -> Generator[Union[pd.DataFrame, Tuple[str, str, Any]], Any, Any]:
        """Reads the labeled dataset records in chunks.

        Parameters
        ----------
        iterator: (bool, optional). Defaults to False.
            True if each chunk should be yielded as a list of tuples.
            False if each chunk should be yielded as a Pandas DataFrame.
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" or "yolo".
        chunksize: (int, optional). Defaults to None.
            The number of records to read in one iteration.
            The result is represented as a generator.

        Yields
        ------
        Generator[Union[pd.DataFrame, Tuple[str, str, Any]], Any, Any]
            The generator that yields chunks of records, either as a DataFrame
            or as a list of tuples.

        Raises
        ------
        ValueError
            If `chunksize` is empty or not a positive integer.
        """
        if not chunksize or not isinstance(chunksize, int) or chunksize < 1:
            raise ValueError("`chunksize` must be a positive integer.")

        result = []
        i = 0
        for record in self._reader.read(format=format):
            result.append(record)
            i += 1
            if i >= chunksize:
                yield result if iterator else pd.DataFrame(
                    result, columns=_LABELED_DF_COLUMNS
                )
                result = []
                i = 0
        if result:
            yield result if iterator else pd.DataFrame(
                result, columns=_LABELED_DF_COLUMNS
            )
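    # Note: the trailing partial chunk is also yielded, so reading 5 records
    # with `chunksize=2` produces chunks of sizes 2, 2 and 1.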
class DLSDatasetReader(Reader):
    """The DLSDatasetReader class to read a labeled dataset from the cloud.

    Methods
    -------
    info(self) -> Metadata
        Gets the labeled dataset metadata.
    read(self) -> Generator[Tuple, Any, Any]
        Reads the labeled dataset.
    """

    def __init__(
        self,
        dataset_id: str,
        compartment_id: str,
        auth: Dict,
        encoding="utf-8",
        materialize: bool = False,
        include_unlabeled: bool = False,
    ):
        """Initializes the DLS dataset reader instance.

        Parameters
        ----------
        dataset_id: str
            The dataset OCID.
        compartment_id: str
            The compartment OCID of the dataset.
        auth: (dict, optional). Defaults to None.
            The default authentication is set using the `ads.set_auth` API.
            To override the default, use `ads.common.auth.api_keys` or
            `ads.common.auth.resource_principal` to create an appropriate
            authentication signer and the kwargs required to instantiate an
            IdentityClient object.
        encoding: (str, optional). Defaults to "utf-8".
            Encoding for files. The encoding is used to extract the metadata
            information of the labeled dataset and also to extract the content
            of text dataset records.
        materialize: (bool, optional). Defaults to False.
            Whether the content of dataset files should be loaded (materialized).
            By default the content is not materialized.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.

        Raises
        ------
        ValueError
            If `dataset_id` or `compartment_id` is empty.
        TypeError
            If `dataset_id` is not a string.
        """
        if not dataset_id:
            raise ValueError("The dataset OCID must be specified.")
        if not isinstance(dataset_id, str):
            raise TypeError("The dataset_id must be a string.")
        if not compartment_id:
            raise ValueError("The compartment OCID must be specified.")

        self.dataset_id = dataset_id
        self.compartment_id = compartment_id
        self.auth = auth or authutil.default_signer()
        self.encoding = encoding
        self.materialize = materialize
        self.include_unlabeled = include_unlabeled
    @lru_cache(maxsize=1)
    def info(self) -> Metadata:
        """Gets the labeled dataset metadata.

        Returns
        -------
        Metadata
            The labeled dataset metadata.
        """
        return MetadataReader.from_DLS(
            compartment_id=self.compartment_id,
            dataset_id=self.dataset_id,
        ).read()
    def read(self, format: str = None) -> Generator[Tuple, Any, Any]:
        """Reads the labeled dataset records.

        Parameters
        ----------
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" for the Entity
            Extraction dataset type, or "yolo" for the Object Detection
            dataset type.
            When None, outputs List[NERItem] or List[BoundingBoxItem].
            When "spacy", outputs List[Tuple].
            When "yolo", outputs List[List[Tuple]].

        Returns
        -------
        Generator[Tuple, Any, Any]
            The labeled dataset records.
        """
        metadata = self.info()
        records_reader = RecordReader.from_DLS(
            dataset_type=metadata.dataset_type,
            annotation_type=metadata.annotation_type,
            dataset_source_path=metadata.source_path,
            compartment_id=self.compartment_id,
            dataset_id=self.dataset_id,
            auth=self.auth,
            encoding=self.encoding,
            materialize=self.materialize,
            include_unlabeled=self.include_unlabeled,
            format=format,
            categories=metadata.labels,
        )
        return records_reader.read()
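    # Example (a sketch; the OCIDs are placeholders and the dataset is assumed
    # to be of the Entity Extraction type): iterating records with spaCy-style
    # annotations:
    #
    #     dls_reader = DLSDatasetReader(
    #         dataset_id="dataset_OCID",
    #         compartment_id="compartment_OCID",
    #         auth=authutil.default_signer(),
    #     )
    #     for path, content, annotations in dls_reader.read(format="spacy"):
    #         ...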
class ExportReader(Reader):
    """The ExportReader class to read a labeled dataset from an export file.

    Methods
    -------
    info(self) -> Metadata
        Gets the labeled dataset metadata.
    read(self) -> Generator[Tuple, Any, Any]
        Reads the labeled dataset.
    """

    def __init__(
        self,
        path: str,
        auth: Dict = None,
        encoding="utf-8",
        materialize: bool = False,
        include_unlabeled: bool = False,
    ):
        """Initializes the labeled dataset export reader instance.

        Parameters
        ----------
        path: str
            The metadata file path. Can be either a local or an object storage path.
        auth: (dict, optional). Defaults to None.
            The default authentication is set using the `ads.set_auth` API.
            To override the default, use `ads.common.auth.api_keys` or
            `ads.common.auth.resource_principal` to create an appropriate
            authentication signer and the kwargs required to instantiate an
            IdentityClient object.
        encoding: (str, optional). Defaults to "utf-8".
            Encoding for files. The encoding is used to extract the metadata
            information of the labeled dataset and also to extract the content
            of text dataset records.
        materialize: (bool, optional). Defaults to False.
            Whether the content of dataset files should be loaded (materialized).
            By default the content is not materialized.
        include_unlabeled: (bool, optional). Defaults to False.
            Whether to load the unlabeled records or not.

        Raises
        ------
        ValueError
            If `path` is empty.
        TypeError
            If `path` is not a string.
        """
        if not path:
            raise ValueError("The parameter `path` is required.")
        if not isinstance(path, str):
            raise TypeError("The parameter `path` must be a string.")

        self.path = path
        self.auth = auth or authutil.default_signer()
        self.encoding = encoding
        self.materialize = materialize
        self.include_unlabeled = include_unlabeled
    @lru_cache(maxsize=1)
    def info(self) -> Metadata:
        """Gets the labeled dataset metadata.

        Returns
        -------
        Metadata
            The labeled dataset metadata.
        """
        return MetadataReader.from_export_file(
            path=self.path,
            auth=self.auth,
        ).read()
    def read(self, format: str = None) -> Generator[Tuple, Any, Any]:
        """Reads the labeled dataset records.

        Parameters
        ----------
        format: (str, optional). Defaults to None.
            Output format of annotations. Can be None, "spacy" for the Entity
            Extraction dataset type, or "yolo" for the Object Detection
            dataset type.
            When None, outputs List[NERItem] or List[BoundingBoxItem].
            When "spacy", outputs List[Tuple].
            When "yolo", outputs List[List[Tuple]].

        Returns
        -------
        Generator[Tuple, Any, Any]
            The labeled dataset records.

        Raises
        ------
        TypeError
            If `format` is not supported for the dataset's annotation type.
        """
        metadata = self.info()
        if (
            format
            and isinstance(format, str)
            and (
                format.lower() not in FORMATS_TO_ANNOTATION_TYPE
                or FORMATS_TO_ANNOTATION_TYPE[format.lower()]
                != metadata.annotation_type
            )
        ):
            raise TypeError(
                "Wrong format. `format` can only be None or "
                f"`{ANNOTATION_TYPE_TO_FORMATS[metadata.annotation_type]}`."
            )

        records_reader = RecordReader.from_export_file(
            path=metadata.records_path or self.path,
            dataset_type=metadata.dataset_type,
            annotation_type=metadata.annotation_type,
            dataset_source_path=metadata.source_path,
            auth=self.auth,
            encoding=self.encoding,
            materialize=self.materialize,
            include_unlabeled=self.include_unlabeled,
            format=format,
            categories=metadata.labels,
            includes_metadata=not metadata.records_path,
        )
        return records_reader.read()
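    # Example (a sketch; the path is a placeholder): requesting an annotation
    # format that does not match the dataset's annotation type raises a
    # TypeError, e.g. format="yolo" on a SINGLE_LABEL dataset:
    #
    #     reader = ExportReader(path="oci://bucket_name@namespace/dataset_metadata.jsonl")
    #     reader.read(format="yolo")  # TypeError: Wrong format. ...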