Source code for ads.data_labeling.mixin.data_labeling

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from typing import Dict, List, Optional

from ads.common import auth as authutil
from ads.common.decorator.runtime_dependency import (
    runtime_dependency,
    OptionalDependency,
)
from ads.data_labeling.reader.dataset_reader import LabeledDatasetReader
from ads.data_labeling.visualizer import image_visualizer, text_visualizer

# Default value of the `limit` argument of `render_bounding_box` and
# `render_ner`: the maximum number of rows handed to the visualizers.
ROWS_TO_RENDER_LIMIT = 50



[docs]
class DataLabelingAccessMixin:
    """Mixin class with helpers for labeled datasets.

    Provides loading of labeled data (from an export file or from the Data
    Labeling Service) and rendering of bounding box and NER annotations.
    The rendering methods read the data to display from `self._obj`, so the
    class using this mixin is expected to provide that attribute.
    """


[docs]
    @staticmethod
    def read_labeled_data(
        path: str = None,
        dataset_id: str = None,
        compartment_id: str = None,
        auth: Dict = None,
        materialize: bool = False,
        encoding: str = "utf-8",
        include_unlabeled: bool = False,
        format: str = None,
        chunksize: int = None,
    ):
        """Loads a labeled dataset either from an export file or from the Data Labeling Service.

        Exactly one of `path` and `dataset_id` must be given: `path` selects the
        export file reader, `dataset_id` selects the Data Labeling Service reader.

        Parameters
        ----------
        path: (str, optional). Defaults to None
            The export file path, can be either local or object storage path.
        dataset_id: (str, optional). Defaults to None
            The dataset OCID.
        compartment_id: (str, optional). Defaults to None
            The compartment OCID of the dataset. Only used together with `dataset_id`.
            When omitted, the resolution is left to `LabeledDatasetReader.from_DLS`
            (originally documented as falling back to the compartment OCID from
            the environment variable).
        auth: (dict, optional). Defaults to None
            The default authentication is set using `ads.set_auth` API. If you need to override the
            default, use the `ads.common.auth.api_keys` or `ads.common.auth.resource_principal` to create appropriate
            authentication signer and kwargs required to instantiate IdentityClient object.
            When not provided (or empty), `ads.common.auth.default_signer()` is used.
        materialize: (bool, optional). Defaults to False
            Whether the content of the dataset file should be loaded or it should return the file path to the content.
            By default the content will not be loaded.
        encoding: (str, optional). Defaults to 'utf-8'
            Encoding of files. Only used for "TEXT" dataset.
        include_unlabeled: (bool, optional). Defaults to False
            Whether to load the unlabeled records or not.
        format: (str, optional). Defaults to None
            Output format of annotations. Can be None, "spacy" for dataset
            Entity Extraction type or "yolo" for Object Detection type.

            - When None, it outputs List[NERItem] or List[BoundingBoxItem],
            - When "spacy", it outputs List[Tuple],
            - When "yolo", it outputs List[List[Tuple]].

        chunksize: (int, optional). Defaults to None
            The amount of records that should be read in one iteration.
            When specified, the result is returned as a generator of data frames.

        Returns
        -------
        Union[Generator[pd.DataFrame, Any, Any], pd.DataFrame]
            `pd.DataFrame` if `chunksize` is not specified.
            `Generator[pd.DataFrame]` if `chunksize` is specified.
            Every returned data frame has the dataset info stored in `df.ads._info`.

        Raises
        ------
        ValueError
            If neither `path` nor `dataset_id` is provided, or if both are provided.

        Examples
        --------
        >>> import pandas as pd
        >>> import ads
        >>> from ads.common import auth as authutil
        >>> df = pd.DataFrame.ads.read_labeled_data(path="path_to_your_metadata.jsonl",
        ...                                         auth=authutil.api_keys(),
        ...                                         materialize=False)
                                    Path       Content               Annotations
            --------------------------------------------------------------------
            0   path/to/the/content/file                                     yes
            1   path/to/the/content/file                                      no

        >>> df = pd.DataFrame.ads.read_labeled_data(dataset_id="your_dataset_ocid",
        ...                                         compartment_id="your_compartment_id",
        ...                                         auth=authutil.api_keys(),
        ...                                         materialize=False)
                                    Path       Content               Annotations
            --------------------------------------------------------------------
            0   path/to/the/content/file                                     yes
            1   path/to/the/content/file                                      no
        """
        # The signer is resolved before the arguments are validated.
        auth = auth or authutil.default_signer()

        if not (path or dataset_id):
            raise ValueError("Either `path` or the `dataset_id` need to be provided.")
        if path and dataset_id:
            raise ValueError("Only one of `path` and `dataset_id` need to be provided.")

        # Options accepted by both reader factories.
        shared_options = dict(
            auth=auth,
            encoding=encoding,
            materialize=materialize,
            include_unlabeled=include_unlabeled,
        )
        if path:
            reader = LabeledDatasetReader.from_export(path=path, **shared_options)
        else:
            reader = LabeledDatasetReader.from_DLS(
                dataset_id=dataset_id,
                compartment_id=compartment_id,
                **shared_options,
            )

        # Fetched once and attached to every data frame handed back to the caller.
        ds_info = reader.info()

        if not chunksize:
            frame = reader.read(format=format)
            frame.ads._info = ds_info
            return frame

        def _tagged_chunks():
            # Lazily reads the dataset chunk by chunk, tagging each chunk with the dataset info.
            for chunk in reader.read(format=format, chunksize=chunksize):
                chunk.ads._info = ds_info
                yield chunk

        return _tagged_chunks()



[docs]
    def render_bounding_box(
        self,
        options: Dict = None,
        content_column: str = "Content",
        annotations_column: str = "Annotations",
        categories: List[str] = None,
        limit: int = ROWS_TO_RENDER_LIMIT,
        path: str = None,
    ) -> None:
        """Renders bounding box dataset. Displays at most `limit` rows (50 by default).

        Parameters
        ----------
        options: dict
            The colors options specified for rendering.
        content_column: Optional[str]
            The column name with the content data.
        annotations_column: Optional[str]
            The column name for the annotations list.
        categories: Optional List[str]
            The list of object categories in proper order for model training.
            Only used when bounding box annotations are in YOLO format.
            Example: ['cat','dog','horse']
        limit: Optional[int]. Defaults to 50
            The maximum amount of records to display.
        path: Optional[str]
            Path to save the image with annotations to local directory.

        Returns
        -------
        None
            Nothing

        Examples
        --------
        >>> import pandas as pd
        >>> import ads
        >>> from ads.common import auth as authutil
        >>> df = pd.DataFrame.ads.read_labeled_data(path="path_to_your_metadata.jsonl",
        ...                                         auth=authutil.api_keys(),
        ...                                         materialize=True)
        >>> df.ads.render_bounding_box(content_column="Content", annotations_column="Annotations")
        """
        # Only slice when there are more rows than the limit; otherwise the
        # underlying object is passed through untouched.
        frame = self._obj
        if frame.shape[0] > limit:
            frame = frame.iloc[0:limit]

        bbox_items = image_visualizer._df_to_bbox_items(
            frame,
            content_column=content_column,
            annotations_column=annotations_column,
            categories=categories,
        )
        image_visualizer.render(bbox_items, options=options, path=path)



[docs]
    @runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
    def render_ner(
        self,
        options: Dict = None,
        content_column: str = "Content",
        annotations_column: str = "Annotations",
        limit: int = ROWS_TO_RENDER_LIMIT,
        return_html: bool = False,
    ) -> Optional[str]:
        """Renders NER dataset. Displays at most `limit` rows (50 by default).

        Parameters
        ----------
        options: dict
            The colors options specified for rendering.
        content_column: Optional[str]
            The column name with the content data.
        annotations_column: Optional[str]
            The column name for the annotations list.
        limit: Optional[int]. Defaults to 50
            The maximum amount of records to display.
        return_html: Optional[bool]. Defaults to False
            When True, the markup produced by the text visualizer is returned
            to the caller instead of being displayed.

        Returns
        -------
        Optional[str]
            The output of `text_visualizer.render` (expected to be an HTML
            string) when `return_html` is True. Otherwise None, and the
            rendered result is displayed in the notebook.

        Examples
        --------
        >>> import pandas as pd
        >>> import ads
        >>> from ads.common import auth as authutil
        >>> df = pd.DataFrame.ads.read_labeled_data(path="path_to_your_metadata.jsonl",
        ...                                         auth=authutil.api_keys(),
        ...                                         materialize=True)
        >>> df.ads.render_ner(content_column="Content", annotations_column="Annotations")
        >>> html = df.ads.render_ner(return_html=True)
        """
        items = text_visualizer._df_to_ner_items(
            # Slice only when the data has more rows than the limit.
            self._obj.iloc[0:limit] if self._obj.shape[0] > limit else self._obj,
            content_column=content_column,
            annotations_column=annotations_column,
        )
        result_html = text_visualizer.render(items=items, options=options)
        if return_html:
            return result_html

        # `IPython.display` is the public import location; importing `display`
        # from `IPython.core.display` has been deprecated since IPython 7.14.
        # The import stays local so it only runs when something is displayed.
        from IPython.display import Markdown, display

        display(Markdown(result_html))
        return None