Source code for ads.data_labeling.mixin.data_labeling

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from typing import Dict, List, Optional

from ads.common import auth as authutil
from ads.common.decorator.runtime_dependency import (
    OptionalDependency,
    runtime_dependency,
)
from ads.data_labeling.reader.dataset_reader import LabeledDatasetReader
from ads.data_labeling.visualizer import image_visualizer, text_visualizer

# Default value of the `limit` argument of the render_* methods below: the
# maximum number of DataFrame rows handed to the visualizers in one call.
ROWS_TO_RENDER_LIMIT = 50


class DataLabelingAccessMixin:
    """Mixin with Data Labeling helpers for labeled text and image datasets.

    The instance methods read the wrapped data from ``self._obj`` and use its
    ``iloc`` and ``shape`` attributes, so the host class is expected to expose
    a pandas ``DataFrame``-like object under that attribute.
    """

    @staticmethod
    def read_labeled_data(
        path: Optional[str] = None,
        dataset_id: Optional[str] = None,
        compartment_id: Optional[str] = None,
        auth: Optional[Dict] = None,
        materialize: bool = False,
        encoding: str = "utf-8",
        include_unlabeled: bool = False,
        format: Optional[str] = None,
        chunksize: Optional[int] = None,
    ):
        """Loads the dataset generated by data labeling service from either
        the export file or the Data Labeling Service.

        Exactly one of `path` and `dataset_id` must be provided.

        Parameters
        ----------
        path: (str, optional). Defaults to None
            The export file path, can be either local or object storage path.
        dataset_id: (str, optional). Defaults to None
            The dataset OCID.
        compartment_id: (str, optional). Defaults to None
            The compartment OCID of the dataset. Only forwarded to the reader
            when loading by `dataset_id`; when omitted, the compartment is
            expected to be resolved from the environment by the reader.
        auth: (dict, optional). Defaults to None
            The default authentication is set using `ads.set_auth` API. If you
            need to override the default, use the `ads.common.auth.api_keys`
            or `ads.common.auth.resource_principal` to create appropriate
            authentication signer and kwargs required to instantiate
            IdentityClient object.
        materialize: (bool, optional). Defaults to False
            Whether the content of the dataset file should be loaded or it
            should return the file path to the content. By default the content
            will not be loaded.
        encoding: (str, optional). Defaults to 'utf-8'
            Encoding of files. Only used for "TEXT" dataset.
        include_unlabeled: (bool, optional). Defaults to False
            Whether to load the unlabeled records or not.
        format: (str, optional). Defaults to None
            Output format of annotations. Can be None, "spacy" for dataset
            Entity Extraction type or "yolo" for Object Detection type.

            - When None, it outputs List[NERItem] or List[BoundingBoxItem],
            - When "spacy", it outputs List[Tuple],
            - When "yolo", it outputs List[List[Tuple]].
        chunksize: (int, optional). Defaults to None
            The amount of records that should be read in one iteration.
            The result will be returned in a generator format.

        Returns
        -------
        Union[Generator[pd.DataFrame, Any, Any], pd.DataFrame]
            `pd.DataFrame` if `chunksize` is not specified.
            `Generator[pd.DataFrame]` if `chunksize` is specified.

        Raises
        ------
        ValueError
            If neither or both of `path` and `dataset_id` are provided.

        Examples
        --------
        >>> import pandas as pd
        >>> import ads
        >>> from ads.common import auth as authutil
        >>> df = pd.DataFrame.ads.read_labeled_data(path="path_to_your_metadata.jsonl",
        ...                                         auth=authutil.api_keys(),
        ...                                         materialize=False)
                                    Path       Content     Annotations
            --------------------------------------------------------------------
            0   path/to/the/content/file                           yes
            1   path/to/the/content/file                            no

        >>> df = pd.DataFrame.ads.read_labeled_data(dataset_id="your_dataset_ocid",
        ...                                         compartment_id="your_compartment_id",
        ...                                         auth=authutil.api_keys(),
        ...                                         materialize=False)
                                    Path       Content     Annotations
            --------------------------------------------------------------------
            0   path/to/the/content/file                           yes
            1   path/to/the/content/file                            no
        """
        auth = auth or authutil.default_signer()

        if not path and not dataset_id:
            raise ValueError("Either `path` or the `dataset_id` need to be provided.")

        if path and dataset_id:
            raise ValueError("Only one of `path` and `dataset_id` need to be provided.")

        if path:
            reader = LabeledDatasetReader.from_export(
                path=path,
                auth=auth,
                encoding=encoding,
                materialize=materialize,
                include_unlabeled=include_unlabeled,
            )
        else:
            reader = LabeledDatasetReader.from_DLS(
                dataset_id=dataset_id,
                compartment_id=compartment_id,
                auth=auth,
                encoding=encoding,
                materialize=materialize,
                include_unlabeled=include_unlabeled,
            )

        # Dataset-level info is fetched once and attached to every DataFrame
        # handed back to the caller (each chunk included).
        ds_info = reader.info()

        if chunksize:

            def _bulk_read():
                for df in reader.read(format=format, chunksize=chunksize):
                    df.ads._info = ds_info
                    yield df

            return _bulk_read()

        df = reader.read(format=format)
        df.ads._info = ds_info
        return df

    def render_bounding_box(
        self,
        options: Optional[Dict] = None,
        content_column: str = "Content",
        annotations_column: str = "Annotations",
        categories: Optional[List[str]] = None,
        limit: int = ROWS_TO_RENDER_LIMIT,
        path: Optional[str] = None,
    ) -> None:
        """Renders bounding box dataset. Displays at most `limit` rows
        (the first 50 by default).

        Parameters
        ----------
        options: (dict, optional). Defaults to None
            The colors options specified for rendering.
        content_column: (str, optional). Defaults to "Content"
            The column name with the content data.
        annotations_column: (str, optional). Defaults to "Annotations"
            The column name for the annotations list.
        categories: (List[str], optional). Defaults to None
            The list of object categories in proper order for model training.
            Only used when bounding box annotations are in YOLO format.
            Example: ['cat','dog','horse']
        limit: (int, optional). Defaults to 50
            The maximum amount of records to display.
        path: (str, optional). Defaults to None
            Path to save the image with annotations to local directory.

        Returns
        -------
        None
            Nothing

        Examples
        --------
        >>> import pandas as pd
        >>> import ads
        >>> from ads.common import auth as authutil
        >>> df = pd.DataFrame.ads.read_labeled_data(path="path_to_your_metadata.jsonl",
        ...                                         auth=authutil.api_keys(),
        ...                                         materialize=True)
        >>> df.ads.render_bounding_box(content_column="Content", annotations_column="Annotations")
        """
        # Only slice when the frame is longer than the limit; otherwise the
        # wrapped object is passed through untouched.
        rows = self._obj.iloc[0:limit] if self._obj.shape[0] > limit else self._obj
        items = image_visualizer._df_to_bbox_items(
            rows,
            content_column=content_column,
            annotations_column=annotations_column,
            categories=categories,
        )
        image_visualizer.render(items, options=options, path=path)

    @runtime_dependency(module="IPython", install_from=OptionalDependency.NOTEBOOK)
    def render_ner(
        self,
        options: Optional[Dict] = None,
        content_column: str = "Content",
        annotations_column: str = "Annotations",
        limit: int = ROWS_TO_RENDER_LIMIT,
        return_html: bool = False,
    ) -> Optional[str]:
        """Renders NER dataset. Displays at most `limit` rows
        (the first 50 by default).

        Parameters
        ----------
        options: (dict, optional). Defaults to None
            The colors options specified for rendering.
        content_column: (str, optional). Defaults to "Content"
            The column name with the content data.
        annotations_column: (str, optional). Defaults to "Annotations"
            The column name for the annotations list.
        limit: (int, optional). Defaults to 50
            The maximum amount of records to display.
        return_html: (bool, optional). Defaults to False
            When True, the rendered markup is returned to the caller instead
            of being displayed.

        Returns
        -------
        Optional[str]
            The output of `text_visualizer.render` (presumably an HTML
            string) when `return_html` is True, otherwise None.

        Examples
        --------
        >>> import pandas as pd
        >>> import ads
        >>> from ads.common import auth as authutil
        >>> df = pd.DataFrame.ads.read_labeled_data(path="path_to_your_metadata.jsonl",
        ...                                         auth=authutil.api_keys(),
        ...                                         materialize=True)
        >>> df.ads.render_ner(content_column="Content", annotations_column="Annotations")
        """
        # Only slice when the frame is longer than the limit; otherwise the
        # wrapped object is passed through untouched.
        rows = self._obj.iloc[0:limit] if self._obj.shape[0] > limit else self._obj
        items = text_visualizer._df_to_ner_items(
            rows,
            content_column=content_column,
            annotations_column=annotations_column,
        )
        result_html = text_visualizer.render(items=items, options=options)

        if return_html:
            return result_html

        # Imported lazily so IPython is only touched when something is
        # actually displayed. `IPython.display` is the public module; importing
        # `display` from `IPython.core.display` is deprecated.
        from IPython.display import Markdown, display

        display(Markdown(result_html))