Source code for ads.data_labeling.parser.export_metadata_parser

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from typing import Any, Dict

from ads.common.object_storage_details import ObjectStorageDetails
from ads.data_labeling.interface.parser import Parser
from ads.data_labeling.metadata import Metadata


[docs] class MetadataParser(Parser): """MetadataParser class which parses the metadata from the record.""" EXPECTED_KEYS = [ "id", "compartmentId", "displayName", "labelsSet", "annotationFormat", "datasetSourceDetails", "datasetFormatDetails", ]
[docs] @staticmethod def parse(json_data: Dict[Any, Any]) -> "Metadata": """Parses the metadata jsonl file. Parameters ---------- json_data: dict dictionary format of the metadata jsonl file content. Returns ------- Metadata Metadata object which contains the useful fields from the metadata jsonl file """ MetadataParser._validate(json_data) source_path = ObjectStorageDetails( json_data["datasetSourceDetails"]["bucket"], json_data["datasetSourceDetails"]["namespace"], json_data["datasetSourceDetails"]["prefix"], ).path records_path = "" if "recordFiles" in json_data: records_path = ObjectStorageDetails( json_data["recordFiles"][0]["bucket"], json_data["recordFiles"][0]["namespace"], json_data["recordFiles"][0]["path"], ).path return Metadata( source_path=source_path, records_path=records_path, labels=[clss["name"] for clss in json_data["labelsSet"]], dataset_name=json_data["displayName"], compartment_id=json_data["compartmentId"], dataset_id=json_data["id"], annotation_type=json_data["annotationFormat"], dataset_type=json_data["datasetFormatDetails"]["formatType"], )
@staticmethod def _validate(json_data: Dict[Any, Any]) -> None: """Validates the metadata jsonl file to ensure it contains certain fields. Parameters ---------- json_data: dict dictionary format of the metadata jsonl file content. """ def invalid_message(param): return ( f"The dataset metadata file is invalid. The field '{param}' is required but it is missing. " + "Update the metadata file or use the `DataLabeling.export()` method " + "to create a new dataset metadata file." ) for k in MetadataParser.EXPECTED_KEYS: if k not in json_data: raise ValueError(f"{invalid_message(k)}") expected_list_format = ["labelsSet", "recordFiles"] for k in expected_list_format: if k in json_data and not isinstance(json_data[k], list): raise ValueError(f"{invalid_message(k)}") expected_dict_format = ["datasetSourceDetails", "datasetFormatDetails"] for k in expected_dict_format: if not isinstance(json_data[k], dict): raise ValueError(f"{invalid_message(k)}")