Source code for ads.data_labeling.loader.file_loader

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO
from typing import Any, Dict, List, Union

import fsspec
import PIL
from ads.common import auth as authutil
from ads.data_labeling.constants import DatasetType
from ads.data_labeling.interface.loader import Loader
from ads.text_dataset.dataset import TextDatasetFactory, backends
from PIL import Image

# Maximum number of worker threads `FileLoader.bulk_load` passes to its
# ThreadPoolExecutor when loading several files in parallel.
THREAD_POOL_MAX_WORKERS = 10


class FileLoader:
    """Base loader that reads the raw content of a file through fsspec.

    Attributes
    ----------
    auth: dict
        Keyword arguments forwarded to `fsspec.open` on every load. When no
        value is passed to the constructor, `ads.common.auth.default_signer()`
        is used (the default authentication is set using the `ads.set_auth`
        API). To override the default, use `ads.common.auth.api_keys` or
        `ads.common.auth.resource_principal` to create the appropriate
        authentication signer and kwargs.

    Examples
    --------
    >>> from ads.data_labeling.loader.file_loader import FileLoader
    >>> from ads.common import auth as authutil
    >>> path = "path/to/your_text_file.txt"
    >>> file_content = FileLoader(auth=authutil.api_keys()).load(path)
    """

    def __init__(self, auth: Dict = None) -> None:
        """Initiates a FileLoader instance.

        Parameters
        ----------
        auth: (dict, optional). Defaults to None.
            The default authentication is set using the `ads.set_auth` API.
            If you need to override the default, use
            `ads.common.auth.api_keys` or `ads.common.auth.resource_principal`
            to create the appropriate authentication signer and kwargs.
            A falsy value (None or an empty dict) falls back to
            `ads.common.auth.default_signer()`.
        """
        self.auth = auth or authutil.default_signer()

    def load(self, path: str, **kwargs) -> bytes:
        """Loads the file content from the path.

        Parameters
        ----------
        path: str
            The file path, can be local or object storage path.
        kwargs:
            Accepted for signature compatibility with subclasses; not used.

        Returns
        -------
        bytes
            The raw content returned by reading the opened file.

        Raises
        ------
        FileNotFoundError
            If the path does not exist. The message names the missing path
            and the original error is attached as the cause.
        """
        try:
            # `self.auth` is expanded into fsspec's storage options.
            with fsspec.open(path, **self.auth) as f:
                return f.read()
        except FileNotFoundError as e:
            raise FileNotFoundError(f"{path} not found.") from e

    def bulk_load(self, paths: List[str], **kwargs) -> Dict[str, Any]:
        """Loads the content of several files in parallel threads.

        Parameters
        ----------
        paths: List[str]
            The list of file paths, can be local or object storage paths.
            Anything that is empty or not a `list` yields an empty result.
        kwargs:
            Forwarded unchanged to every `self.load` call.

        Returns
        -------
        Dict[str, Any]
            The map between file path and file content. Entries are inserted
            in completion order; a path listed more than once appears once.

        Raises
        ------
        Exception
            Whatever `self.load` raised for any of the paths; it is re-raised
            when that task's result is collected.
        """
        if not paths or not isinstance(paths, list):
            return {}

        with ThreadPoolExecutor(max_workers=THREAD_POOL_MAX_WORKERS) as pool:
            # Key each future by itself so the originating path can be looked
            # up once the future completes.
            futures = {pool.submit(self.load, path, **kwargs): path for path in paths}
            return {futures[task]: task.result() for task in as_completed(futures)}
class TextFileLoader(FileLoader):
    """
    TextFileLoader class which loads text files.

    Examples
    --------
    >>> from ads.data_labeling import TextFileLoader
    >>> from ads.common import auth as authutil
    >>> path = "path/to/your_text_file.txt"
    >>> file_content = TextFileLoader(auth=authutil.api_keys()).load(path)
    """

    def load(
        self, path: str, backend: Union[str, backends.Base] = "default", **kwargs
    ) -> str:
        """Loads the content from the path.

        Parameters
        ----------
        path: str
            Text file path, can be local or object storage path. The file
            extension (lower-cased, without the dot) selects the text
            dataset format.
        backend: Union[str, backends.Base]
            Default to "default". Valid options are "default" and "tika" or
            ads.text_dataset.backends.Base, ads.text_dataset.backends.Tika
        kwargs:
            encoding: (str, optional). Defaults to 'utf-8'.
                Encoding for text files. Used only to extract the content of
                the text dataset contents.
            All keyword arguments are forwarded to `read_text`.

        Returns
        -------
        str
            Content of the text file.

        Raises
        ------
        FileNotFoundError
            If the path does not exist. The message names the missing path
            and the original error is attached as the cause.
        """
        # `file_format` rather than `format` so the builtin is not shadowed.
        file_format = os.path.splitext(path)[1].lstrip(".").lower()
        try:
            reader = TextDatasetFactory.format(file_format).backend(backend)
            # `read_text` returns an iterator; the content is the first
            # element of the first item it produces.
            return next(reader.read_text(path, storage_options=self.auth, **kwargs))[0]
        except FileNotFoundError as e:
            raise FileNotFoundError(f"{path} not found.") from e
class ImageFileLoader(FileLoader):
    """
    Loader that opens image files with Pillow.

    Examples
    --------
    >>> from ads.data_labeling import ImageFileLoader
    >>> from ads.common import auth as authutil
    >>> path = "path/to/image.png"
    >>> image = ImageFileLoader(auth=authutil.api_keys()).load(path)
    """

    def load(self, path: str, **kwargs) -> PIL.ImageFile.ImageFile:
        """Reads the file at `path` and opens it as an image.

        Parameters
        ----------
        path: str
            Image file path, can be local or object storage path.
        kwargs:
            Not used; nothing is forwarded to the base loader.

        Returns
        -------
        PIL.ImageFile.ImageFile
            Image opened by Pillow.
        """
        # The base loader returns the raw content; Pillow needs a file-like
        # object, so the content is wrapped in an in-memory buffer.
        raw_content = super().load(path=path)
        buffer = BytesIO(raw_content)
        return Image.open(buffer)
class FileLoaderFactory:
    """FileLoaderFactory class to create/register FileLoaders."""

    # Registry mapping a dataset type to the loader class that handles it.
    # `loader` reads from it and `register` writes to it.
    _loaders = {
        DatasetType.TEXT: TextFileLoader,
        DatasetType.IMAGE: ImageFileLoader,
        DatasetType.DOCUMENT: FileLoader,
    }

    @staticmethod
    def loader(dataset_type: str, auth: Dict = None) -> FileLoader:
        """Gets the loader based on the dataset_type.

        Parameters
        ----------
        dataset_type: str
            Dataset type. TEXT, IMAGE and DOCUMENT are registered by default.
        auth: (dict, optional). Defaults to None.
            The default authentication is set using the `ads.set_auth` API.
            If you need to override the default, use
            `ads.common.auth.api_keys` or `ads.common.auth.resource_principal`
            to create the appropriate authentication signer and kwargs.
            Passed unchanged to the loader's constructor.

        Returns
        -------
        FileLoader
            A new instance of the loader class registered for dataset_type.

        Raises
        ------
        ValueError
            If no loader is registered for dataset_type.
        """
        if dataset_type not in FileLoaderFactory._loaders:
            raise ValueError(
                f"The wrong dataset type has been provided. "
                f"Supported dataset types are: `{DatasetType.TEXT}`, "
                f"`{DatasetType.IMAGE}` and `{DatasetType.DOCUMENT}`."
            )
        return FileLoaderFactory._loaders[dataset_type](auth=auth)

    @classmethod
    def register(cls, dataset_type: str, loader: Loader) -> None:
        """Registers a new loader for a given dataset_type.

        An existing registration for the same dataset_type is replaced.

        Parameters
        ----------
        dataset_type: str
            Dataset type to associate with the loader.
        loader: Loader
            A Loader class which supports loading content of the given
            dataset_type. `FileLoaderFactory.loader` instantiates it with an
            `auth` keyword argument.

        Returns
        -------
        None
            Nothing.
        """
        # Must write to `_loaders`, the registry `loader` looks up; the class
        # defines no `_parsers` attribute.
        cls._loaders[dataset_type] = loader