Source code for ads.data_labeling.reader.jsonl_reader
#!/usr/bin/env python
# -*- coding: utf-8; -*-
# Copyright (c) 2021, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
import json
from typing import Any, Dict, Generator
import fsspec
from ads.data_labeling.interface.reader import Reader
from ads.common import auth as authutil
[docs]class JsonlReader(Reader):
"""JsonlReader class which reads the file."""
def __init__(self, path: str, auth: Dict = None, encoding="utf-8") -> "JsonlReader":
"""Initiates a JsonlReader object.
Parameters
----------
path: str
object storage path or local path for a file.
auth: (dict, optional). Defaults to None.
The default authetication is set using `ads.set_auth` API. If you need to override the
default, use the `ads.common.auth.api_keys` or `ads.common.auth.resource_principal` to create appropriate
authentication signer and kwargs required to instantiate IdentityClient object.
encoding : (str, optional). Defaults to 'utf-8'.
Encoding of files. Only used for "TEXT" dataset.
Examples
--------
>>> from ads.data_labeling.reader.jsonl_reader import JsonlReader
>>> path = "your/path/to/jsonl/file.jsonl"
>>> from ads.common import auth as authutil
>>> reader = JsonlReader(path=path, auth=authutil.api_keys(), encoding="utf-8")
>>> next(reader.read())
"""
self.path = path
self.auth = auth or authutil.default_signer()
self.encoding = encoding
[docs] def read(self, skip: int = None) -> Generator[Dict, Any, Any]:
"""Reads and yields the content of the file.
Parameters
----------
skip: (int, optional). Defaults to None.
The number of records that should be skipped.
Returns
-------
Generator[Dict, Any, Any]
The content of the file.
Raises
------
ValueError
If `skip` not empty and not a positive integer.
FileNotFoundError
When file not found.
"""
if skip and (not isinstance(skip, int) or skip < 1):
raise ValueError("The parameter `skip` must be a positive integer.")
try:
line_number = 0
with fsspec.open(self.path, "r", encoding=self.encoding, **self.auth) as f:
for line in f:
line_number += 1
if skip and line_number <= skip:
continue
yield json.loads(line)
except FileNotFoundError:
raise FileNotFoundError(f"Path ({self.path}) not found.")