ads.text_dataset package#

Submodules#

ads.text_dataset.backends module#

class ads.text_dataset.backends.Base[source]#

Bases: object

Base class for backends.

convert_to_text(fhandler: OpenFile, dst_path: str, fname: str | None = None, storage_options: Dict | None = None) str[source]#

Convert input file to a text file

Parameters:
  • fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

  • dst_path (str) – local folder or cloud storage prefix to save converted text files

  • fname (str, optional) – filename for converted output, relative to dirname or prefix, by default None

  • storage_options (dict, optional) – storage options for cloud storage

Returns:

path to saved output

Return type:

str

get_metadata(fhandler: OpenFile) Dict[source]#

Get metadata of a file.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Returns:

dictionary of metadata

Return type:

dict

read_line(fhandler: OpenFile) Generator[str | List[str], None, None][source]#

Read lines from a file.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Yields:

Generator – a generator that yields lines

read_text(fhandler: OpenFile) Generator[str | List[str], None, None][source]#

Read entire file into a string.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Yields:

Generator – a generator that yields text in the file

class ads.text_dataset.backends.PDFPlumber[source]#

Bases: Base

convert_to_text(fhandler, dst_path, fname=None, storage_options=None)[source]#

Convert input file to a text file

Parameters:
  • fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

  • dst_path (str) – local folder or cloud storage prefix to save converted text files

  • fname (str, optional) – filename for converted output, relative to dirname or prefix, by default None

  • storage_options (dict, optional) – storage options for cloud storage

Returns:

path to saved output

Return type:

str

get_metadata(fhandler)[source]#

Get metadata of a file.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Returns:

dictionary of metadata

Return type:

dict

read_line(fhandler)[source]#

Read lines from a file.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Yields:

Generator – a generator that yields lines

read_text(fhandler)[source]#

Read entire file into a string.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Yields:

Generator – a generator that yields text in the file

class ads.text_dataset.backends.Tika[source]#

Bases: Base

convert_to_text(fhandler, dst_path, fname=None, storage_options=None)[source]#

Convert input file to a text file

Parameters:
  • fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

  • dst_path (str) – local folder or cloud storage prefix to save converted text files

  • fname (str, optional) – filename for converted output, relative to dirname or prefix, by default None

  • storage_options (dict, optional) – storage options for cloud storage

Returns:

path to saved output

Return type:

str

detect_encoding(fhandler: OpenFile)[source]#
get_metadata(fhandler)[source]#

Get metadata of a file.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Returns:

dictionary of metadata

Return type:

dict

read_line(fhandler)[source]#

Read lines from a file.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Yields:

Generator – a generator that yields lines

read_text(fhandler)[source]#

Read entire file into a string.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Yields:

Generator – a generator that yields text in the file

ads.text_dataset.dataset module#

class ads.text_dataset.dataset.DataLoader(engine: str | None = None)[source]#

Bases: object

DataLoader binds engine, FileProcessor and file handler (in this case it is fsspec) together to produce a dataframe of parsed text from files.

This class is expected to be used mainly from TextDatasetFactory class.

processor#

processor that is used for loading data.

Type:

ads.text_dataset.extractor.FileProcessor

Examples

>>> import oci
>>> from ads.text_dataset.dataset import TextDatasetFactory as textfactory
>>> from ads.text_dataset.options import Options
>>> df = textfactory.format('pdf').engine('pandas').read_line(
...     'oci://<bucket-name>@<namespace>/<path>/*.pdf',
...     storage_options={"config": oci.config.from_file(os.path.join("~/.oci", "config"))},
... )
>>> data_gen = textfactory.format('pdf').option(Options.FILE_NAME).backend('pdfplumber').read_text(
...     'oci://<bucket-name>@<namespace>/<path>/*.pdf',
...     storage_options={"config": oci.config.from_file(os.path.join("~/.oci", "config"))},
... )
>>> textfactory.format('docx').convert_to_text(
...     'oci://<bucket-name>@<namespace>/<path>/*.docx',
...     './extracted',
...     storage_options={"config": oci.config.from_file(os.path.join("~/.oci", "config"))},
... )
>>> textfactory.format('docx').convert_to_text(
...     'oci://<bucket-name>@<namespace>/<path>/*.docx',
...     'oci://<bucket-name>@<namespace>/<out_path>',
...     storage_options={"config": oci.config.from_file(os.path.join("~/.oci", "config"))},
... )
>>> meta_gen = textfactory.format('docx').metadata_schema(
...     'oci://<bucket-name>@<namespace>/papers/*.pdf',
...     storage_options={"config": oci.config.from_file(os.path.join("~/.oci", "config"))},
... )
>>> df = textfactory.format('pdf').engine('pandas').option(Options.FILE_METADATA, {'extract': ['Author']}).read_text(
...     'oci://<bucket-name>@<namespace>/<path>/*.pdf',
...     storage_options={"config": oci.config.from_file(os.path.join("~/.oci", "config"))},
...     total_files=10,
... )
>>> df = textfactory.format('txt').engine('cudf').read_line(
...     'oci://<bucket-name>@<namespace>/<path>/*.log',
...      udf=r'^\[(\S+)\s(\S+)\s(\d+)\s(\d+\:\d+\:\d+)\s(\d+)]\s(\S+)\s(\S+)\s(\S+)\s(\S+)',
...      df_args={"columns":["day", "month", "date", "time", "year", "type", "method", "status", "file"]},
...      n_lines_per_file=10,
... )

Initialize a DataLoader object.

Parameters:

engine (str, optional) – dataframe engine, by default None.

Return type:

None

backend(backend: str | Base) None[source]#

Set backend used for extracting text from files.

Parameters:

backend ((str | ads.text_dataset.backends.Base)) – backend for extracting text from raw files.

Return type:

None

convert_to_text(src_path: str, dst_path: str, encoding: str = 'utf-8', storage_options: Dict | None = None) None[source]#

Convert files to plain text files.

Parameters:
  • src_path (str) – path to source data file(s). can use glob pattern

  • dst_path (str) – local folder or cloud storage (e.g., OCI object storage) prefix to save converted text files

  • encoding (str, optional) – encoding for files, by default utf-8

  • storage_options (Dict, optional) – storage options for cloud storage, by default None

Return type:

None

engine(eng: str) None[source]#

Set engine for dataloader. Can be pandas or cudf.

Parameters:

eng (str) – name of engine

Return type:

None

Raises:

NotSupportedError – raises error if engine passed in is not supported.

metadata_all(path: str, storage_options: Dict | None = None, encoding: str = 'utf-8') Generator[Dict[str, Any], None, None][source]#

Get metadata of all files that matches the given path. Return a generator.

Parameters:
  • path (str) – path to data files. can use glob pattern.

  • storage_options (Dict, optional) – storage options for cloud storage, by default None

  • encoding (str, optional) – encoding of files, by default ‘utf-8’

Returns:

generator of extracted metadata from files.

Return type:

Generator

metadata_schema(path: str, n_files: int = 1, storage_options: Dict | None = None, encoding: str = 'utf-8') List[str][source]#

Get available fields in metadata by looking at the first n_files that matches the given path.

Parameters:
  • path (str) – path to data files. can have glob pattern

  • n_files (int, optional) – number of files to look up, default to be 1

  • storage_options (dict, optional) – storage options for cloud storage, by default None

  • encoding (str, optional) – encoding of files, by default utf-8

Returns:

list of available fields in metadata

Return type:

List[str]

option(opt: Options, spec: Any | None = None) None[source]#

Set extraction options.

Parameters:
  • opt (ads.text_dataset.options.Options) – an option defined in ads.text_dataset.options.Options

  • spec (Any, optional) – specifications that will be passed to option handler, by default None

Return type:

None

read_line(path: str, udf: str | Callable = None, n_lines_per_file: int = None, total_lines: int = None, df_args: Dict = None, storage_options: Dict = None, encoding: str = 'utf-8') Generator[str | List[str], None, None] | DataFrame[source]#

Read each file into lines. If path matches multiple files, will combine lines from all files.

Parameters:
  • path (str) – path to data files. can have glob pattern.

  • udf ((callable | str), optional) – user defined function for processing each line, can be a callable or regex, by default None

  • n_lines_per_file (int, optional) – max number of lines read from each file, by default None

  • total_lines (int, optional) – max number of lines read from all files, by default None

  • df_args (dict, optional) – arguments passed to dataframe engine (e.g. pandas), by default None

  • storage_options (dict, optional) – storage options for cloud storage, by default None

  • encoding (str, optional) – encoding of files, by default ‘utf-8’

Returns:

returns either a data generator or a dataframe.

Return type:

(Generator | DataFrame)

read_text(path: str, udf: str | Callable = None, total_files: int = None, storage_options: Dict = None, df_args: Dict = None, encoding: str = 'utf-8') Generator[str | List[str], None, None] | DataFrame[source]#

Read each file into a text string. If path matches multiple files, each file corresponds to one record.

Parameters:
  • path (str) – path to data files. can have glob pattern.

  • udf ((callable | str), optional) – user defined function for processing the text content of each file, can be a callable or regex, by default None

  • total_files (int, optional) – max number of files to read, by default None

  • df_args (dict, optional) – arguments passed to dataframe engine (e.g. pandas), by default None

  • storage_options (dict, optional) – storage options for cloud storage, by default None

  • encoding (str, optional) – encoding of files, by default ‘utf-8’

Returns:

returns either a data generator or a dataframe.

Return type:

(Generator | DataFrame)

with_processor(processor_type: str) None[source]#

Set file processor.

Parameters:

processor_type (str) – type of processor, which corresponds to format of the file.

Return type:

None

class ads.text_dataset.dataset.TextDatasetFactory[source]#

Bases: object

A class that generates a dataloader given a file format.

static format(format_name: str) DataLoader[source]#

Instantiates DataLoader class and seeds it with the right kind of FileProcessor, e.g. PDFProcessor for pdf. The FileProcessorFactory returns the processor based on the format type.

Parameters:

format_name (str) – name of format

Returns:

a DataLoader object.

Return type:

ads.text_dataset.dataset.DataLoader

ads.text_dataset.extractor module#

class ads.text_dataset.extractor.FileProcessor(backend: str | Base = 'default')[source]#

Bases: object

Base class for all file processors. Files are opened using the fsspec library. The default implementation in the base class assumes text files.

This class is expected to be used inside ads.text_dataset.dataset.DataLoader.

backend(backend: str | Base) None[source]#

Set backend for file processor.

Parameters:

backend (ads.text_dataset.backends.Base) – a backend for file processor

Return type:

None

Raises:

NotSupportedError – when specified backend is not supported.

backend_map = {'default': <class 'ads.text_dataset.backends.Base'>, 'tika': <class 'ads.text_dataset.backends.Tika'>}#
convert_to_text(fhandler: OpenFile, dst_path: str, fname: str | None = None, storage_options: Dict | None = None) str[source]#

Convert input file to a text file.

Parameters:
  • fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

  • dst_path (str) – local folder or cloud storage (e.g. OCI object storage) prefix to save converted text files

  • fname (str, optional) – filename for converted output, relative to dirname or prefix, by default None

  • storage_options (dict, optional) – storage options for cloud storage, by default None

Returns:

path to saved output

Return type:

str

get_metadata(fhandler: OpenFile) Dict[source]#

Get metadata of a file.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Returns:

dictionary of metadata

Return type:

dict

read_line(fhandler: OpenFile, **format_reader_kwargs: Dict) Generator[str | List[str], None, None][source]#

Yields lines from a file.

Parameters:

fhandler (fsspec.core.OpenFile) – file handler returned by fsspec

Returns:

a generator that yields lines from a file

Return type:

Generator

read_text(fhandler: OpenFile, **format_reader_kwargs: Dict) Generator[str | List[str], None, None][source]#

Yield contents from the entire file.

Parameters:

fhandler (fsspec.core.OpenFile) – a file handler returned by fsspec

Returns:

a generator that yield text from a file

Return type:

Generator

class ads.text_dataset.extractor.FileProcessorFactory[source]#

Bases: object

Factory that manages all file processors. Provides functionality to get a processor corresponding to a given file type, or register custom processor for a specific file format.

Examples

>>> from ads.text_dataset.extractor import FileProcessor, FileProcessorFactory
>>> FileProcessorFactory.get_processor('pdf')
>>> class CustomProcessor(FileProcessor):
... # custom logic here
... pass
>>> FileProcessorFactory.register('new_format', CustomProcessor)
static get_processor(format)[source]#
processor_map = {'doc': <class 'ads.text_dataset.extractor.WordProcessor'>, 'docx': <class 'ads.text_dataset.extractor.WordProcessor'>, 'pdf': <class 'ads.text_dataset.extractor.PDFProcessor'>, 'txt': <class 'ads.text_dataset.extractor.FileProcessor'>}#
classmethod register(fmt: str, processor: FileProcessor) None[source]#

Register custom file processor for a file format.

Parameters:
  • fmt (str) – file format

  • processor (FileProcessor) – custom processor

Raises:

TypeError – raised when processor is not a subclass of FileProcessor.

class ads.text_dataset.extractor.PDFProcessor(backend: str | Base = 'default')[source]#

Bases: FileProcessor

Extracts text content from PDF

backend_map = {'default': <class 'ads.text_dataset.backends.Tika'>, 'pdfplumber': <class 'ads.text_dataset.backends.PDFPlumber'>, 'tika': <class 'ads.text_dataset.backends.Tika'>}#
class ads.text_dataset.extractor.WordProcessor(backend: str | Base = 'default')[source]#

Bases: FileProcessor

Extracts text content from doc or docx format.

backend_map = {'default': <class 'ads.text_dataset.backends.Tika'>, 'tika': <class 'ads.text_dataset.backends.Tika'>}#

ads.text_dataset.options module#

class ads.text_dataset.options.FileOption(dataloader: ads.text_dataset.dataset.DataLoader)[source]#

Bases: OptionHandler

handle(fhandler: OpenFile, spec: Any) Any[source]#
class ads.text_dataset.options.MetadataOption(dataloader: ads.text_dataset.dataset.DataLoader)[source]#

Bases: OptionHandler

handle(fhandler: OpenFile, spec: Dict) List[source]#
class ads.text_dataset.options.OptionFactory[source]#

Bases: object

static option_handler(option: Options) OptionHandler[source]#
option_handlers = {<Options.FILE_NAME: 1>: <class 'ads.text_dataset.options.FileOption'>, <Options.FILE_METADATA: 2>: <class 'ads.text_dataset.options.MetadataOption'>}#
classmethod register_option(option: Options, handler) None[source]#
class ads.text_dataset.options.OptionHandler(dataloader: ads.text_dataset.dataset.DataLoader)[source]#

Bases: object

handle(fhandler: OpenFile, spec: Any) Any[source]#
class ads.text_dataset.options.Options(value)[source]#

Bases: Enum

An enumeration.

FILE_METADATA = 2#
FILE_NAME = 1#

ads.text_dataset.udfs module#

class ads.text_dataset.udfs.UDF[source]#

Bases: object

static from_regex(regex: str) Callable[source]#

ads.text_dataset.utils module#

exception ads.text_dataset.utils.NotSupportedError[source]#

Bases: Exception

class ads.text_dataset.utils.PY4JGateway[source]#

Bases: object

ads.text_dataset.utils.experimental(cls)[source]#

Module contents#