Source code for ads.common.data

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import numpy as np
from ads.common.utils import _is_dask_dataframe, _is_dask_series



[docs]
class ADSData(object):
    def __init__(self, X=None, y=None, name="", dataset_type=None):
        r"""
        This class wraps the input dataframe to various models, evaluation, and explanation frameworks.
        It's primary purpose is to hold any metadata relevant to these tasks. This can include it's:

        - X - the independent variables as some dataframe-like structure,
        - y - the dependent variable or target column as some array-like structure,
        - name - a string to name the data for user convenience,
        - dataset_type - the type of the X value.

        As part of this initiative, ADSData knows how to turn itself into an onnxruntime compatible data
        structure with the method .to_onnxrt(), which takes and onnx session as input.

        Parameters
        ----------
        X : Union[pandas.DataFrame, dask.DataFrame, numpy.ndarray, scipy.sparse.csr.csr_matrix]
            If str, URI for the dataset. The dataset could be read from local or network file system, hdfs, s3 and gcs
            Should be none if X_train, y_train, X_test, Y_test are provided
        y: Union[str, pandas.DataFrame, dask.DataFrame, pandas.Series, dask.Series, numpy.ndarray]
            If str, name of the target in X, otherwise series of labels corresponding to X
        name: str, optional
            Name to identify this data
        dataset_type: ADSDataset optional
            When this value is available, would be used to evaluate the ads task type
        kwargs:
            Additional keyword arguments that would be passed to the underlying Pandas read API.
        """
        self.X = X
        self.y = y
        self.name = name
        self.dataset_type = dataset_type


[docs]
    @staticmethod
    def build(X=None, y=None, name="", dataset_type=None, **kwargs):
        r"""
        Returns an ADSData object built from the (source, target) or (X,y)

        Parameters
        ----------
        X : Union[pandas.DataFrame, dask.DataFrame, numpy.ndarray, scipy.sparse.csr.csr_matrix]
            If str, URI for the dataset. The dataset could be read from local or network file system, hdfs, s3 and gcs
            Should be none if X_train, y_train, X_test, Y_test are provided
        y: Union[str, pandas.DataFrame, dask.DataFrame, pandas.Series, dask.Series, numpy.ndarray]
            If str, name of the target in X, otherwise series of labels corresponding to X
        name: str, optional
            Name to identify this data
        dataset_type: ADSDataset, optional
            When this value is available, would be used to evaluate the ads task
            type
        kwargs:
            Additional keyword arguments that would be passed to the underlying Pandas read API.

        Returns
        -------
        ads_data: ads.common.data.ADSData
            A built ADSData object

        Examples
        --------
        >>> data = open_csv("my.csv")

        >>> data_ads = ADSData(data, 'target').build(data, 'target')
        """
        if X is None or y is None:
            raise ValueError("Both X and y are required.")
        if _is_dask_dataframe(X):
            X = X.compute()
        if _is_dask_series(y):
            y = y.compute()
        if dataset_type is None:
            dataset_type = type(X)
        if isinstance(y, str):
            try:
                return ADSData(
                    X.drop(y, axis=1), X[y], name=name, dataset_type=dataset_type
                )
            except AttributeError:
                raise ValueError(
                    "If y is a string, then X must be a pandas or dask dataframe"
                )
        else:
            return ADSData(X, y, name=name, dataset_type=dataset_type)


    def __repr__(self):
        return "%sShape of X:%s\nShape of y:%s" % (
            self.name + "\n",
            str(self.X.shape),
            str(self.y.shape),
        )


[docs]
    def to_onnxrt(
        self, sess, idx_range=None, model=None, impute_values={}, **kwargs
    ):  # pragma: no cover
        r"""
        Returns itself formatted as an input for the onnxruntime session inputs passed in.

        Parameters
        ----------
        sess: Session
            The session object
        idx_range: Range
            The range of inputs to convert to onnx
        model: SupportedModel
            A model that supports being serialized for the onnx runtime.
        kwargs: additional keyword arguments

            - sess_inputs - Pass in the output from onnxruntime.InferenceSession("model.onnx").get_inputs()
            - input_dtypes (list) - If sess_inputs cannot be passed in, pass in the numpy dtypes of each input
            - input_shapes (list) - If sess_inputs cannot be passed in, pass in the shape of each input
            - input_names (list) -If sess_inputs cannot be passed in, pass in the name of each input

        Returns
        -------
        ort: Array
            array of inputs formatted for the given session.
        """
        if model._underlying_model in ["torch"]:
            sess_inputs = sess.get_inputs()
            in_shape, in_name, in_type = [], [], []
            for i, ftr in enumerate(sess_inputs):
                in_type.append(ftr.type)
                in_shape.append(ftr.shape)
                in_name.append(ftr.name)
            ret = {}
            for i, name in enumerate(in_name):
                idx_range = (0, len(self.X)) if idx_range is None else idx_range
                batch_size = idx_range[1] - idx_range[0]
                ret[name] = (
                    self.X[:batch_size]
                    .reshape([batch_size] + list(self.X[:1].shape))
                    .detach()
                    .cpu()
                    .numpy()
                    .astype(np.float32)
                )
            return ret
        elif model._underlying_model in ["automl"]:
            X_trans = model._onnx_data_transformer(
                X=self.X, impute_values=impute_values
            )
            inputs = {}
            for idx, c in enumerate(X_trans.columns):
                inputs[sess.get_inputs()[idx].name] = (
                    X_trans[c]
                    .values.reshape((X_trans.shape[0], 1))
                    .astype(X_trans.dtypes[idx])
                )
            return inputs
        elif model._underlying_model in ["lightgbm", "xgboost", "sklearn"]:
            idx_range = (0, len(self.X)) if idx_range is None else idx_range
            inputs = []
            for name, row in self.X[idx_range[0] : idx_range[1]].iterrows():
                inputs.append(list(row))
            return {"input": inputs}