Source code for ads.common.data

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import numpy as np
from ads.common.utils import _is_dask_dataframe, _is_dask_series


class ADSData(object):
    def __init__(self, X=None, y=None, name="", dataset_type=None):
        r"""
        This class wraps the input dataframe to various models, evaluation,
        and explanation frameworks. Its primary purpose is to hold any
        metadata relevant to these tasks. This can include its:

        - X - the independent variables as some dataframe-like structure,
        - y - the dependent variable or target column as some array-like structure,
        - name - a string to name the data for user convenience,
        - dataset_type - the type of the X value.

        As part of this initiative, ADSData knows how to turn itself into an
        onnxruntime compatible data structure with the method .to_onnxrt(),
        which takes an onnx session as input.

        Parameters
        ----------
        X : Union[pandas.DataFrame, dask.DataFrame, numpy.ndarray, scipy.sparse.csr.csr_matrix]
            If str, URI for the dataset. The dataset could be read from local or
            network file system, hdfs, s3 and gcs. Should be none if X_train,
            y_train, X_test, Y_test are provided.
        y : Union[str, pandas.DataFrame, dask.DataFrame, pandas.Series, dask.Series, numpy.ndarray]
            If str, name of the target in X, otherwise series of labels
            corresponding to X.
        name : str, optional
            Name to identify this data.
        dataset_type : ADSDataset, optional
            When this value is available, it would be used to evaluate the ads
            task type.
        """
        self.X = X
        self.y = y
        self.name = name
        self.dataset_type = dataset_type

    @staticmethod
    def build(X=None, y=None, name="", dataset_type=None, **kwargs):
        r"""
        Returns an ADSData object built from the (source, target) or (X, y).

        Parameters
        ----------
        X : Union[pandas.DataFrame, dask.DataFrame, numpy.ndarray, scipy.sparse.csr.csr_matrix]
            If str, URI for the dataset. The dataset could be read from local or
            network file system, hdfs, s3 and gcs. Should be none if X_train,
            y_train, X_test, Y_test are provided.
        y : Union[str, pandas.DataFrame, dask.DataFrame, pandas.Series, dask.Series, numpy.ndarray]
            If str, name of the target in X, otherwise series of labels
            corresponding to X.
        name : str, optional
            Name to identify this data.
        dataset_type : ADSDataset, optional
            When this value is available, it would be used to evaluate the ads
            task type.
        kwargs:
            Additional keyword arguments that would be passed to the underlying
            Pandas read API.

        Returns
        -------
        ads_data: ads.common.data.ADSData
            A built ADSData object.

        Raises
        ------
        ValueError
            If either X or y is missing, or if y is a string but X is not a
            dataframe-like object supporting column access and ``drop``.

        Examples
        --------
        >>> data = open_csv("my.csv")
        >>> data_ads = ADSData(data, 'target').build(data, 'target')
        """
        if X is None or y is None:
            raise ValueError("Both X and y are required.")
        # Materialize dask collections so downstream consumers see in-memory data.
        if _is_dask_dataframe(X):
            X = X.compute()
        if _is_dask_series(y):
            y = y.compute()
        if dataset_type is None:
            dataset_type = type(X)
        if isinstance(y, str):
            # y names the target column inside X: split it out of X.
            try:
                return ADSData(
                    X.drop(y, axis=1), X[y], name=name, dataset_type=dataset_type
                )
            except AttributeError:
                raise ValueError(
                    "If y is a string, then X must be a pandas or dask dataframe"
                )
        else:
            return ADSData(X, y, name=name, dataset_type=dataset_type)

    def __repr__(self):
        # Summarize the data by name plus the shapes of X and y.
        return "%sShape of X:%s\nShape of y:%s" % (
            self.name + "\n",
            str(self.X.shape),
            str(self.y.shape),
        )

    def to_onnxrt(
        self, sess, idx_range=None, model=None, impute_values=None, **kwargs
    ):  # pragma: no cover
        r"""
        Returns itself formatted as an input for the onnxruntime session
        inputs passed in.

        Parameters
        ----------
        sess: Session
            The session object.
        idx_range: Range
            The range of inputs to convert to onnx.
        model: SupportedModel
            A model that supports being serialized for the onnx runtime.
        impute_values: dict, optional
            Values used by the automl data transformer to impute missing data.
            Defaults to an empty dict.
        kwargs:
            Additional keyword arguments.

            - sess_inputs - Pass in the output from
              onnxruntime.InferenceSession("model.onnx").get_inputs()
            - input_dtypes (list) - If sess_inputs cannot be passed in, pass in
              the numpy dtypes of each input
            - input_shapes (list) - If sess_inputs cannot be passed in, pass in
              the shape of each input
            - input_names (list) - If sess_inputs cannot be passed in, pass in
              the name of each input

        Returns
        -------
        ort: Array
            Array of inputs formatted for the given session.
        """
        # Avoid the mutable-default-argument pitfall: bind a fresh dict per call.
        if impute_values is None:
            impute_values = {}
        if model._underlying_model in ["torch"]:
            sess_inputs = sess.get_inputs()
            in_shape, in_name, in_type = [], [], []
            for i, ftr in enumerate(sess_inputs):
                in_type.append(ftr.type)
                in_shape.append(ftr.shape)
                in_name.append(ftr.name)
            # idx_range defaulting is loop-invariant; compute it once up front.
            idx_range = (0, len(self.X)) if idx_range is None else idx_range
            batch_size = idx_range[1] - idx_range[0]
            ret = {}
            for i, name in enumerate(in_name):
                # NOTE(review): assumes self.X is a torch tensor here
                # (detach/cpu/numpy chain) — confirm against callers.
                ret[name] = (
                    self.X[:batch_size]
                    .reshape([batch_size] + list(self.X[:1].shape))
                    .detach()
                    .cpu()
                    .numpy()
                    .astype(np.float32)
                )
            return ret
        elif model._underlying_model in ["automl"]:
            X_trans = model._onnx_data_transformer(
                X=self.X, impute_values=impute_values
            )
            inputs = {}
            # Feed each transformed column to its positionally matching onnx input.
            for idx, c in enumerate(X_trans.columns):
                inputs[sess.get_inputs()[idx].name] = (
                    X_trans[c]
                    .values.reshape((X_trans.shape[0], 1))
                    .astype(X_trans.dtypes[idx])
                )
            return inputs
        elif model._underlying_model in ["lightgbm", "xgboost", "sklearn"]:
            idx_range = (0, len(self.X)) if idx_range is None else idx_range
            inputs = []
            # Row-wise conversion of the selected slice into plain lists.
            for name, row in self.X[idx_range[0] : idx_range[1]].iterrows():
                inputs.append(list(row))
            return {"input": inputs}