# Source code for ads.model.transformer.onnx_transformer

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Copyright (c) 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import json
import logging
from typing import Dict, Union

import numpy as np
import pandas as pd

# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole
# process. NOTE(review): presumably this is here because the imputation helper
# in ONNXTransformer assigns into DataFrame columns through `.loc` / `.iloc`;
# confirm before removing, since this is a global side effect of importing
# this module.
pd.options.mode.chained_assignment = None


# Note to developers: If you make any changes to this class, copy and paste those changes over to
# templates/score_onnx.jinja2 and templates/score_onnx_new.jinja2. We do not yet have an automatic way of doing this.
class ONNXTransformer(object):
    """Convert input data into ONNX-readable dtypes and impute missing values.

    The transformer accepts ``pandas.DataFrame``, ``pandas.Series``,
    ``numpy.ndarray`` and ``list`` inputs. Numerical data is cast to
    ``float32`` and missing values (NaN/None and blank strings) are replaced
    by user supplied values. Its state can be written to and restored from a
    JSON file, so a fitted transformer can be reloaded at another time.

    Attributes
    ----------
    impute_values : dict
        Maps a column label (or an integer column position) to the value
        used to fill missing entries of that column.
    dtypes : pandas.Series, numpy.dtype, str, dict or None
        Dtypes recorded by ``fit``. A Series when fitted on a DataFrame, a
        ``numpy.dtype`` when fitted on an ndarray, a dtype name (``str``) or
        an empty ``dict`` after ``load``, and ``None`` otherwise.

    Examples
    --------
    >>> from ads.model.transformer.onnx_transformer import ONNXTransformer
    >>> onnx_data_transformer = ONNXTransformer()
    >>> train_transformed = onnx_data_transformer.fit_transform(
    ...     train.X,
    ...     {"column_name1": "impute_value1", "column_name2": "impute_value2"},
    ... )
    >>> test_transformed = onnx_data_transformer.transform(test.X)
    """

    # Matches empty or whitespace-only strings, which are treated as missing.
    _BLANK_PATTERN = r"^\s*$"

    def __init__(self):
        self.impute_values = {}
        self.dtypes = None
        self._fitted = False

    def _has_dtypes(self) -> bool:
        """Return True when ``self.dtypes`` holds something to cast with.

        ``len()`` cannot be used uniformly here: for a plain ``numpy.dtype``
        it reports the number of structured fields, i.e. 0 for ``float32``,
        which would wrongly look like "no dtypes recorded".
        """
        dtypes = self.dtypes
        if dtypes is None:
            return False
        if isinstance(dtypes, (pd.Series, dict, str)):
            return len(dtypes) > 0
        return True

    @staticmethod
    def _is_castable_dtype(dtype) -> bool:
        """Return True if the dtype's name marks it as int, float or bool."""
        dtype_name = str(dtype)
        return any(token in dtype_name for token in ("int", "float", "bool"))

    @staticmethod
    def _handle_dtypes(X: Union[pd.DataFrame, pd.Series, np.ndarray, list]):
        """Cast the numerical parts of ``X`` to float32.

        Parameters
        ----------
        X : Union[pd.DataFrame, pd.Series, np.ndarray, list]
            The training data.

        Returns
        -------
        Union[pd.DataFrame, pd.Series, np.ndarray, list]
            ``X`` with numerical values cast to float32. Lists (and any other
            unsupported type) are returned unchanged.
        """
        # Casting column by column in a loop is expensive on wide datasets,
        # so the numerical columns are cast in a single ``astype`` call.
        if isinstance(X, pd.Series):
            # Select the single column by position rather than by label, so
            # that falsy names (e.g. "") do not resolve to the wrong label.
            return ONNXTransformer._handle_dtypes_dataframe(X.to_frame()).iloc[:, 0]
        if isinstance(X, pd.DataFrame):
            return ONNXTransformer._handle_dtypes_dataframe(X)
        if isinstance(X, np.ndarray):
            # A DataFrame with mixed types converted directly to an ndarray
            # becomes an object array even if some columns are numerical.
            # Identifying those columns would need extra work, so for now an
            # ndarray is only cast when its dtype as a whole is numerical.
            return ONNXTransformer._handle_dtypes_np_array(X)
        return X

    @staticmethod
    def _handle_dtypes_dataframe(X: pd.DataFrame):
        """Cast the int, float and bool columns of a DataFrame to float32.

        Parameters
        ----------
        X : pandas.DataFrame
            The training data.

        Returns
        -------
        pandas.DataFrame
            A new DataFrame with the numerical columns cast to float32.
        """
        dict_astype = {
            column: "float32"
            for column, dtype in zip(X.columns, X.dtypes)
            if ONNXTransformer._is_castable_dtype(dtype)
        }
        _X = X.astype(dict_astype)
        if dict_astype:
            logging.warning("Numerical values in `X` are cast to float32.")
        return _X

    @staticmethod
    def _handle_dtypes_np_array(X: np.ndarray):
        """Cast an ndarray to float32 when its dtype is int, float or bool.

        Parameters
        ----------
        X : np.ndarray
            The training data.

        Returns
        -------
        np.ndarray
            A float32 copy of ``X`` if its dtype is numerical, otherwise
            ``X`` itself.
        """
        if not ONNXTransformer._is_castable_dtype(X.dtype):
            return X
        _X = X.astype("float32")
        logging.warning("Numerical values in `X` are cast to float32.")
        return _X

    def fit(
        self,
        X: Union[pd.DataFrame, pd.Series, np.ndarray, list],
        impute_values: Dict = None,
    ):
        """Fit the transformer on the dataset.

        Parameters
        ----------
        X : Union[pandas.DataFrame, pandas.Series, np.ndarray, list]
            The training data.
        impute_values : Dict, optional
            Maps a column label (or integer column position) to the value
            used to fill missing entries. ``None`` means no imputation.

        Returns
        -------
        ONNXTransformer
            The fitted transformer (``self``).
        """
        _X = ONNXTransformer._handle_dtypes(X)
        # Start from a clean slate so that re-fitting on a Series or list
        # does not keep the dtypes of a previous fit.
        self.dtypes = None
        if isinstance(_X, pd.DataFrame):
            self.dtypes = _X.dtypes
        elif isinstance(_X, np.ndarray):
            self.dtypes = _X.dtype
        self.impute_values = impute_values if impute_values else {}
        self._fitted = True
        return self

    def transform(self, X: Union[pd.DataFrame, pd.Series, np.ndarray, list]):
        """Cast ``X`` to the fitted dtypes and impute its missing values.

        Parameters
        ----------
        X : Union[pandas.DataFrame, pandas.Series, np.ndarray, list]
            The data to transform.

        Returns
        -------
        Union[pandas.DataFrame, pandas.Series, np.ndarray, list]
            The transformed data.

        Raises
        ------
        AssertionError
            If the transformer has not been fitted.
        """
        # Raised explicitly (instead of ``assert``) so the check survives
        # ``python -O``; the exception type is unchanged for callers.
        if not self._fitted:
            raise AssertionError("Call fit_transform first!")

        if self._has_dtypes():
            if isinstance(X, list):
                _X = np.array(X).astype(self.dtypes).tolist()
            else:
                _X = X.astype(self.dtypes)
        else:
            _X = X
        return ONNXTransformer._handle_missing_value(
            _X, impute_values=self.impute_values
        )

    @staticmethod
    def _handle_missing_value(
        X: Union[pd.DataFrame, pd.Series, np.ndarray, list], impute_values: Dict
    ):
        """Impute missing values in ``X`` according to ``impute_values``.

        Parameters
        ----------
        X : Union[pandas.DataFrame, pandas.Series, np.ndarray, list]
            The data to impute.
        impute_values : Dict
            Maps a column label (or integer column position) to its fill
            value. For a Series it must hold at most one entry.

        Raises
        ------
        ValueError
            If ``X`` is a Series but several impute values are provided.
        NotImplementedError
            If the type of ``X`` is not supported.

        Returns
        -------
        Union[pandas.DataFrame, pandas.Series, np.ndarray, list]
            The imputed data, in the same container type as ``X``.
        """
        impute_values = impute_values or {}

        # ndarrays and lists take a round trip through a DataFrame, so the
        # result is always two dimensional (1-D input becomes one column).
        if isinstance(X, np.ndarray):
            return ONNXTransformer._handle_missing_value_dataframe(
                pd.DataFrame(X), impute_values=impute_values
            ).values
        if isinstance(X, list):
            return ONNXTransformer._handle_missing_value_dataframe(
                pd.DataFrame(X), impute_values=impute_values
            ).values.tolist()
        if isinstance(X, pd.DataFrame):
            return ONNXTransformer._handle_missing_value_dataframe(
                X, impute_values=impute_values
            )
        if isinstance(X, pd.Series):
            # Nothing to impute: mirror the DataFrame path, which leaves the
            # data untouched when no impute values are given.
            if not impute_values:
                return X
            if len(impute_values) != 1:
                raise ValueError(
                    "Multiple imputed values are provided, but `X` has only one dim."
                )
            (fill_value,) = impute_values.values()
            return X.replace(
                ONNXTransformer._BLANK_PATTERN, np.nan, regex=True
            ).fillna(fill_value)
        raise NotImplementedError(
            f"{type(X)} is not supported. Convert `X` to pandas dataframe or numpy array."
        )

    @staticmethod
    def _handle_missing_value_dataframe(X: pd.DataFrame, impute_values: Dict):
        """Fill blank strings and missing values of the configured columns.

        Parameters
        ----------
        X : pandas.DataFrame
            The data to impute. It is not modified.
        impute_values : Dict
            Maps a column to its fill value. Integer keys address a column
            by position, any other key addresses it by label.

        Returns
        -------
        pandas.DataFrame
            ``X`` itself when there is nothing to impute, otherwise an
            imputed copy.
        """
        if not impute_values:
            return X

        # Work on a copy so the caller's DataFrame (or the ndarray buffer it
        # may have been built from) is never modified.
        X = X.copy()
        for idx, val in impute_values.items():
            if isinstance(idx, int):
                X.iloc[:, idx] = (
                    X.iloc[:, idx]
                    .replace(ONNXTransformer._BLANK_PATTERN, np.nan, regex=True)
                    .fillna(val)
                )
            else:
                X.loc[:, idx] = (
                    X.loc[:, idx]
                    .replace(ONNXTransformer._BLANK_PATTERN, np.nan, regex=True)
                    .fillna(val)
                )
        return X

    def fit_transform(
        self, X: Union[pd.DataFrame, pd.Series], impute_values: Dict = None
    ):
        """Fit the transformer on ``X``, then transform ``X``.

        Parameters
        ----------
        X : Union[pandas.DataFrame, pandas.Series]
            The training data.
        impute_values : Dict, optional
            Maps a column label (or integer column position) to the value
            used to fill missing entries.

        Returns
        -------
        Union[pandas.DataFrame, pandas.Series]
            The transformed data.
        """
        return self.fit(X, impute_values).transform(X)

    def save(self, filename, **kwargs):
        """Save the transformer state to a JSON file.

        Parameters
        ----------
        filename : str
            Location of the file the state is written to.

        Returns
        -------
        str
            The filename the state was saved to.
        """
        if self._has_dtypes():
            if isinstance(self.dtypes, pd.Series):
                dtypes_value = {
                    "index": list(self.dtypes.index),
                    "values": [str(val) for val in self.dtypes.values],
                }
            else:
                dtypes_value = str(self.dtypes)
            dtypes_entry = {"value": dtypes_value, "dtype": str(type(self.dtypes))}
        else:
            # Covers both ``None`` and the empty ``dict`` produced by ``load``,
            # so a loaded transformer can be saved again without its empty
            # dtypes being stringified to "{}".
            dtypes_entry = {}

        export_dict = {
            "impute_values": {
                "value": self.impute_values,
                "dtype": str(type(self.impute_values)),
            },
            "dtypes": dtypes_entry,
            "_fitted": {"value": self._fitted, "dtype": str(type(self._fitted))},
        }
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(export_dict, f, sort_keys=True, indent=4, separators=(",", ": "))
        return filename

    @staticmethod
    def load(filename, **kwargs):
        """Load a transformer from a JSON file written by ``save``.

        Parameters
        ----------
        filename : str
            Location of the file the state is read from.

        Returns
        -------
        ONNXTransformer
            The restored transformer.

        Notes
        -----
        JSON object keys are always strings, so integer keys of
        ``impute_values`` come back as strings after a save/load round trip.
        """
        with open(filename, "r", encoding="utf-8") as f:
            export_dict = json.load(f)

        onnx_transformer = ONNXTransformer()
        for key, entry in export_dict.items():
            # These two need dedicated decoding and are handled below.
            if key in ("impute_values", "dtypes"):
                continue
            try:
                setattr(onnx_transformer, key, entry["value"])
            except Exception:
                logging.warning(
                    "Failed to reload %s from %s to OnnxTransformer.", key, filename
                )
                raise

        dtypes_entry = export_dict["dtypes"]
        if "value" in dtypes_entry:
            dtypes_value = dtypes_entry["value"]
            if isinstance(dtypes_value, dict) and "index" in dtypes_value:
                # Per-column dtypes of a DataFrame.
                onnx_transformer.dtypes = pd.Series(
                    data=[np.dtype(val) for val in dtypes_value["values"]],
                    index=dtypes_value["index"],
                )
            else:
                # Single dtype name of an ndarray.
                onnx_transformer.dtypes = dtypes_value
        else:
            onnx_transformer.dtypes = {}
        onnx_transformer.impute_values = export_dict["impute_values"]["value"]
        return onnx_transformer