Source code for ads.dataset.dataframe_transformer

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

from __future__ import print_function, absolute_import

import inspect

import pandas as pd
from sklearn.base import TransformerMixin


def expand_lambda_function(lambda_func):
    """
    Returns the source text of a lambda function.

    The text is recovered with ``inspect.getsource``, which yields the whole
    source line(s) the lambda appears on. The lambda expression itself is then
    cut out by scanning from the ``lambda`` keyword up to the first comma or
    closing bracket that does not belong to the expression.

    Parameters
    ----------
    lambda_func: callable
        The lambda whose source should be recovered.

    Returns
    -------
    str
        The text of the lambda expression. If the source cannot be retrieved
        the function's ``__name__`` (or its ``repr``) is returned instead, and
        if no ``lambda`` keyword is found the stripped source is returned.
    """
    try:
        source = inspect.getsource(lambda_func)
    except (OSError, TypeError):
        # Source is unavailable (e.g. code built with eval/exec or defined
        # interactively); degrade gracefully instead of failing the caller.
        return getattr(lambda_func, "__name__", repr(lambda_func))

    keyword = "lambda"

    def is_identifier_char(char):
        return char.isalnum() or char == "_"

    def keyword_at(index):
        # True only for a standalone ``lambda`` token, not e.g. ``my_lambda``.
        if not source.startswith(keyword, index):
            return False
        if index > 0 and is_identifier_char(source[index - 1]):
            return False
        following = source[index + len(keyword): index + len(keyword) + 1]
        return not is_identifier_char(following)

    start = next((i for i in range(len(source)) if keyword_at(i)), -1)
    if start == -1:
        return source.strip()

    depth = 0  # bracket nesting relative to the lambda keyword
    pending_colons = 0  # lambda keywords still waiting for their ':'
    quote = None  # quote character of the string literal being skipped
    end = len(source)
    pos = start
    while pos < len(source):
        char = source[pos]
        if quote is not None:
            if char == "\\":
                pos += 1  # skip the escaped character
            elif char == quote:
                quote = None
        elif char in "\"'":
            quote = char
        elif char == "#":
            # the rest of the line is a comment
            end = pos
            break
        elif char in "([{":
            depth += 1
        elif char in ")]}":
            if depth == 0:
                # closes a bracket opened before the lambda started
                end = pos
                break
            depth -= 1
        elif depth == 0:
            if keyword_at(pos):
                pending_colons += 1
                pos += len(keyword)
                continue
            if char == ":" and pending_colons:
                pending_colons -= 1
            elif char == "," and not pending_colons:
                # a comma after the parameter list separates the lambda from
                # the next argument of the enclosing call
                end = pos
                break
        pos += 1

    return source[start:end].strip()
class DataFrameTransformer(TransformerMixin):
    """
    A DataFrameTransformer object.

    Records the name of a dataframe method together with the positional and
    keyword arguments it should be called with, so that the same call can be
    replayed on another dataframe through ``transform``.

    Parameters
    ----------
    func_name: str
        Name of the dataframe method to invoke.
    target_name: str or None
        Name of a target column. When it is not ``None`` and missing from the
        dataframe being transformed, a placeholder column is added before the
        call and dropped afterwards.
    target_sample_val:
        Value used to fill the placeholder target column.
    args: sequence, optional
        Positional arguments for the method. ``None`` means no arguments.
    kw_args: dict, optional
        Keyword arguments for the method. ``None`` means no arguments.
    """

    def __init__(
        self, func_name, target_name, target_sample_val, args=None, kw_args=None
    ):
        self.function_name_ = func_name
        self.target_name = target_name
        self.target_sample_val = target_sample_val
        self.function_args_ = args
        self.function_kwargs_ = kw_args

    def _positional_args(self):
        """Returns the stored positional arguments, or an empty tuple for None."""
        return () if self.function_args_ is None else self.function_args_

    def _keyword_args(self):
        """Returns the stored keyword arguments, or an empty dict for None."""
        return {} if self.function_kwargs_ is None else self.function_kwargs_

    @staticmethod
    def _describe_callable(func):
        """Returns a one-line description of a callable argument for __repr__."""
        name = getattr(func, "__name__", None)
        if name == "<lambda>":
            return expand_lambda_function(func)
        code = getattr(func, "__code__", None)
        if name is None or code is None:
            # e.g. functools.partial objects or builtins, which lack
            # ``__name__`` and/or ``__code__``
            return repr(func)
        return "{} {}".format(name, code.co_varnames)

    def __repr__(self):
        described = [
            self._describe_callable(arg)
            for arg in self._positional_args()
            if callable(arg)
        ]
        return "\n{}({})\n\n{}".format(
            self.function_name_, self.target_name, "\n".join(described)
        )

    def fit(self, df, y=None):
        """
        Takes in a DF and returns a fitted model

        Nothing is learned from the data; ``y`` is accepted and ignored so the
        method can be called as ``fit(X, y)``.
        """
        return self

    def transform(self, df):
        """
        Takes in a DF and returns a transformed DF
        """
        return self._transform(df)[0]

    def _transform(self, df):
        """
        Transform the dataframe using the function provided
        If a given function is not supported by pandas dataframe, the transform would be a no-op

        Parameters
        ----------
        df: Union[pandas.DataFrame, dask.dataframe.core.DataFrame]

        Returns
        -------
        tuple(transformed_df, is_transformed)
            transformed_df is same type as the input
            is_transformed, bool
                True if the transformation function could be applied
        """
        function_args = self._positional_args()
        function_kwargs = self._keyword_args()

        # add a dummy target column
        drop_target = False
        if self.target_name is not None and self.target_name not in df.columns:
            drop_target = True
            df = df.assign(**{self.target_name: self.target_sample_val})

        # check if pandas dataframe has this function
        if hasattr(df, self.function_name_):
            function = getattr(df, self.function_name_)

        # if pandas dataframe does not have this function, it is possible that the function being accessed is
        # similar to dask.dataframe.core.DataFrame.map_partitions that takes python function as the first
        # argument and applies the function on each partition. The same effect can be achieved using pipe
        elif (
            isinstance(df, pd.DataFrame)
            and len(function_args) > 0
            and callable(function_args[0])
        ):
            function = df.pipe
        else:
            # this method cannot be applied on pandas dataframe
            return df, False

        result = function(*function_args, **function_kwargs)
        if drop_target:
            result = result.drop(self.target_name, axis=1)
        return result, True