#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2023 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import bisect
from collections import defaultdict

import numpy as np
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder


class DataFrameLabelEncoder(TransformerMixin):
    """
    Label encoder for `pandas.DataFrame` and `dask.dataframe.core.DataFrame`.

    Only columns whose dtype name is ``object``, ``category`` or ``bool`` are
    encoded. Every encoder additionally knows the label ``"unknown"``, which
    is used at transform time for values that were not seen during fitting.

    Attributes
    ----------
    label_encoders : defaultdict
        Holds the label encoder for each column.

    Examples
    --------
    >>> import pandas as pd
    >>> from ads.dataset.label_encoder import DataFrameLabelEncoder

    >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [3, 4]})
    >>> le = DataFrameLabelEncoder()
    >>> le.fit_transform(X=df)
    """

    # dtype names of the columns that get a label encoder.
    _CATEGORICAL_DTYPES = ("object", "category", "bool")

    # Label substituted for values that were not present when fitting.
    _UNKNOWN_LABEL = "unknown"

    def __init__(self):
        """Initialize an instance of DataFrameLabelEncoder."""
        self.label_encoders = defaultdict(LabelEncoder)

    def fit(self, X: "pandas.DataFrame") -> "DataFrameLabelEncoder":
        """
        Fits a DataFrameLabelEncoder.

        One `LabelEncoder` is fitted on the string representation of every
        column whose dtype name is ``object``, ``category`` or ``bool``.
        The input frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Target values.

        Returns
        -------
        self : returns an instance of self.
            Fitted label encoder.
        """
        for column in X.columns:
            if X[column].dtype.name not in self._CATEGORICAL_DTYPES:
                continue

            # Work on a stringified copy so the caller's frame keeps its dtypes.
            values = X[column].astype(str)

            encoder = LabelEncoder()
            encoder.fit(values)

            classes = [str(class_) for class_ in encoder.classes_.tolist()]
            # Keep the class list sorted and free of duplicates: a column that
            # already contains the literal "unknown" must not get a second,
            # never-used entry.
            if self._UNKNOWN_LABEL not in classes:
                bisect.insort_left(classes, self._UNKNOWN_LABEL)
            encoder.classes_ = np.asarray(classes)

            self.label_encoders[column] = encoder
        return self

    def transform(self, X: "pandas.DataFrame") -> "pandas.DataFrame":
        """
        Transforms a dataset using the DataFrameLabelEncoder.

        Values are converted to strings before encoding; any value that was
        not seen while fitting is encoded as the ``"unknown"`` label.

        Note that the fitted columns of ``X`` are overwritten in place and the
        same object is returned.

        Parameters
        ----------
        X : pandas.DataFrame
            Target values.

        Returns
        -------
        pandas.DataFrame
            Labels as normalized encodings.
        """
        categorical_columns = list(self.label_encoders)
        if not categorical_columns:
            return X

        def _label_encode_with_unknown(series):
            encoder = self.label_encoders[series.name]
            # Build the lookup once per column instead of scanning the
            # classes array for every single cell.
            known = set(encoder.classes_.tolist())
            unknown = self._UNKNOWN_LABEL
            as_text = series.map(lambda value: str(value))
            return encoder.transform(
                as_text.map(lambda value: value if value in known else unknown)
            )

        X[categorical_columns] = X[categorical_columns].apply(
            _label_encode_with_unknown
        )
        return X