Source code for ads.dataset.label_encoder

#!/usr/bin/env python
# -*- coding: utf-8; -*-

# Copyright (c) 2020, 2022 Oracle and/or its affiliates.
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/

import bisect
from collections import defaultdict

import numpy as np
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder


class DataFrameLabelEncoder(TransformerMixin):
    """
    Label encoder for pandas.DataFrame.

    Each column whose dtype is ``object`` or ``category`` at fit time gets its
    own ``sklearn.preprocessing.LabelEncoder``. Values are compared by their
    ``str()`` representation, and every encoder carries an extra ``"unknown"``
    class so that labels first seen at transform time can still be encoded.

    NOTE(review): dask DataFrames have been advertised as supported input;
    that is not verified by this implementation - confirm before relying on it.
    """

    # Sentinel class that stands in for any label not seen during ``fit``.
    _UNKNOWN_LABEL = "unknown"

    def __init__(self):
        # Maps column name -> LabelEncoder fitted on that column.
        self.label_encoders = defaultdict(LabelEncoder)

    @staticmethod
    def _as_labels(series):
        """Return the values of ``series`` as a list of ``str`` labels.

        Used by both ``fit`` and ``transform`` so that a value (including a
        missing one) is stringified the same way in both phases.
        """
        return [str(value) for value in series]

    def fit(self, X):
        """
        Fits a DataFrameLabelEncoder.

        Parameters
        ----------
        X
            DataFrame to learn the labels from. It is only read; the caller's
            columns are not converted or otherwise modified.

        Returns
        -------
        DataFrameLabelEncoder
            ``self``, so calls can be chained.
        """
        for column in X.columns:
            if X[column].dtype.name in ("object", "category"):
                labels = self._as_labels(X[column])
                # Fitting with the sentinel included lets LabelEncoder build
                # the sorted, duplicate-free ``classes_`` array itself, so it
                # stays a numpy array and "unknown" never appears twice.
                labels.append(self._UNKNOWN_LABEL)
                self.label_encoders[column] = LabelEncoder().fit(
                    np.array(labels, dtype=object)
                )
        return self

    def transform(self, X):
        """
        Transforms a dataset using the DataFrameLabelEncoder.

        Parameters
        ----------
        X
            DataFrame containing every column that was encoded during ``fit``.

        Returns
        -------
        The same ``X`` object. The fitted columns are replaced in place by
        their integer codes; labels that were not seen during ``fit`` receive
        the code of the ``"unknown"`` class. When no column was fitted, ``X``
        is returned untouched.
        """
        categorical_columns = list(self.label_encoders.keys())
        if not categorical_columns:
            return X

        def _label_encode_with_unknown(series):
            encoder = self.label_encoders[series.name]
            # Build the lookup once per column instead of scanning the class
            # list for every single value.
            known = set(encoder.classes_)
            labels = [
                label if label in known else self._UNKNOWN_LABEL
                for label in self._as_labels(series)
            ]
            return encoder.transform(np.array(labels, dtype=object))

        X[categorical_columns] = X[categorical_columns].apply(
            _label_encode_with_unknown
        )
        return X