前言
自定义了一个简单的类别型(字符)变量转换器(Transformer),代码如下。
import inspect

import numpy as np
import pandas as pd
from numpy import array  # was `import np.array as array`: `np` is an alias, not an importable module
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    """Encode categorical columns of a DataFrame for use in a pipeline.

    Two strategies are supported, selected by ``strategy``:

    * ``'logit'`` -- one-hot encoding. Each selected column is converted to
      strings, label-encoded, then one-hot encoded. Values not seen during
      fitting become an all-zero row.
    * anything else -- smoothed target-mean encoding. Each category is
      replaced by a blend of its own target mean and the global target mean.

    Parameters
    ----------
    feature_names : list or None
        Columns to encode. When empty/None, ``fit`` uses every column of X.
    strategy : str or None
        ``'logit'`` for one-hot encoding; any other value selects target-mean
        encoding.
    label_encoder_dict : dict or None
        Per feature, a mapping ``{category string: integer code}``.
    onehot_encoder_dict : dict or None
        Per feature, the fitted ``OneHotEncoder`` working on integer codes.
    target_encoder_dict : dict or None
        Per feature, a Series mapping category -> smoothed target mean.
    """

    # Integer code assigned to categories that were not seen during fitting.
    # The one-hot encoders ignore unknown codes, which yields an all-zero row.
    _UNKNOWN_CODE = 999999999

    def __init__(self, feature_names=None, strategy=None, label_encoder_dict=None,
                 onehot_encoder_dict=None, target_encoder_dict=None):
        # Plain assignments instead of frame introspection: the result is the
        # same, without depending on how the interpreter exposes frame locals.
        self.feature_names = feature_names
        self.strategy = strategy
        self.label_encoder_dict = label_encoder_dict
        self.onehot_encoder_dict = onehot_encoder_dict
        self.target_encoder_dict = target_encoder_dict

    def _selected_features(self, X):
        """Return the configured feature names, or all columns of X if unset."""
        if self.feature_names:
            return list(self.feature_names)
        return list(X)

    @staticmethod
    def _make_onehot_encoder():
        """Build a dense-output OneHotEncoder that ignores unknown categories."""
        try:
            # Newer scikit-learn releases name the dense-output switch
            # `sparse_output`.
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older releases only know the `sparse` keyword.
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    @staticmethod
    def _to_frame(blocks, columns, index):
        """Concatenate per-feature arrays column-wise into a DataFrame."""
        if not blocks:
            # np.concatenate rejects an empty sequence; return an empty frame.
            return pd.DataFrame(index=index)
        return pd.DataFrame(np.concatenate(blocks, axis=1),
                            columns=columns, index=index)

    def one_hot_enc(self, X, y=None):
        """Fit the label and one-hot encoders and return the encoded frame.

        Resets ``label_encoder_dict`` and ``onehot_encoder_dict`` and stores
        the generated column names in ``new_features``.

        Parameters
        ----------
        X : DataFrame
            Input data containing the selected feature columns.
        y : ignored

        Returns
        -------
        DataFrame
            One column per (feature, category) pair, named
            ``<feature>_<category>``, indexed like X.
        """
        self.label_encoder_dict = dict()
        self.onehot_encoder_dict = dict()
        self.new_features = []
        blocks = []
        for f in self._selected_features(X):
            label_enc = LabelEncoder()
            # Categories are compared as strings so mixed-type columns work.
            codes = label_enc.fit_transform(np.array([str(v) for v in X[f]]))
            onehot_enc = self._make_onehot_encoder()
            blocks.append(np.asarray(onehot_enc.fit_transform(codes.reshape(-1, 1))))
            self.onehot_encoder_dict[f] = onehot_enc
            # classes_ is sorted, so position i is the code of class i.
            self.label_encoder_dict[f] = {
                str(cls): code for code, cls in enumerate(label_enc.classes_)
            }
            self.new_features += [f"{f}_{cls}" for cls in label_enc.classes_]
        return self._to_frame(blocks, self.new_features, getattr(X, "index", None))

    def target_mean_enc(self, X, y=None, m=0):
        """Fit smoothed target-mean encodings and return the encoded frame.

        For each selected feature, every category gets the value
        ``(count * category_mean + m * global_mean) / (count + m)``. The
        mappings are stored in ``target_encoder_dict``.

        Parameters
        ----------
        X : DataFrame
            Input data containing the selected feature columns.
        y : Series or array-like
            Target values, one per row of X.
        m : number, default 0
            Smoothing weight given to the global target mean.

        Returns
        -------
        DataFrame
            A copy of X with the selected columns replaced by their encodings.

        Raises
        ------
        ValueError
            If ``y`` is None.
        """
        if y is None:
            raise ValueError("target mean encoding requires a target `y`")
        if isinstance(y, pd.Series):
            target = y
        else:
            # Positional targets are aligned with the rows of X.
            target = pd.Series(np.asarray(y), index=X.index)
        prior = target.mean()

        self.target_encoder_dict = dict()
        encoded = X.copy()
        for f in self._selected_features(X):
            stats = target.groupby(X[f]).agg(['count', 'mean'])
            counts = stats['count']
            smooth = (counts * stats['mean'] + m * prior) / (counts + m)
            self.target_encoder_dict[f] = smooth
            encoded[f] = X[f].map(smooth)
        return encoded

    def fit(self, X, y=None):
        """Fit the encoders for the configured strategy.

        Parameters
        ----------
        X : DataFrame
            Training data.
        y : Series or array-like, optional
            Target values; required unless ``strategy == 'logit'``.

        Returns
        -------
        self
        """
        if not self.feature_names:
            self.feature_names = list(X)
        if self.strategy == 'logit':
            self.one_hot_enc(X, y)
        else:
            self.target_mean_enc(X, y)
        return self

    def transform(self, X, y=None):
        """Encode X with the previously fitted (or supplied) encoders.

        Parameters
        ----------
        X : DataFrame
            Data to encode; it is not modified.
        y : ignored

        Returns
        -------
        DataFrame
            For ``'logit'``: the one-hot columns only. Otherwise: a copy of X
            with the selected columns target-encoded; categories without a
            fitted value are filled with the mean of the encoded column.

        Raises
        ------
        ValueError
            If the encoders needed by the strategy are missing.
        """
        features = self._selected_features(X)

        if self.strategy == 'logit':
            if not self.label_encoder_dict or not self.onehot_encoder_dict:
                raise ValueError("one-hot encoders are missing; call fit first")
            blocks = []
            columns = []
            for f in features:
                mapping = self.label_encoder_dict.get(f)
                if not mapping:
                    # No encoder for this feature: it contributes no columns.
                    continue
                # Look categories up by their string form, matching how they
                # were stored during fitting.
                codes = np.array(
                    [mapping.get(str(v), self._UNKNOWN_CODE) for v in X[f]],
                    dtype=np.int64,
                ).reshape(-1, 1)
                blocks.append(np.asarray(self.onehot_encoder_dict[f].transform(codes)))
                # Column names follow code order, i.e. the encoder's output order.
                columns.extend(
                    f"{f}_{cls}"
                    for cls, _ in sorted(mapping.items(), key=lambda kv: kv[1])
                )
            return self._to_frame(blocks, columns, getattr(X, "index", None))

        if not self.target_encoder_dict:
            raise ValueError("target encoders are missing; call fit first")
        encoded = X.copy()
        for f in features:
            mapped = encoded[f].map(self.target_encoder_dict[f])
            # Assign the filled column back rather than replacing in place on
            # a column selection, which does not reliably update the frame.
            encoded[f] = mapped.fillna(mapped.mean())
        return encoded