1、常用特征转换
常用特征转换有sklearn_pandas.DataFrameMapper、sklearn.compose.ColumnTransformer、sklearn.preprocessing.FunctionTransformer等,ColumnTransformer用法如下:
ct = ColumnTransformer(
    transformers=[
        # Assign an integer code to each category (similar to label encoding).
        ('ordinal', OrdinalEncoder(), ['gender']),
        # One-hot encode the column; sparse=False asks for a dense result.
        # NOTE(review): newer scikit-learn releases renamed this argument to
        # `sparse_output` -- confirm against the installed version.
        ('onehot', OneHotEncoder(sparse=False), ['major']),
        # Discretize the column into 3 bins.
        ('discretizer', KBinsDiscretizer(n_bins=3), ['age']),
        # Standardize the column.
        ('scale', StandardScaler(), ['age']),
    ]
)
ct.fit_transform(data)
2、能转换成pmml文件的特征转换
但是 FunctionTransformer 在转换为 PMML 时总是报 Java 错误,也就是说不支持自定义函数转换。因此目前的函数转换主要使用 sklearn2pmml.preprocessing 中提供的特征预处理类,
其中有两个比较好用:
2.1 Aggregator 支持如下函数
class Aggregator(BaseEstimator, TransformerMixin):
    """Aggregate continuous data with one named function.

    ``function`` must be one of ``"min"``, ``"max"``, ``"sum"``, ``"prod"``,
    ``"product"``, ``"mean"`` or ``"avg"``; any other value raises
    ``ValueError``.
    """

    def __init__(self, function):
        supported = ["min", "max", "sum", "prod", "product", "mean", "avg"]
        # Reject unknown names at construction time.
        if function not in supported:
            raise ValueError("Function {0} not in {1}".format(function, supported))
        self.function = function
2.2 ExpressionTransformer 自定义python函数表达式
class ExpressionTransformer(BaseEstimator, TransformerMixin):
    """Transform data by evaluating a Python expression.

    ``expr`` is the source text of the expression; inside it, the value being
    processed is referred to as ``X``. ``dtype``, when not ``None``, is the
    type the evaluated result is cast to.
    """

    def __init__(self, expr, dtype=None):
        self.dtype = dtype
        self.expr = expr

    def _eval_row(self, X):
        # The parameter has to stay named ``X``: eval() resolves names in this
        # frame and the expression refers to its input by that name.
        # NOTE: eval() runs arbitrary code, so ``expr`` must come from a
        # trusted source.
        return eval(self.expr)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self``."""
        return self

    def transform(self, X):
        """Evaluate the expression over ``X`` and return the result via ``_col2d``."""
        # eval_rows invokes the callback on the pieces of X (presumably one
        # row at a time -- see its definition).
        evaluated = eval_rows(X, self._eval_row)
        if self.dtype is not None:
            evaluated = cast(evaluated, self.dtype)
        return _col2d(evaluated)
3、具体转换
在系列(1)里面已经介绍过 ExpressionTransformer 的用法,它主要支持一些简单的 Python 表达式;但再复杂一些的特征衍生(例如求标准差、变异系数等)就不直接支持了。为此只能曲线实现:用加减乘除等基本运算把公式展开,拼接成一个表达式字符串。下面的函数用于生成计算变异系数(样本标准差/均值,均值<=0 时取 0)的表达式,再把它传给 ExpressionTransformer:
def _get_std_express(num):
    """Build the expression text for the coefficient of variation of ``num`` inputs.

    The returned string is meant to be passed to ``ExpressionTransformer``. It
    refers to the inputs as ``X[0]`` .. ``X[num-1]`` and, because only basic
    arithmetic is available there, spells the whole formula out:

        0 if mean <= 0 else sqrt(sum((X[i] - mean)**2) / (num - 1)) / mean

    i.e. the sample standard deviation (``num - 1`` in the denominator)
    divided by the mean, with 0 substituted when the mean is not positive.

    Args:
        num: Number of input columns; must be at least 2.

    Returns:
        The expression as a string.

    Raises:
        ValueError: If ``num`` is smaller than 2. The sample standard
            deviation divides by ``num - 1``, so a smaller value would yield
            an expression that divides by zero (or, for 0, is not valid
            Python at all).
    """
    if num < 2:
        raise ValueError(
            "num must be at least 2 to compute a sample standard deviation, "
            "got {0}".format(num)
        )
    total = '+'.join(f'X[{i}]' for i in range(num))
    mean = f'({total})/{num}'
    # The mean is repeated verbatim inside every squared deviation because the
    # expression cannot hold intermediate variables.
    squared_deviations = '+'.join(
        f'numpy.square(X[{i}]-{mean})' for i in range(num)
    )
    std = f'numpy.sqrt(({squared_deviations})/{num - 1})'
    return f'0 if {mean}<=0 else {std}/({mean})'
_get_std_express(5)输出如下:
'0 if (X[0]+X[1]+X[2]+X[3]+X[4])/5<=0 else numpy.sqrt((numpy.square(X[0]-(X[0]+X[1]+X[2]+X[3]+X[4])/5)+numpy.square(X[1]-(X[0]+X[1]+X[2]+X[3]+X[4])/5)+numpy.square(X[2]-(X[0]+X[1]+X[2]+X[3]+X[4])/5)+numpy.square(X[3]-(X[0]+X[1]+X[2]+X[3]+X[4])/5)+numpy.square(X[4]-(X[0]+X[1]+X[2]+X[3]+X[4])/5))/4)/((X[0]+X[1]+X[2]+X[3]+X[4])/5)'
至此,基本上所有模型所需的自定义特征转换,都可以在 PMML 文件内部实现了。
# 保存模型
import joblib
from sklearn.impute import SimpleImputer
from sklearn2pmml.decoration import Alias
from sklearn2pmml import PMMLPipeline,sklearn2pmml
from sklearn_pandas import DataFrameMapper
from sklearn2pmml.preprocessing import ExpressionTransformer,Aggregator
# Ratio X[0]/X[1]: when X[1] is null the plain division is evaluated,
# when X[1] <= 0 the result is 0, otherwise it is X[0]/X[1].
express1 = ExpressionTransformer(
    expr='X[0]/X[1] if pandas.isnull(X[1]) else(0 if X[1]<=0 else X[0]/X[1])'
)
# Constant imputers that replace missing values with 0. `impute` is used for
# the two-column feature groups in dump(), `impute5` for the five-column one.
impute = SimpleImputer(fill_value=0, strategy='constant')
impute5 = SimpleImputer(fill_value=0, strategy='constant')
def dump(clf, feature, fp):
    """Fit ``clf`` behind a feature-deriving mapper and export the pipeline as PMML.

    A ``DataFrameMapper`` derives five features (two plain ratios, two ratios
    mapped onto fixed constants by threshold, and one coefficient of variation
    over five columns). The mapper and ``clf`` are combined in a
    ``PMMLPipeline``, fitted on the module-level ``data_train`` frame with its
    ``'target'`` column as the label, and written to ``fp + '.pmml'``.

    Args:
        clf: Estimator used as the final ``"classifier"`` step.
        feature: Not used by this function.
        fp: Output path without the ``.pmml`` extension.
    """
    # Threshold lookups: each takes the aliased ratio as X[0] and returns a
    # fixed constant per interval (the null case shares the lowest interval's
    # value).
    cf_orgnum_lookup = ExpressionTransformer(
        "-0.372631 if pandas.isnull(X[0]) else(-0.372631 if X[0]<=0.142858 else(-0.250718 if X[0]<=0.375001 else(0.071967 if X[0]<=0.666668 else 0.48501 )))"
    )
    cons_orgnum_lookup = ExpressionTransformer(
        "-1.083908 if pandas.isnull(X[0]) else(-1.083908 if X[0]<=0.600001 else(0.013565 if X[0]<=0.82353 else(0.131677 if X[0]<=0.946258 else 0.295649 )))"
    )

    # Columns whose coefficient of variation is derived.
    cv_columns = [
        'BR#DTA0000002#br_alm_m1_cell_nbank_sloan_allnum',
        'BR#DTA0000002#br_alm_m2_cell_nbank_sloan_allnum',
        'BR#DTA0000002#br_alm_m3_cell_nbank_sloan_allnum',
        'BR#DTA0000002#br_alm_m4_cell_nbank_sloan_allnum',
        'BR#DTA0000002#br_alm_m5_cell_nbank_sloan_allnum',
    ]
    cv_transformer = ExpressionTransformer(_get_std_express(len(cv_columns)))

    # Each entry is (input columns, transformer chain, mapper options).
    feature_defs = [
        (
            [
                'BR#DTA0000002#br_als_m3_cell_coon_orgnum',
                'BR#DTA0000002#br_als_m12_cell_coon_orgnum',
            ],
            [impute, express1],
            {'alias': 'BR#DTA0000002#br_als_cell_coon_orgnum_m3/m12'},
        ),
        (
            [
                'BR#DTA0000002#br_als_d7_cell_nbank_cons_allnum',
                'BR#DTA0000002#br_als_m12_cell_nbank_cons_allnum',
            ],
            [impute, express1],
            {'alias': 'BR#DTA0000002#br_als_cell_nbank_cons_allnum_d7/m12'},
        ),
        (
            [
                'BR#DTA0000002#br_als_m1_cell_nbank_cf_orgnum',
                'BR#DTA0000002#br_als_m6_cell_nbank_cf_orgnum',
            ],
            [impute, Alias(express1, 'var1'), cf_orgnum_lookup],
            {'alias': 'W_BR#DTA0000002#br_als_cell_nbank_cf_orgnum_m1/m6'},
        ),
        (
            [
                'BR#DTA0000002#br_als_m6_id_nbank_cons_orgnum',
                'BR#DTA0000002#br_als_m12_id_nbank_cons_orgnum',
            ],
            [impute, Alias(express1, 'var2'), cons_orgnum_lookup],
            {'alias': 'W_BR#DTA0000002#br_als_id_nbank_cons_orgnum_m6/m12'},
        ),
        (
            cv_columns,
            [impute5, cv_transformer],
            {'alias': 'BR#DTA0000002#br_alm_cell_nbank_sloan_allnum_m5_cv'},
        ),
    ]
    mapper = DataFrameMapper(feature_defs)

    steps = [('mapper', mapper), ('classifier', clf)]
    pipeline = PMMLPipeline(steps)
    # The whole training frame is passed in; the mapper picks the columns it
    # needs by name.
    pipeline.fit(data_train, data_train['target'])
    sklearn2pmml(pipeline, fp + '.pmml', with_repr=True)