首先，我们构建一个 XGBoost 模型并将其保存为 PMML 格式。这里用到的包是 sklearn2pmml，它可以把 sklearn 中的模型（流水线）导出为 PMML 文件。
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn2pmml import PMMLPipeline
from sklearn_pandas import DataFrameMapper
from sklearn2pmml import sklearn2pmml
# Train an XGBoost binary classifier on a fixed set of risk features and
# export the fitted pipeline as a PMML document with sklearn2pmml.

# Input workbook and output PMML file (absolute paths on the author's machine).
DATA_PATH = '/Users/huoshirui/Desktop/xyworking/pythonData/dataClean/kexin_data_huoshirui.xlsx'
PMML_PATH = "/Users/huoshirui/Desktop/test/PMML/xgboost.pmml"

# Label column and the feature columns handed to the model, in mapping order.
# NOTE(review): there is no kx_new_risk_9 / kx_new_risk_10 entry -- presumably
# intentional; confirm against the dataset.
TARGET_COLUMN = "target"
FEATURE_COLUMNS = [
    'kx_output_riskscore',
    'kx_new_risk_0',
    'kx_new_risk_1',
    'kx_new_risk_2',
    'kx_new_risk_3',
    'kx_new_risk_4',
    'kx_new_risk_5',
    'kx_new_risk_6',
    'kx_new_risk_7',
    'kx_new_risk_8',
    'kx_new_risk_11',
    'kx_new_risk_12',
    'kx_new_risk_13',
    'kx_new_risk_14',
    'kx_new_risk_15',
    'kx_new_risk_sumList',
    'kx_new_is_riskList',
]

df = pd.read_excel(DATA_PATH)
# mbl_no is removed before training (presumably an identifier column -- confirm).
df = df.drop(columns=['mbl_no'])

clf = XGBClassifier(
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=4,
    min_child_weight=1,
    gamma=0.0001,
    subsample=0.3,
    colsample_bytree=0.8,
    colsample_bylevel=0.7,
    objective='binary:logistic',
    # n_jobs / random_state are the scikit-learn wrapper's names for the
    # legacy nthread / seed arguments; the values are unchanged.
    n_jobs=-1,
    scale_pos_weight=1,
    random_state=666,
)

# One single-column entry per feature; a None transformer passes the column
# through without any preprocessing.
mapper = DataFrameMapper([([column], None) for column in FEATURE_COLUMNS])

pipeline = PMMLPipeline([('mapper', mapper), ("classifier", clf)])
# Fit on every column except the label; the mapper picks out the features.
pipeline.fit(df[df.columns.difference([TARGET_COLUMN])], df[TARGET_COLUMN])

# Save the fitted pipeline as PMML; with_repr=True also records the pipeline's
# string representation in the generated document.
sklearn2pmml(pipeline, PMML_PATH, with_repr=True)
然后我们就可以在上面指定的输出路径下得到 xgboost.pmml 文件。