用案例解释如何生成PMML模型文件。
在 Ubuntu 的 Python 环境下拟合 XGBoost 模型,并在 Python 中添加 JDK(Java SDK)的环境路径,导出 Java 可读取的 PMML 模型文件,以便将模型部署到 Java 平台。
1.安装ubuntu 版本 jdk
1.1从官网中下载对应的jdk版本
https://download.oracle.com/otn/java/jdk/8u361-b09/0ae14417abb444ebb02b9815e2103550/jdk-8u361-linux-x64.tar.gz?AuthParam=1680089271_bcf63bda2c61968fb00eda136bdea84a
1.2 指定安装包存放地址,并解压
root@hecs-87892:/usr/lib/jvm# cp /quant/联合建模/jdk-8u361-linux-x64.tar.gz java.tar.gz
root@hecs-87892:/usr/lib/jvm# sudo tar -C /usr/lib/jvm -xzf java.tar.gz
root@hecs-87892:/usr/lib/jvm# cd /usr/lib/jvm
root@hecs-87892:/usr/lib/jvm# ls
1.3 激活jdk 配置环境变量
cd /etc
vim profile
#打开profile 后并插入
export JAVA_HOME=/usr/lib/jvm/jdk1.8.0_361
export JRE_HOME=${JAVA_HOME}/jre
export CLASSPATH=.:${JAVA_HOME}/lib:${JRE_HOME}/lib
export PATH=${JAVA_HOME}/bin:$PATH
#完成插入,并:wq保存
source profile
#检查java 是否安装成功(能正常输出版本号即表示安装成功)
java -version
2.拟合PMML 格式模型
"""
part1:直接用PMML拟合
ubuntu 10.04.4 LTS X86_64
python==3.9
sklearn2pmml==0.91.1
scikit-learn==1.2.2
xgboost==0.90
joblib==1.2.0
numpy==1.20.3
"""
import numpy as np
from sklearn2pmml import *
import joblib
import pandas as pd
from xgboost import XGBClassifier
import os
# Append the JDK's bin directory to PATH for this process (and the child
# processes it starts) - presumably so that sklearn2pmml can launch `java`
# when exporting the PMML file.
# Original author's note: on Windows use the lowercase key 'path' instead.
os.environ['PATH']+=os.pathsep + '/usr/lib/jvm/jdk1.8.0_361/bin'
def to_score(x):
    """Convert a predicted probability into a score bounded to [300, 900].

    The probability is first clipped to [0.001, 0.999] so the log-odds are
    always finite, then mapped linearly in log-odds space:

        score = A - B * ln(x / (1 - x))

    A higher probability therefore yields a lower score.

    Args:
        x: Predicted probability of the positive class (a float).

    Returns:
        The score rounded to 2 decimals, limited to the range 300..900.
        When a limit is hit, the integer 300 or 900 is returned as-is.
    """
    import math

    # Clip the probability so that x / (1 - x) can neither be 0 nor divide
    # by zero.
    if x <= 0.001:
        x = 0.001
    elif x >= 0.999:
        x = 0.999

    A = 598.930226033773  # score returned at x == 0.5 (log-odds of 0)
    B = 43.280851226669   # points subtracted per unit of log-odds
    # PD0=25  (original author's note; presumably a scorecard calibration
    # setting - TODO confirm its meaning)
    result = round(A - B * math.log(x / (1 - x)), 2)

    # Final safety limits. NOTE(review): with the constants above the clipped
    # probability range maps to roughly 300..897.86, so the upper limit looks
    # like a guard that is not reached in practice.
    if result < 300:
        result = 300
    if result > 900:
        result = 900
    return result
# Hyper-parameters for the XGBoost classifier, kept in one mapping so they
# can be unpacked as keyword arguments. With max_depth=1 every tree is a
# single split (a decision stump).
# NOTE(review): `num_leaves` is a LightGBM parameter name and does not appear
# among XGBoost's documented parameters - confirm whether it has any effect.
params = dict(
    objective='binary:logistic',
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.5,
    gamma=0.0,
    importance_type='gain',
    learning_rate=0.1,
    max_delta_step=0,
    max_depth=1,
    min_child_weight=88,
    missing=np.nan,
    n_estimators=104,
    random_state=0,
    reg_alpha=1e-05,
    reg_lambda=1e-05,
    scale_pos_weight=1,
    subsample=0.5,
    num_leaves=2,
)
# ---- Load the scored samples and their labels ----
df = pd.read_csv('第二批授信样本模型跑分结果.csv', encoding='utf-8-sig')
label = pd.read_csv('y_label.csv', encoding='gb2312')
# Keep one label row per sm3_id so each id matches at most one label in the
# inner join below.
label = label.drop_duplicates(subset='sm3_id')
all_df = df.merge(
    label[['label', 'sm3_id']],
    how='inner',
    left_on='certif_id_sm3',
    right_on='sm3_id',
)

# The first 88 columns of the merged frame are used as the model features.
feat = list(all_df.columns[:88])

# Replace -1 with NaN in the feature columns (presumably -1 is a
# "missing value" sentinel, matching `missing=np.nan` in the model params).
# Fix: `merge` returns a new DataFrame, so cleaning only `df` after the merge
# never reached the training/scoring data. The replacement is now applied to
# `all_df`; the `df` update is kept for any later code that relies on it.
df[feat] = df[feat].replace(-1, np.nan)
all_df[feat] = all_df[feat].replace(-1, np.nan)

# ---- Fit on the rows flagged as the training split ----
train = all_df[all_df['type'] == 'train']
X_train = train[feat]
y_train = train['label']

bst = XGBClassifier(**params)
pipeline = PMMLPipeline([('classifier', bst)])
pipeline.fit(X_train, y_train)

# ---- Score every merged sample with the positive-class probability ----
proba = pipeline.predict_proba(all_df[feat])[:, 1]
score = [to_score(p) for p in proba]
all_df['new_score'] = score

# ---- Export the fitted pipeline as a PMML file ----
# Original author's note: "needs to be installed" - presumably referring to
# the sklearn2pmml package and the Java runtime it calls.
sklearn2pmml(pipeline, r'ysd_sx_func_xgb_4_model.pmml', with_repr=True)