机器学习模型导出为PMML的方法
背景
训练好的机器学习模型如需上线部署,通常要先转换为 PMML 格式。
系统准备
1、在 Python 环境中安装 sklearn2pmml 包(例如:pip install sklearn2pmml);
2、安装 Java,并将其添加到环境变量 PATH 中(sklearn2pmml 在转换时需要调用 java 命令)。
下面以 XGBoost 模型为例,演示如何生成 PMML 文件:
import pandas as pd
from xgboost import XGBClassifier
import sklearn2pmml
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset from scikit-learn and wrap its feature matrix
# in a DataFrame whose columns are labelled with the dataset's feature names.
dataset = load_breast_cancer()
X = pd.DataFrame(dataset.data, columns=dataset.feature_names)

# Wrap a 5-estimator XGBoost classifier as the single 'classifier' step of a
# PMMLPipeline, then fit the pipeline on the features and the dataset target.
model = XGBClassifier(n_estimators=5)
steps = [('classifier', model)]
pipeline = sklearn2pmml.PMMLPipeline(steps)
pipeline.fit(X, dataset.target)

# Convert the fitted pipeline and write the result to 'model.pmml.xml'.
sklearn2pmml.sklearn2pmml(pipeline, 'model.pmml.xml')
生成的PMML格式文件
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<!-- PMML 4.3 document: an XGBoost classifier expressed as a two-step model chain. -->
<PMML xmlns="http://www.dmg.org/PMML-4_3"
      xmlns:data="http://jpmml.org/jpmml-model/InlineTable"
      version="4.3">
  <Header>
    <Application name="JPMML-SkLearn" version="1.5.14"/>
    <Timestamp>2019-06-06T01:51:01Z</Timestamp>
  </Header>
  <!-- Field declarations: the categorical target "y" (values 0 and 1), then the continuous float inputs. -->
  <DataDictionary>
    <DataField name="y" optype="categorical" dataType="integer">
      <Value value="0"/>
      <Value value="1"/>
    </DataField>
    <DataField name="mean texture" optype="continuous" dataType="float"/>
    <DataField name="mean smoothness" optype="continuous" dataType="float"/>
    <DataField name="mean concave points" optype="continuous" dataType="float"/>
    <DataField name="radius error" optype="continuous" dataType="float"/>
    <DataField name="texture error" optype="continuous" dataType="float"/>
    <DataField name="perimeter error" optype="continuous" dataType="float"/>
    <DataField name="area error" optype="continuous" dataType="float"/>
    <DataField name="concave points error" optype="continuous" dataType="float"/>
    <DataField name="worst radius" optype="continuous" dataType="float"/>
    <DataField name="worst texture" optype="continuous" dataType="float"/>
    <DataField name="worst perimeter" optype="continuous" dataType="float"/>
    <DataField name="worst smoothness" optype="continuous" dataType="float"/>
    <DataField name="worst concavity" optype="continuous" dataType="float"/>
    <DataField name="worst concave points" optype="continuous" dataType="float"/>
  </DataDictionary>
  <!-- Top-level classification model; its segments are combined with the "modelChain" method. -->
  <MiningModel functionName="classification" algorithmName="XGBoost (GBTree)" x-mathContext="float">
    <MiningSchema>
      <MiningField name="y" usageType="target"/>
      <MiningField name="mean texture"/>
      <MiningField name="mean smoothness"/>
      <MiningField name="mean concave points"/>
      <MiningField name="radius error"/>
      <MiningField name="texture error"/>
      <MiningField name="perimeter error"/>
      <MiningField name="area error"/>
      <MiningField name="concave points error"/>
      <MiningField name="worst radius"/>
      <MiningField name="worst texture"/>
      <MiningField name="worst perimeter"/>
      <MiningField name="worst smoothness"/>
      <MiningField name="worst concavity"/>
      <MiningField name="worst concave points"/>
    </MiningSchema>
    <Segmentation multipleModelMethod="modelChain">
      <!-- Chain step 1: five regression trees combined with "sum"; the result is the non-final output field "xgbValue". -->
      <Segment id="1">
        <True/>
        <MiningModel functionName="regression" x-mathContext="float">
          <MiningSchema>
            <MiningField name="mean texture"/>
            <MiningField name="mean smoothness"/>
            <MiningField name="mean concave points"/>
            <MiningField name="radius error"/>
            <MiningField name="texture error"/>
            <MiningField name="perimeter error"/>
            <MiningField name="area error"/>
            <MiningField name="concave points error"/>
            <MiningField name="worst radius"/>
            <MiningField name="worst texture"/>
            <MiningField name="worst perimeter"/>
            <MiningField name="worst smoothness"/>
            <MiningField name="worst concavity"/>
            <MiningField name="worst concave points"/>
          </MiningSchema>
          <Output>
            <OutputField name="xgbValue"
                         optype="continuous"
                         dataType="float"
                         feature="predictedValue"
                         isFinalResult="false"/>
          </Output>
          <Segmentation multipleModelMethod="sum">
            <!-- Tree 1 of 5 -->
            <Segment id="1">
              <True/>
              <TreeModel functionName="regression"
                         missingValueStrategy="none"
                         noTrueChildStrategy="returnLastPrediction"
                         splitCharacteristic="multiSplit"
                         x-mathContext="float">
                <MiningSchema>
                  <MiningField name="mean texture"/>
                  <MiningField name="mean concave points"/>
                  <MiningField name="radius error"/>
                  <MiningField name="worst radius"/>
                  <MiningField name="worst texture"/>
                  <MiningField name="worst concavity"/>
                  <MiningField name="worst concave points"/>
                </MiningSchema>
                <Node score="0.19397591">
                  <True/>
                  <Node score="0.13846155">
                    <SimplePredicate field="worst radius" operator="greaterOrEqual" value="16.795"/>
                    <Node score="-0.022222223">
                      <SimplePredicate field="mean texture" operator="greaterOrEqual" value="16.11"/>
                      <Node score="-0.19534884">
                        <SimplePredicate field="worst concavity" operator="greaterOrEqual" value="0.1907"/>
                      </Node>
                    </Node>
                    <Node score="-0.13333334">
                      <SimplePredicate field="mean concave points" operator="greaterOrEqual" value="0.066259995"/>
                    </Node>
                  </Node>
                  <Node score="0.09565218">
                    <SimplePredicate field="worst concave points" operator="greaterOrEqual" value="0.1358"/>
                    <Node score="-0.13548388">
                      <SimplePredicate field="worst texture" operator="greaterOrEqual" value="25.67"/>
                    </Node>
                  </Node>
                  <Node score="0.022222223">
                    <SimplePredicate field="radius error" operator="greaterOrEqual" value="0.6431"/>
                  </Node>
                </Node>
              </TreeModel>
            </Segment>
            <!-- Tree 2 of 5 -->
            <Segment id="2">
              <True/>
              <TreeModel functionName="regression"
                         missingValueStrategy="none"
                         noTrueChildStrategy="returnLastPrediction"
                         splitCharacteristic="multiSplit"
                         x-mathContext="float">
                <MiningSchema>
                  <MiningField name="mean concave points"/>
                  <MiningField name="area error"/>
                  <MiningField name="worst radius"/>
                  <MiningField name="worst texture"/>
                  <MiningField name="worst concavity"/>
                  <MiningField name="worst concave points"/>
                </MiningSchema>
                <Node score="0.17654903">
                  <True/>
                  <Node score="0.12931876">
                    <SimplePredicate field="worst radius" operator="greaterOrEqual" value="16.795"/>
                    <Node score="-0.020989144">
                      <SimplePredicate field="worst texture" operator="greaterOrEqual" value="19.91"/>
                      <Node score="-0.17803843">
                        <SimplePredicate field="worst concavity" operator="greaterOrEqual" value="0.1907"/>
                      </Node>
                    </Node>
                    <Node score="-0.12390131">
                      <SimplePredicate field="mean concave points" operator="greaterOrEqual" value="0.070820004"/>
                    </Node>
                  </Node>
                  <Node score="0.087922364">
                    <SimplePredicate field="worst concave points" operator="greaterOrEqual" value="0.1358"/>
                    <Node score="-0.12419658">
                      <SimplePredicate field="worst texture" operator="greaterOrEqual" value="25.67"/>
                    </Node>
                  </Node>
                  <Node score="0.019106947">
                    <SimplePredicate field="area error" operator="greaterOrEqual" value="48.975"/>
                  </Node>
                </Node>
              </TreeModel>
            </Segment>
            <!-- Tree 3 of 5 -->
            <Segment id="3">
              <True/>
              <TreeModel functionName="regression"
                         missingValueStrategy="none"
                         noTrueChildStrategy="returnLastPrediction"
                         splitCharacteristic="multiSplit"
                         x-mathContext="float">
                <MiningSchema>
                  <MiningField name="mean smoothness"/>
                  <MiningField name="mean concave points"/>
                  <MiningField name="concave points error"/>
                  <MiningField name="worst texture"/>
                  <MiningField name="worst perimeter"/>
                  <MiningField name="worst smoothness"/>
                  <MiningField name="worst concave points"/>
                </MiningSchema>
                <Node score="0.16095768">
                  <True/>
                  <Node score="-0.0440385">
                    <SimplePredicate field="worst perimeter" operator="greaterOrEqual" value="105.95"/>
                    <Node score="-0.008407075">
                      <SimplePredicate field="mean concave points" operator="greaterOrEqual" value="0.048864998"/>
                      <Node score="-0.16484869">
                        <SimplePredicate field="worst texture" operator="greaterOrEqual" value="20.645"/>
                      </Node>
                    </Node>
                    <Node score="0.14061172">
                      <SimplePredicate field="concave points error" operator="greaterOrEqual" value="0.0101255"/>
                    </Node>
                  </Node>
                  <Node score="-0.017814701">
                    <SimplePredicate field="worst concave points" operator="greaterOrEqual" value="0.1589"/>
                    <Node score="-0.10992744">
                      <SimplePredicate field="mean smoothness" operator="greaterOrEqual" value="0.108950004"/>
                    </Node>
                  </Node>
                  <Node score="0.015987273">
                    <SimplePredicate field="worst smoothness" operator="greaterOrEqual" value="0.17825"/>
                  </Node>
                </Node>
              </TreeModel>
            </Segment>
            <!-- Tree 4 of 5 -->
            <Segment id="4">
              <True/>
              <TreeModel functionName="regression"
                         missingValueStrategy="none"
                         noTrueChildStrategy="returnLastPrediction"
                         splitCharacteristic="multiSplit"
                         x-mathContext="float">
                <MiningSchema>
                  <MiningField name="mean texture"/>
                  <MiningField name="mean concave points"/>
                  <MiningField name="worst radius"/>
                  <MiningField name="worst perimeter"/>
                  <MiningField name="worst concavity"/>
                  <MiningField name="worst concave points"/>
                </MiningSchema>
                <Node score="0.15114407">
                  <True/>
                  <Node score="0.12114004">
                    <SimplePredicate field="worst radius" operator="greaterOrEqual" value="16.795"/>
                    <Node score="-0.021503419">
                      <SimplePredicate field="mean texture" operator="greaterOrEqual" value="16.11"/>
                      <Node score="-0.15504909">
                        <SimplePredicate field="worst concavity" operator="greaterOrEqual" value="0.1907"/>
                      </Node>
                    </Node>
                    <Node score="-0.114376836">
                      <SimplePredicate field="mean concave points" operator="greaterOrEqual" value="0.066259995"/>
                    </Node>
                  </Node>
                  <Node score="-0.013891182">
                    <SimplePredicate field="worst concave points" operator="greaterOrEqual" value="0.16029999"/>
                    <Node score="-0.1359692">
                      <SimplePredicate field="worst perimeter" operator="greaterOrEqual" value="98.26"/>
                    </Node>
                  </Node>
                  <Node score="0.036950365">
                    <SimplePredicate field="worst concave points" operator="greaterOrEqual" value="0.1358"/>
                  </Node>
                </Node>
              </TreeModel>
            </Segment>
            <!-- Tree 5 of 5 -->
            <Segment id="5">
              <True/>
              <TreeModel functionName="regression"
                         missingValueStrategy="none"
                         noTrueChildStrategy="returnLastPrediction"
                         splitCharacteristic="multiSplit"
                         x-mathContext="float">
                <MiningSchema>
                  <MiningField name="texture error"/>
                  <MiningField name="perimeter error"/>
                  <MiningField name="worst radius"/>
                  <MiningField name="worst concavity"/>
                  <MiningField name="worst concave points"/>
                </MiningSchema>
                <Node score="0.1425488">
                  <True/>
                  <Node score="0.05848614">
                    <SimplePredicate field="worst radius" operator="greaterOrEqual" value="16.795"/>
                    <Node score="0.013335379">
                      <SimplePredicate field="texture error" operator="greaterOrEqual" value="0.49229997"/>
                      <Node score="-0.14551087">
                        <SimplePredicate field="worst concavity" operator="greaterOrEqual" value="0.1907"/>
                      </Node>
                    </Node>
                  </Node>
                  <Node score="-0.010981116">
                    <SimplePredicate field="worst concave points" operator="greaterOrEqual" value="0.16029999"/>
                    <Node score="-0.12878385">
                      <SimplePredicate field="perimeter error" operator="greaterOrEqual" value="1.891"/>
                    </Node>
                  </Node>
                  <Node score="0.03379069">
                    <SimplePredicate field="worst concave points" operator="greaterOrEqual" value="0.1358"/>
                  </Node>
                </Node>
              </TreeModel>
            </Segment>
          </Segmentation>
        </MiningModel>
      </Segment>
      <!-- Chain step 2: regression model with "logit" normalization over "xgbValue"; outputs probability(0) and probability(1). -->
      <Segment id="2">
        <True/>
        <RegressionModel functionName="classification" normalizationMethod="logit" x-mathContext="float">
          <MiningSchema>
            <MiningField name="y" usageType="target"/>
            <MiningField name="xgbValue"/>
          </MiningSchema>
          <Output>
            <OutputField name="probability(0)"
                         optype="continuous"
                         dataType="float"
                         feature="probability"
                         value="0"/>
            <OutputField name="probability(1)"
                         optype="continuous"
                         dataType="float"
                         feature="probability"
                         value="1"/>
          </Output>
          <RegressionTable intercept="0.0" targetCategory="1">
            <NumericPredictor name="xgbValue" coefficient="1.0"/>
          </RegressionTable>
          <RegressionTable intercept="0.0" targetCategory="0"/>
        </RegressionModel>
      </Segment>
    </Segmentation>
  </MiningModel>
</PMML>