我们知道模型通过离线方式训练出来后,怎么进行预测呢?如何在线上实时打分。
我们可以将模型结果转为一种叫pmml的格式文件,然后交由openscoring的Web 服务组件来进行线上打分预测。openscoring是一套解决方案(官网是openscoring.io),github上有一位叫vruusmann的大神基于openscoring的组件,实现了一套REST Web服务,用于R、Scikit-Learn和Apache Spark模型的实时评分,名字还是叫openscoring,据说单次预测耗时小于1ms。
openscoring REST WEB服务代码
Fork 路径:https://github.com/dearbaba/openscoring
原路径:https://github.com/openscoring/openscoring
java -Dconfig.file=application.conf -Djava.util.logging.config.file=logging.properties -jar openscoring-server-executable-${version}.jar
推送模型:
#lrmodel即为模型的ID
curl -X PUT --data-binary @lrmodel.pmml -H "Content-type: text/xml" http://localhost:8080/openscoring/model/lrmodel
预测打分:
curl -X POST --data-binary @lrmodel.json -H "Content-type: application/json" http://localhost:8080/openscoring/model/lrmodel
具体其它接口请参看GIT文档。
例如,我们使用Spark ML训练模型,并将训练结果保存为一个pmml格式文件,大致代码如下:
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.DecisionTreeRegressionModel
import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.storage.StorageLevel
import org.apache.spark.ml.linalg.{ Vector, Vectors }
import org.apache.commons.lang3.StringUtils
import scala.util.parsing.json.JSONObject
import java.io.FileReader
import scala.io.Source
import scala.util.parsing.json.JSON
import spray.json.JsObject
import org.json4s.jackson.Json
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._
import org.apache.spark.util.LongAccumulator
import cn.pa18.spark.util.DBConnectionDao
import org.apache.spark.sql.Row
import org.apache.spark.broadcast.Broadcast
import scala.util.control.Breaks._
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.ml.feature.CountVectorizer
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.commons.lang3.math.NumberUtils
import com.sun.tools.example.debug.expr.ExpressionParser
import scala.tools.reflect.Eval
import scala.reflect.runtime.currentMirror
import scala.tools.reflect.ToolBox
import org.apache.spark.ml.feature.CountVectorizerModel
import org.apache.spark.ml.classification.LogisticRegressionModel
import spray.json.JsValue
import spray.json.JsNumber
import java.text.SimpleDateFormat
import java.util.Calendar
import org.apache.spark.sql.SaveMode
import java.util.Date
import org.apache.spark.sql.Dataset
import org.apache.spark.ml.PipelineModel
import org.jpmml.sparkml.PMMLBuilder
import java.io.File
import org.apache.hadoop.fs.Path
/**
 * Trains a logistic-regression model from a Hive table and persists its
 * artifacts: per-feature weights to an HDFS text file and the fitted
 * pipeline to a PMML file (via save_to_PMML).
 *
 * NOTE(review): this block is visibly truncated — the text jumps from the
 * `if (tagmap.size > 0) {` guard straight into `for (key lrModel.coefficients(i))`,
 * and `lrModel`, `kvWeightData`, `auc`, `trainingDF`, `pipelineModel` are used
 * but never defined in the visible code. The feature-extraction, training and
 * evaluation steps were lost in extraction; recover them from the original
 * source before relying on this listing.
 *
 * @param hiveContext SparkSession used to read the training data from Hive
 * @param appid       application identifier (not referenced in the visible code)
 */
def lr_training(hiveContext: SparkSession,appid: String): Unit = {
import hiveContext.implicits._
// Pull (userid, label, tagmap) rows; tagmap is a feature-name -> value map.
val data = hiveContext.sql(s"""
select userid,label,b.tagmap from table_name
""".stripMargin).rdd.map(
row => {
val features = ArrayBuffer[String]()
val tdid = row.getAs[String]("userid")
val label = row.getAs[Int]("label") // positive/negative sample flag
val tagmap = row.getAs[scala.collection.immutable.Map[String, String]]("tagmap")
// NOTE(review): code missing between the next two lines — see header comment.
if (tagmap.size > 0) {
for (key lrModel.coefficients(i))
}
// Collect model diagnostics alongside the per-feature weights.
kvWeightData += ("intercept_b" -> lrModel.intercept)
kvWeightData += ("auc" -> auc)
// Save the feature weights to an HDFS text file, partitioned by date.
val dateFormat = new SimpleDateFormat("yyyyMMdd")
val todayDate = dateFormat.format(new Date())
val feature_importance_save_path = hdfs_uri+s"/model/result/${todayDate}/"
// repartition(1) so the weights land in a single output file.
hiveContext.sparkContext.parallelize(kvWeightData.toArray[(String,Double)]).map(row => row._1 + "\t" + row._2).repartition(1).saveAsTextFile(feature_importance_save_path)
// HDFS directory for the exported PMML model.
val hdfs_path = hdfs_uri+s"/data/spark/rym/models/"
// File name of the exported PMML model.
val hdfs_file_name = "lrmodel.pmml"
// Export the fitted pipeline to PMML and upload it to HDFS.
save_to_PMML(trainingDF, pipelineModel, hdfs_path, hdfs_file_name)
}
/**
 * Exports a fitted Spark ML pipeline to a PMML file and uploads it to HDFS.
 *
 * The PMML document is first built as a local file named `hdfs_file_name`
 * in the driver's working directory (PMMLBuilder can only write locally),
 * then copied into `hdfs_path` on HDFS. The local temporary file is removed
 * after a successful upload.
 *
 * @param trainingDF     dataset whose schema describes the model's input fields
 * @param pipelineModel  the fitted pipeline to convert to PMML
 * @param hdfs_path      target HDFS directory; created if it does not exist
 * @param hdfs_file_name file name used both for the local temp file and on HDFS
 */
def save_to_PMML(trainingDF: Dataset[Row], pipelineModel: PipelineModel, hdfs_path: String, hdfs_file_name: String): Unit = {
  println("start to save model to pmml file ... ...")
  val pmmlBuilder = new PMMLBuilder(trainingDF.schema, pipelineModel)
  // buildFile returns the File it wrote; keep it so the upload uses the exact path.
  val localPmmlFile = pmmlBuilder.buildFile(new File(hdfs_file_name))
  // NOTE(review): hdfs_uri is a free variable defined elsewhere in this file — confirm.
  val hdfs = org.apache.hadoop.fs.FileSystem.get(new java.net.URI(hdfs_uri), new org.apache.hadoop.conf.Configuration())
  if (!HDFSHelper.exists(hdfs, hdfs_path)) {
    HDFSHelper.createFolder(hdfs, hdfs_path)
  }
  // delSrc = true deletes the local temp PMML file once the copy succeeds;
  // the previous version left it behind in the working directory on every run.
  hdfs.copyFromLocalFile(true, new Path(localPmmlFile.getAbsolutePath), new Path(hdfs_path))
}