Spark ML Pipeline交叉验证之线性回归
1.1 模型训练
1.1.1 输入参数
{
"modelName": "航班预测",
"numFolds": "5",
"labelColumn": "ArrDelay",
"maxIters": [
10,
20,
50,
100,
200
],
"regParams": [
0.01,
0.1,
1,
10
],
"elasticNetParams": [
0,
0.1,
0.5,
0.8,
1
]
}
1.1.2 训练代码
import com.cetc.common.conf.MachineLearnModel
import com.cetc.miner.compute.utils.{ModelUtils, Utils}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.{StandardScaler, VectorAssembler}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{DataFrame, SparkSession}
import scala.collection.JavaConverters._
class LinearRegressionBestTrain {
  val logger: org.apache.log4j.Logger = org.apache.log4j.Logger.getLogger(classOf[LinearRegressionBestTrain])

  /**
   * Trains a linear-regression model, selecting hyper-parameters by k-fold
   * cross-validated grid search over maxIter, regParam and elasticNetParam.
   *
   * @param df           training data; every column except the label column is used as a feature
   * @param id           operator id (unused here; kept for the framework's call contract)
   * @param name         operator name (unused here; kept for the framework's call contract)
   * @param conf         JSON configuration: model name, fold count, label column, parameter grids
   * @param sparkSession active Spark session (unused here; kept for the framework's call contract)
   * @return single-element list holding the persisted model descriptor
   */
  def execute(df: DataFrame, id: String, name: String, conf: String, sparkSession: SparkSession): java.util.List[MachineLearnModel] = {
    df.cache()
    logger.info("训练集个数=========" + df.count())
    val params = Utils.conf2Class(conf)

    // Assemble all non-label columns into a single "features" vector column
    // (VectorAssembler requires non-string input columns).
    val assembler = new VectorAssembler()
      .setInputCols(df.drop(params.getLabelColumn).columns)
      .setOutputCol("features")

    // Standardization stage. NOTE(fix): the scaler is left UNFIT and placed in
    // the pipeline, so CrossValidator refits it on each training fold;
    // pre-fitting it on the full dataset (as before) leaks fold statistics.
    val standardScaler = new StandardScaler()
      .setInputCol(assembler.getOutputCol)
      .setOutputCol("scaledFeatures")
      .setWithStd(true)   // scale to unit standard deviation
      .setWithMean(false) // do not center (keeps sparse vectors sparse)

    // NOTE(fix): the regressor now consumes the scaled features; previously it
    // read the raw "features" column, so the StandardScaler stage had no effect.
    val lr = new LinearRegression()
      .setFeaturesCol(standardScaler.getOutputCol) // feature input
      .setLabelCol(params.getLabelColumn)          // target column

    // Assemble → scale → regress.
    val pipeline = new Pipeline().setStages(Array(assembler, standardScaler, lr))

    // Evaluator used by the cross-validator to rank candidate models (RMSE; lower is better).
    val regressionEvaluator = new RegressionEvaluator()
      .setLabelCol(params.getLabelColumn) // ground truth
      .setPredictionCol("prediction")     // model output
      .setMetricName("rmse")              // root mean squared error

    // Hyper-parameter grid: maxIters x regParams x elasticNetParams combinations.
    // Typed params (lr.maxIter, ...) replace the stringly-typed getParam lookups,
    // and JavaConverters replaces the deprecated JavaConversions implicit.
    // NOTE(review): assumes the conf lists deserialize to java.lang.Integer /
    // java.lang.Double elements — confirm against Utils.conf2Class.
    val paramMap = new ParamGridBuilder()
      .addGrid(lr.maxIter, params.getMaxIters.asScala.map(_.intValue()))
      .addGrid(lr.regParam, params.getRegParams.asScala.map(_.doubleValue()))
      .addGrid(lr.elasticNetParam, params.getElasticNetParams.asScala.map(_.doubleValue()))
      .build()

    // k-fold cross-validation: each parameter combination is trained on k-1
    // folds and scored on the held-out fold; the best average RMSE wins.
    val crossValidator = new CrossValidator()
      .setEstimator(pipeline)
      .setEstimatorParamMaps(paramMap) // hyper-parameter combinations to try
      .setNumFolds(params.getNumFolds) // number of folds k
      .setEvaluator(regressionEvaluator)

    val model = crossValidator.fit(df)
    df.unpersist() // release the cached training data once fitting is done

    // Extract the winning LinearRegressionModel (stage index 2 of the pipeline).
    val bestModel = model.bestModel.asInstanceOf[PipelineModel]
    val linearModel = bestModel.stages(2).asInstanceOf[LinearRegressionModel]
    logger.info("模型类型========" + linearModel.getClass) // was println of a tuple

    // Wrap the model in the project's descriptor and record it in the database.
    val modelObject: MachineLearnModel = ModelUtils.saveModel(linearModel, params.getModelName, 0, conf, 1, 0.0)
    ModelUtils.model2mysql(modelObject)
    List(modelObject).asJava
  }
}
1.2 模型评估
1.2.1 输入参数
{"labelColumn":""}
1.2.2 评估代码
import java.util
import com.cetc.common.conf.MachineLearnModel
import com.cetc.miner.compute.ml.train.LinearRegressionBestTrain
import com.cetc.miner.compute.utils.{ModelUtils, Utils}
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.ml.regression.LinearRegressionModel
class LinearRegressionAssess {
// NOTE(fix): logger was copy-pasted from the training class and reported events
// under LinearRegressionBestTrain; it now names this class correctly.
val logger: org.apache.log4j.Logger = org.apache.log4j.Logger.getLogger(classOf[LinearRegressionAssess])
/**
* 线性回归 预测模型评估
* @param df
* @param model
* @param id
* @param name
* @param conf
* @param sparkSession
* @return
*/
/**
 * Evaluates a trained linear-regression model on a test set and reports RMSE.
 *
 * @param df           test data containing the label column
 * @param model        descriptor of the persisted model to load and evaluate
 * @param id           operator id (unused here; kept for the framework's call contract)
 * @param name         operator name (unused here; kept for the framework's call contract)
 * @param conf         JSON configuration carrying the label column name
 * @param sparkSession active Spark session (unused here; kept for the framework's call contract)
 * @return single-element list holding the RMSE on the test set
 */
def execute(df: DataFrame, model: MachineLearnModel, id: String, name: String, conf: String, sparkSession: SparkSession): java.util.List[Double] = {
  logger.info("测试集个数=========" + df.count())
  val params = Utils.conf2Class(conf)
  // Reshape the raw frame into the (features, label) layout the model expects.
  val userProfile = Utils.trans2SupervisedLearning(df, params.getLabelColumn)
  val linearModel = ModelUtils.loadModel[LinearRegressionModel](model)
  // RMSE evaluator over the model's "prediction" column vs. the true label.
  val regressionEvaluator = new RegressionEvaluator()
    .setLabelCol(params.getLabelColumn)
    .setPredictionCol("prediction")
    .setMetricName("rmse")
  val testDF = linearModel.transform(userProfile)
  testDF.show()
  val rmse = regressionEvaluator.evaluate(testDF)
  logger.info("评估结果 均方根误差 RMSE ========" + rmse)
  // Record the score against the stored model, then return it to the caller.
  ModelUtils.updateModel2mysql(model.getName, rmse)
  val scores = new util.ArrayList[Double]()
  scores.add(rmse)
  scores // last expression is the result; no `return` needed
}