本文主要内容如下:
- 介绍Scala中使用网格搜索的流程
- 使用Pipeline对代码做简单的整合
- 网格搜索支持并行执行(由CrossValidator的parallelism参数控制,默认值为1,即串行)
- 输出每个超参组合的avgMetric
- 对源码做一点解析
版本信息:
- <scala.version>2.11.12</scala.version>
- <spark.version>2.4.3</spark.version>
数据构造
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
// Spark settings for this local demo, kept as a list so they are easy to scan and extend.
// Every value is given in its string form.
val builder = Seq(
  "spark.executor.heartbeatInterval" -> "60s",
  "spark.network.timeout" -> "120s",
  "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
  "spark.kryoserializer.buffer.max" -> "512m",
  "spark.dynamicAllocation.enabled" -> "false",
  "spark.sql.inMemoryColumnarStorage.compressed" -> "true",
  "spark.sql.inMemoryColumnarStorage.batchSize" -> "10000",
  "spark.sql.broadcastTimeout" -> "600",
  "spark.sql.autoBroadcastJoinThreshold" -> "-1",
  "spark.sql.crossJoin.enabled" -> "true"
).foldLeft(SparkSession.builder().appName("LR").master("local[*]")) {
  case (acc, (key, value)) => acc.config(key, value)
}
// Reuse an existing session when there is one, otherwise start a new one.
val spark = builder.getOrCreate()
// Only log errors so the demo output stays readable.
spark.sparkContext.setLogLevel("ERROR")
// Needed for the Seq(...).toDF(...) conversion used below.
import spark.implicits._
import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@44169a0f
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@14fe1df4
import spark.implicits._
// Toy training set: an id, four numeric features (x1..x4) and a binary label.
// Declared as `val` because the DataFrame is never reassigned.
val dfTrain = Seq(
  (1, 5.1, 3.5, 1.4, 0.2, 0),
  (2, 4.9, 3.0, 1.4, 0.2, 1),
  (3, 4.7, 3.2, 1.3, 0.2, 0),
  (4, 4.6, 3.1, 1.5, 0.2, 1),
  (5, 5.0, 3.6, 1.4, 0.2, 0),
  (56, 5.7, 2.8, 4.5, 1.3, 1),
  (57, 5.3, 3.3, 4.7, 1.6, 0),
  (58, 4.9, 2.4, 3.3, 1.0, 1),
  (59, 6.6, 3.9, 4.6, 1.3, 1),
  (60, 5.2, 2.7, 3.9, 1.4, 0)
).toDF("id", "x1", "x2", "x3", "x4", "label")
// The test set is just the training set again; it is only used to demonstrate prediction.
val dfTest = dfTrain
dfTrain.show()
+---+---+---+---+---+-----+
| id| x1| x2| x3| x4|label|
+---+---+---+---+---+-----+
| 1|5.1|3.5|1.4|0.2| 0|
| 2|4.9|3.0|1.4|0.2| 1|
| 3|4.7|3.2|1.3|0.2| 0|
| 4|4.6|3.1|1.5|0.2| 1|
| 5|5.0|3.6|1.4|0.2| 0|
| 56|5.7|2.8|4.5|1.3| 1|
| 57|5.3|3.3|4.7|1.6| 0|
| 58|4.9|2.4|3.3|1.0| 1|
| 59|6.6|3.9|4.6|1.3| 1|
| 60|5.2|2.7|3.9|1.4| 0|
+---+---+---+---+---+-----+
dfTrain: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
dfTest: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
Pipeline
本文重点是梳理scala网格搜索功能,所以没有复杂的预处理~
// Feature step: pack the four numeric columns into one vector column named "features".
val assemble = new VectorAssembler()
  .setOutputCol("features")
  .setInputCols(Array("x1", "x2", "x3", "x4"))
// Estimator: logistic regression with 10 iterations and a starting regParam of 0.01.
val lr = new LogisticRegression()
  .setRegParam(0.01)
  .setMaxIter(10)
// Pipeline: run the assembler first, then the classifier.
val pipeline = new Pipeline().setStages(Array[PipelineStage](assemble, lr))
assemble: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_74a3344707a6
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_ca16d22b39e3
pipeline: org.apache.spark.ml.Pipeline = pipeline_097ca523b766
网格搜索
网格参数设置
参数含义不多介绍,重点是把cv+网格搜索的流程跑通
// Candidate values for the two hyper-parameters being tuned.
val elasticNetParamArray = Array[Double](0.2, 0.3, 0.5)
val regParamArray = Array[Double](0.01, 0.1)
// Every combination of the candidates above: 3 x 2 = 6 parameter maps.
val paramGrid: Array[ParamMap] = new ParamGridBuilder()
  .addGrid(lr.elasticNetParam, elasticNetParamArray)
  .addGrid(lr.regParam, regParamArray)
  .build()
elasticNetParamArray: Array[Double] = Array(0.2, 0.3, 0.5)
regParamArray: Array[Double] = Array(0.01, 0.1)
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
logreg_ca16d22b39e3-elasticNetParam: 0.2,
logreg_ca16d22b39e3-regParam: 0.01
}, {
logreg_ca16d22b39e3-elasticNetParam: 0.3,
logreg_ca16d22b39e3-regParam: 0.01
}, {
logreg_ca16d22b39e3-elasticNetParam: 0.5,
logreg_ca16d22b39e3-regParam: 0.01
}, {
logreg_ca16d22b39e3-elasticNetParam: 0.2,
logreg_ca16d22b39e3-regParam: 0.1
}, {
logreg_ca16d22b39e3-elasticNetParam: 0.3,
logreg_ca16d22b39e3-regParam: 0.1
}, {
logreg_ca16d22b39e3-elasticNetParam: 0.5,
logreg_ca16d22b39e3-regParam: 0.1
})
这里需要关注下paramGrid是一个Array,里面的元素是ParamMap
CV设置
CV设置主要参数为:
- K折交叉验证的K
- 评估指标
// Evaluator with every setting left at its default.
val evaluator = new BinaryClassificationEvaluator()
// The same kind of evaluator with the settings spelled out; they can all be changed,
// but the defaults are usually fine.
val evaluator1 =
  new BinaryClassificationEvaluator()
    .setLabelCol("label")
    .setRawPredictionCol("rawPrediction")
    .setMetricName("areaUnderROC") // supports "areaUnderROC" (default) and "areaUnderPR"
// Alternative: accuracy
// val evaluator1 = new MulticlassClassificationEvaluator()
//   .setLabelCol("label")
//   .setPredictionCol("prediction")
//   .setMetricName("accuracy")
// Alternative: F1
// new MulticlassClassificationEvaluator()
//   .setLabelCol("label")
//   .setPredictionCol("prediction")
//   .setMetricName("f1") // supports "f1" (default), "weightedPrecision", "weightedRecall", "accuracy"
val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(evaluator1) // metric used to compare parameter combinations
  .setEstimatorParamMaps(paramGrid) // the parameter combinations to try
  .setNumFolds(3) // 3-fold cross-validation; use 3+ in practice
  // Without this the grid is evaluated one combination at a time (default parallelism is 1).
  .setParallelism(2)
// Run the cross-validation.
val moedel = cv.fit(dfTrain)
evaluator: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_9b1e4521d5f8
evaluator1: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_bab72a44f141
cv: org.apache.spark.ml.tuning.CrossValidator = cv_81804eea4531
moedel: org.apache.spark.ml.tuning.CrossValidatorModel = cv_81804eea4531
CV结果
做完交叉验证,我们一般想知道:
- 每个参数组合的avgMetric是多少
- 最优参数组合是什么//某种意义上和前面一致
- 最优模型,及其参数
其中avgMetric可以从cv.fit返回的CrossValidatorModel的avgMetrics属性中直接获取,它的顺序和paramGrid是一样的。最优模型可以通过该模型的bestModel属性获得。直接看代码吧
// Pair every parameter map with its average cross-validation metric.
// Each element is a Tuple2: (ParamMap of one grid point, avgMetric of that grid point).
// The pairs are sorted by avgMetric in ascending order.
val combined = paramGrid.zip(moedel.avgMetrics).sortBy { case (_, avgMetric) => avgMetric }
// Print the full cross-validation result.
println(s"validation result:${combined.toList}")
// Print the entries with the largest and the smallest metric; depending on the metric,
// the best value may be either the maximum or the minimum.
println("*=*=*=*=*=*=*=*=*=*=*=*CV:Max Min avgMetric=*=*=*=*=*=*=*=*=*=*=*=*=*")
Seq(combined.maxBy(_._2), combined.minBy(_._2)).foreach(println)
println("*=*=*=*=*=*=*=*=*=*=*=*CV:Max Min avgMetric=*=*=*=*=*=*=*=*=*=*=*=*=*")
validation result:List(({
logreg_ca16d22b39e3-elasticNetParam: 0.2,
logreg_ca16d22b39e3-regParam: 0.1
},0.4444444444444445), ({
logreg_ca16d22b39e3-elasticNetParam: 0.2,
logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
logreg_ca16d22b39e3-elasticNetParam: 0.3,
logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
logreg_ca16d22b39e3-elasticNetParam: 0.5,
logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
logreg_ca16d22b39e3-elasticNetParam: 0.3,
logreg_ca16d22b39e3-regParam: 0.1
},0.7777777777777778), ({
logreg_ca16d22b39e3-elasticNetParam: 0.5,
logreg_ca16d22b39e3-regParam: 0.1
},0.7777777777777778))
*=*=*=*=*=*=*=*=*=*=*=*CV:Max Min avgMetric=*=*=*=*=*=*=*=*=*=*=*=*=*
({
logreg_ca16d22b39e3-elasticNetParam: 0.2,
logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778)
({
logreg_ca16d22b39e3-elasticNetParam: 0.2,
logreg_ca16d22b39e3-regParam: 0.1
},0.4444444444444445)
*=*=*=*=*=*=*=*=*=*=*=*CV:Max Min avgMetric=*=*=*=*=*=*=*=*=*=*=*=*=*
combined: Array[(org.apache.spark.ml.param.ParamMap, Double)] =
Array(({
logreg_ca16d22b39e3-elasticNetParam: 0.2,
logreg_ca16d22b39e3-regParam: 0.1
},0.4444444444444445), ({
logreg_ca16d22b39e3-elasticNetParam: 0.2,
logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
logreg_ca16d22b39e3-elasticNetParam: 0.3,
logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
logreg_ca16d22b39e3-elasticNetParam: 0.5,
logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
logreg_ca16d22b39e3-elasticNetParam: 0.3,
logreg_ca16d22b39e3-regParam: 0.1
},0.7777777777777778), ({
logreg_ca16d22b39e3-elasticNetParam: 0.5,
logreg_ca16d22b39e3-regParam: 0.1
},0.7777777777777778))
最优模型
// The model kept by the fitted cross-validator as its best one.
val bestModel: Model[_] = moedel.bestModel
bestModel: org.apache.spark.ml.Model[_] = pipeline_097ca523b766
输出模型参数
// Show the parameters of the last pipeline stage (the logistic regression stage).
// A pattern match replaces the unchecked cast; a model that is not a PipelineModel
// reports its own parameters instead of failing with a ClassCastException.
bestModel match {
  case fitted: PipelineModel => fitted.stages.last.extractParamMap
  case other => other.extractParamMap
}
res19: org.apache.spark.ml.param.ParamMap =
{
logreg_ca16d22b39e3-aggregationDepth: 2,
logreg_ca16d22b39e3-elasticNetParam: 0.2,
logreg_ca16d22b39e3-family: auto,
logreg_ca16d22b39e3-featuresCol: features,
logreg_ca16d22b39e3-fitIntercept: true,
logreg_ca16d22b39e3-labelCol: label,
logreg_ca16d22b39e3-maxIter: 10,
logreg_ca16d22b39e3-predictionCol: prediction,
logreg_ca16d22b39e3-probabilityCol: probability,
logreg_ca16d22b39e3-rawPredictionCol: rawPrediction,
logreg_ca16d22b39e3-regParam: 0.01,
logreg_ca16d22b39e3-standardization: true,
logreg_ca16d22b39e3-threshold: 0.5,
logreg_ca16d22b39e3-tol: 1.0E-6
}
预测
// Score the test set with the best model and print the first 20 rows, truncating long cells.
bestModel.transform(dfTest).show(20, truncate = true)
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
| id| x1| x2| x3| x4|label| features| rawPrediction| probability|prediction|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
| 1|5.1|3.5|1.4|0.2| 0|[5.1,3.5,1.4,0.2]|[0.55620579640354...|[0.63557418108330...| 0.0|
| 2|4.9|3.0|1.4|0.2| 1|[4.9,3.0,1.4,0.2]|[0.34335905127309...|[0.58500624519646...| 0.0|
| 3|4.7|3.2|1.3|0.2| 0|[4.7,3.2,1.3,0.2]|[0.49785564365443...|[0.62195526732813...| 0.0|
| 4|4.6|3.1|1.5|0.2| 1|[4.6,3.1,1.5,0.2]|[0.44268625091370...|[0.60889892409382...| 0.0|
| 5|5.0|3.6|1.4|0.2| 0|[5.0,3.6,1.4,0.2]|[0.62725289428788...|[0.65186630366082...| 0.0|
| 56|5.7|2.8|4.5|1.3| 1|[5.7,2.8,4.5,1.3]|[-0.3603636410167...|[0.41087154155653...| 1.0|
| 57|5.3|3.3|4.7|1.6| 0|[5.3,3.3,4.7,1.6]|[-0.0653032957563...|[0.48367997540527...| 1.0|
| 58|4.9|2.4|3.3|1.0| 1|[4.9,2.4,3.3,1.0]|[-0.2365991812372...|[0.44112459847584...| 1.0|
| 59|6.6|3.9|4.6|1.3| 1|[6.6,3.9,4.6,1.3]|[0.00192705540901...|[0.50048176370316...| 0.0|
| 60|5.2|2.7|3.9|1.4| 0|[5.2,2.7,3.9,1.4]|[-0.2399585647021...|[0.44029656183585...| 1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
CV源码
主要是cv里fit方法的源码~
/**
 * Runs k-fold cross-validation over every entry of `estimatorParamMaps` and returns a
 * [[CrossValidatorModel]] holding the model refitted on the full dataset with the best
 * parameter map, together with the average metric of every parameter map.
 *
 * @param dataset input data; it is split into `numFolds` (training, validation) pairs
 * @return the fitted model, carrying the average metrics and, when `collectSubModels`
 *         is set, the per-fold sub-models
 */
@Since("2.0.0")
override def fit(dataset: Dataset[_]): CrossValidatorModel = instrumented { instr =>
val schema = dataset.schema
transformSchema(schema, logging = true)
val sparkSession = dataset.sparkSession
// Read the three tuning inputs from this instance's params.
val est = $(estimator)
val eval = $(evaluator)
val epm = $(estimatorParamMaps)
// Create execution context based on $(parallelism)
val executionContext = getExecutionContext
instr.logPipelineStage(this)
instr.logDataset(dataset)
instr.logParams(this, numFolds, seed, parallelism)
logTuningParams(instr)
val collectSubModelsParam = $(collectSubModels)
// Only allocated when sub-models are requested: one slot per (fold, parameter map).
var subModels: Option[Array[Array[Model[_]]]] = if (collectSubModelsParam) {
Some(Array.fill($(numFolds))(Array.fill[Model[_]](epm.length)(null)))
} else None
// Compute metrics for each model over each split
val splits = MLUtils.kFold(dataset.toDF.rdd, $(numFolds), $(seed))
val metrics = splits.zipWithIndex.map { case ((training, validation), splitIndex) =>
// Both halves of the split are cached because every parameter map reuses them.
val trainingDataset = sparkSession.createDataFrame(training, schema).cache()
val validationDataset = sparkSession.createDataFrame(validation, schema).cache()
instr.logDebug(s"Train split $splitIndex with multiple sets of parameters.")
// Fit models in a Future for training in parallel
val foldMetricFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) =>
Future[Double] {
val model = est.fit(trainingDataset, paramMap).asInstanceOf[Model[_]]
if (collectSubModelsParam) {
subModels.get(splitIndex)(paramIndex) = model
}
// TODO: duplicate evaluator to take extra params from input
val metric = eval.evaluate(model.transform(validationDataset, paramMap))
instr.logDebug(s"Got metric $metric for model trained with $paramMap.")
metric
} (executionContext)
}
// Wait for metrics to be calculated
val foldMetrics = foldMetricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf))
// Unpersist training & validation set once all metrics have been produced
trainingDataset.unpersist()
validationDataset.unpersist()
foldMetrics
// Before transpose: one row per fold. After transpose: one row per parameter map.
}.transpose.map(_.sum / $(numFolds)) // Calculate average metric over all splits
instr.logInfo(s"Average cross-validation metrics: ${metrics.toSeq}")
// The evaluator decides whether a larger or a smaller metric is better.
val (bestMetric, bestIndex) =
if (eval.isLargerBetter) metrics.zipWithIndex.maxBy(_._1)
else metrics.zipWithIndex.minBy(_._1)
instr.logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
instr.logInfo(s"Best cross-validation metric: $bestMetric.")
// Refit on the whole dataset using the winning parameter map.
val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
copyValues(new CrossValidatorModel(uid, bestModel, metrics)
.setSubModels(subModels).setParent(this))
}
有些scala的魔法我是看不懂,只把主要逻辑说一下。
整体逻辑:
- 划分数据集
- 根据参数组合训练模型,输出avgMetric
- 找到最优的avgMetric,会用该参数组合用全量数据重新训练,输出最终的模型
CV.fit逻辑:
- fit中直接通过$调用了估计器、评价器、超参组合三个参数
- 调用kFold函数,进行数据切分,返回Array,每个元素是一个(训练集RDD, 验证集RDD)二元组
- kFold函数需要传入数据rdd、折数numFold、随机数种子seed
- 随机数种子对应CrossValidator的seed参数(源码中通过$(seed)读取),可以用setSeed自己指定;不指定时使用系统给的默认值
- 切分数据是通过map循环的方式做的,数据返回结果放在Array里
- 但是kFold之后好像不是严格的cv数据集(严格的cv数据集指:所有验证集互不重复,且union等于全集)
- kFold暂时先这样吧,细节得看sample的逻辑
- 通过zipWithIndex对split之后的数据集加索引,同时map遍历所有的参数组合
- 先做rdd转换成dataset
- 对该份cv的数据,做所有超参数组合的训练
- 最终返回结果:先在每一折验证集上计算所有参数组合的metric,得到一个Array,再迭代跑每一折数据集,这样得到的是Array(Array(验证集1上参数组合A的metric, 验证集1上参数组合B的metric), …)。使用transpose方法转置后,得到Array(Array(参数组合A在验证集1上的metric, 参数组合A在验证集2上的metric), …),即每个子数组对应一个参数组合在各折上的结果
- 对transpose之后的array进行map遍历,计算子元素sum/nfolds之后的结果。也就是各个参数组合在所有验证集上的avgMetric
- 找到最优的参数组合,并且用全量数据再训练一次模型
2020-03-25 于南京市江宁区九龙湖