模型调参-网格搜索Scala实现_cvbestmodel.stages.last.asinstanceof-CSDN博客

本文链接：https://blog.csdn.net/wendaomudong_l2d4/article/details/105102845

本文主要内容如下：

介绍Scala中使用网格搜索的流程
使用Pipeline对代码做简单的整合
网格搜索采用并行网格搜索（需要通过CrossValidator的setParallelism设置并行度，默认值为1，即串行执行）
输出每个超参组合的avgMetric
对源码做一点解析

版本信息：

<scala.version>2.11.12</scala.version>
<spark.version>2.4.3</spark.version>

数据构造

import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
// Session settings for this local demo, written as plain string pairs so they can be
// applied to the builder in a single pass (the builder stores every option as a string).
val builder = Seq(
  "spark.executor.heartbeatInterval" -> "60s",
  "spark.network.timeout" -> "120s",
  "spark.serializer" -> "org.apache.spark.serializer.KryoSerializer",
  "spark.kryoserializer.buffer.max" -> "512m",
  "spark.dynamicAllocation.enabled" -> "false",
  "spark.sql.inMemoryColumnarStorage.compressed" -> "true",
  "spark.sql.inMemoryColumnarStorage.batchSize" -> "10000",
  "spark.sql.broadcastTimeout" -> "600",
  "spark.sql.autoBroadcastJoinThreshold" -> "-1",
  "spark.sql.crossJoin.enabled" -> "true"
).foldLeft(SparkSession.builder().appName("LR")) {
  case (acc, (key, value)) => acc.config(key, value)
}.master("local[*]")
// Create (or reuse) the session, silence everything below ERROR, and bring the
// implicit encoders / toDF conversions into scope.
val spark = builder.getOrCreate()
spark.sparkContext.setLogLevel("ERROR")
import spark.implicits._

import org.apache.spark.ml.{Model, Pipeline, PipelineModel, PipelineStage}
import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.CrossValidator
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.sql.functions._
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
builder: org.apache.spark.sql.SparkSession.Builder = org.apache.spark.sql.SparkSession$Builder@44169a0f
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@14fe1df4
import spark.implicits._

// Tiny hand-written training set: an id, four numeric features (x1..x4) and a 0/1 label.
// Declared as `val` because the DataFrame reference is never reassigned.
val dfTrain = Seq(
    (1, 5.1, 3.5, 1.4, 0.2, 0),
    (2, 4.9, 3.0, 1.4, 0.2, 1),
    (3, 4.7, 3.2, 1.3, 0.2, 0),
    (4, 4.6, 3.1, 1.5, 0.2, 1),
    (5, 5.0, 3.6, 1.4, 0.2, 0),
    (56, 5.7, 2.8, 4.5, 1.3, 1),
    (57, 5.3, 3.3, 4.7, 1.6, 0),
    (58, 4.9, 2.4, 3.3, 1.0, 1),
    (59, 6.6, 3.9, 4.6, 1.3, 1),
    (60, 5.2, 2.7, 3.9, 1.4, 0)
  ).toDF("id", "x1", "x2", "x3", "x4", "label")
// The "test set" is simply the training DataFrame again; it only exists to exercise transform().
val dfTest = dfTrain
dfTrain.show()

+---+---+---+---+---+-----+
| id| x1| x2| x3| x4|label|
+---+---+---+---+---+-----+
|  1|5.1|3.5|1.4|0.2|    0|
|  2|4.9|3.0|1.4|0.2|    1|
|  3|4.7|3.2|1.3|0.2|    0|
|  4|4.6|3.1|1.5|0.2|    1|
|  5|5.0|3.6|1.4|0.2|    0|
| 56|5.7|2.8|4.5|1.3|    1|
| 57|5.3|3.3|4.7|1.6|    0|
| 58|4.9|2.4|3.3|1.0|    1|
| 59|6.6|3.9|4.6|1.3|    1|
| 60|5.2|2.7|3.9|1.4|    0|
+---+---+---+---+---+-----+






dfTrain: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]
dfTest: org.apache.spark.sql.DataFrame = [id: int, x1: double ... 4 more fields]

Pipeline

本文重点是梳理scala网格搜索功能，所以没有复杂的预处理~

// Feature assembly: combine the four numeric input columns into one "features" column.
val assemble = {
  val assembler = new VectorAssembler()
  assembler.setInputCols(Array("x1", "x2", "x3", "x4"))
  assembler.setOutputCol("features")
}
// Estimator: logistic regression with starting values for maxIter and regParam.
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.01)
// Pipeline: run the assembler first, then the logistic regression.
val pipeline = new Pipeline().setStages(Array[PipelineStage](assemble, lr))

assemble: org.apache.spark.ml.feature.VectorAssembler = vecAssembler_74a3344707a6
lr: org.apache.spark.ml.classification.LogisticRegression = logreg_ca16d22b39e3
pipeline: org.apache.spark.ml.Pipeline = pipeline_097ca523b766

网格搜索

网格参数设置

参数含义不多介绍，重点是把cv+网格搜索的流程跑通

// Candidate values for the two hyper-parameters being searched.
val elasticNetParamArray = Array(0.2, 0.3, 0.5)
val regParamArray = Array(0.01, 0.1)
// Register elasticNetParam first and regParam second (same order as before), then build the grid.
val paramGrid = {
  val gridBuilder = new ParamGridBuilder()
  gridBuilder.addGrid(lr.elasticNetParam, elasticNetParamArray)
  gridBuilder.addGrid(lr.regParam, regParamArray)
  gridBuilder.build()
}

elasticNetParamArray: Array[Double] = Array(0.2, 0.3, 0.5)
regParamArray: Array[Double] = Array(0.01, 0.1)
paramGrid: Array[org.apache.spark.ml.param.ParamMap] =
Array({
	logreg_ca16d22b39e3-elasticNetParam: 0.2,
	logreg_ca16d22b39e3-regParam: 0.01
}, {
	logreg_ca16d22b39e3-elasticNetParam: 0.3,
	logreg_ca16d22b39e3-regParam: 0.01
}, {
	logreg_ca16d22b39e3-elasticNetParam: 0.5,
	logreg_ca16d22b39e3-regParam: 0.01
}, {
	logreg_ca16d22b39e3-elasticNetParam: 0.2,
	logreg_ca16d22b39e3-regParam: 0.1
}, {
	logreg_ca16d22b39e3-elasticNetParam: 0.3,
	logreg_ca16d22b39e3-regParam: 0.1
}, {
	logreg_ca16d22b39e3-elasticNetParam: 0.5,
	logreg_ca16d22b39e3-regParam: 0.1
})

这里需要关注下paramGrid是一个Array，里面的元素是ParamMap

CV设置

CV设置主要参数为：

K折交叉验证的K
评估指标

// Evaluator with all defaults; kept for reference only, the run below uses evaluator1.
val evaluator = new BinaryClassificationEvaluator()
// Same evaluator with every setting spelled out; these rarely need changing.
val evaluator1 =
  new BinaryClassificationEvaluator()
    .setLabelCol("label")
    .setRawPredictionCol("rawPrediction")
    .setMetricName("areaUnderROC") // supports "areaUnderROC" (default), "areaUnderPR"
// Alternative evaluators
// (both need: import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator)
// Accuracy:
// val evaluator1 = new MulticlassClassificationEvaluator()
//       .setLabelCol("label")
//       .setPredictionCol("prediction")
//       .setMetricName("accuracy")
// F1:
// val evaluator1 = new MulticlassClassificationEvaluator()
//       .setLabelCol("label")
//       .setPredictionCol("prediction")
//       .setMetricName("f1") // supports "f1" (default), "weightedPrecision", "weightedRecall", "accuracy"
val cv = new CrossValidator()
  .setEstimator(pipeline)
  .setEvaluator(evaluator1) // metric used to compare parameter combinations
  .setEstimatorParamMaps(paramGrid) // the hyper-parameter combinations to try
  .setNumFolds(3) // 3-fold cross-validation; use 3+ in practice
  .setParallelism(2) // fit up to 2 parameter combinations concurrently (the default 1 is serial)
// Run the cross-validated grid search.
// NOTE: the misspelled name `moedel` is kept because the later snippets reference it.
val moedel = cv.fit(dfTrain)

evaluator: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_9b1e4521d5f8
evaluator1: org.apache.spark.ml.evaluation.BinaryClassificationEvaluator = binEval_bab72a44f141
cv: org.apache.spark.ml.tuning.CrossValidator = cv_81804eea4531
moedel: org.apache.spark.ml.tuning.CrossValidatorModel = cv_81804eea4531

CV结果

做完交叉验证，我们一般想知道：

每个参数组合的avgMetric是多少
最优参数组合是什么//某种意义上和前面一致
最优模型，及其参数

其中avgMetrics可以从cv.fit返回的CrossValidatorModel中直接获取，它的元素顺序和paramGrid是一样的。最优模型可以通过bestModel属性获得。直接看代码吧

// Pair each ParamMap of the grid with its average cross-validation metric
// (avgMetrics is in the same order as paramGrid) and sort ascending by that metric.
val combined = paramGrid.zip(moedel.avgMetrics).sortBy { case (_, avgMetric) => avgMetric }
// Print every (ParamMap, avgMetric) pair.
println(s"validation result:${combined.toList}")
// Print both extremes: depending on the metric, the best combination may be the
// largest or the smallest avgMetric.
println("*=*=*=*=*=*=*=*=*=*=*=*CV:Max Min avgMetric=*=*=*=*=*=*=*=*=*=*=*=*=*")
println(combined.maxBy { case (_, avgMetric) => avgMetric })
println(combined.minBy { case (_, avgMetric) => avgMetric })
println("*=*=*=*=*=*=*=*=*=*=*=*CV:Max Min avgMetric=*=*=*=*=*=*=*=*=*=*=*=*=*")

validation result:List(({
	logreg_ca16d22b39e3-elasticNetParam: 0.2,
	logreg_ca16d22b39e3-regParam: 0.1
},0.4444444444444445), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.2,
	logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.3,
	logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.5,
	logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.3,
	logreg_ca16d22b39e3-regParam: 0.1
},0.7777777777777778), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.5,
	logreg_ca16d22b39e3-regParam: 0.1
},0.7777777777777778))
*=*=*=*=*=*=*=*=*=*=*=*CV:Max Min avgMetric=*=*=*=*=*=*=*=*=*=*=*=*=*
({
	logreg_ca16d22b39e3-elasticNetParam: 0.2,
	logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778)
({
	logreg_ca16d22b39e3-elasticNetParam: 0.2,
	logreg_ca16d22b39e3-regParam: 0.1
},0.4444444444444445)
*=*=*=*=*=*=*=*=*=*=*=*CV:Max Min avgMetric=*=*=*=*=*=*=*=*=*=*=*=*=*





combined: Array[(org.apache.spark.ml.param.ParamMap, Double)] =
Array(({
	logreg_ca16d22b39e3-elasticNetParam: 0.2,
	logreg_ca16d22b39e3-regParam: 0.1
},0.4444444444444445), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.2,
	logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.3,
	logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.5,
	logreg_ca16d22b39e3-regParam: 0.01
},0.7777777777777778), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.3,
	logreg_ca16d22b39e3-regParam: 0.1
},0.7777777777777778), ({
	logreg_ca16d22b39e3-elasticNetParam: 0.5,
	logreg_ca16d22b39e3-regParam: 0.1
},0.7777777777777778))

最优模型

// The model exposed by the fitted CrossValidatorModel as its best model.
val bestModel: Model[_] = moedel.bestModel

bestModel: org.apache.spark.ml.Model[_] = pipeline_097ca523b766

输出模型参数

// The estimator was a Pipeline, so the best model is a PipelineModel; its last stage
// is the fitted logistic regression, whose full parameter map is returned here.
bestModel.asInstanceOf[PipelineModel].stages.last.extractParamMap()

res19: org.apache.spark.ml.param.ParamMap =
{
	logreg_ca16d22b39e3-aggregationDepth: 2,
	logreg_ca16d22b39e3-elasticNetParam: 0.2,
	logreg_ca16d22b39e3-family: auto,
	logreg_ca16d22b39e3-featuresCol: features,
	logreg_ca16d22b39e3-fitIntercept: true,
	logreg_ca16d22b39e3-labelCol: label,
	logreg_ca16d22b39e3-maxIter: 10,
	logreg_ca16d22b39e3-predictionCol: prediction,
	logreg_ca16d22b39e3-probabilityCol: probability,
	logreg_ca16d22b39e3-rawPredictionCol: rawPrediction,
	logreg_ca16d22b39e3-regParam: 0.01,
	logreg_ca16d22b39e3-standardization: true,
	logreg_ca16d22b39e3-threshold: 0.5,
	logreg_ca16d22b39e3-tol: 1.0E-6
}

预测

// Score the test set with the best model and print the first 20 rows (truncated cells),
// which is what the no-argument show() does.
bestModel.transform(dfTest).show(numRows = 20, truncate = true)

+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
| id| x1| x2| x3| x4|label|         features|       rawPrediction|         probability|prediction|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+
|  1|5.1|3.5|1.4|0.2|    0|[5.1,3.5,1.4,0.2]|[0.55620579640354...|[0.63557418108330...|       0.0|
|  2|4.9|3.0|1.4|0.2|    1|[4.9,3.0,1.4,0.2]|[0.34335905127309...|[0.58500624519646...|       0.0|
|  3|4.7|3.2|1.3|0.2|    0|[4.7,3.2,1.3,0.2]|[0.49785564365443...|[0.62195526732813...|       0.0|
|  4|4.6|3.1|1.5|0.2|    1|[4.6,3.1,1.5,0.2]|[0.44268625091370...|[0.60889892409382...|       0.0|
|  5|5.0|3.6|1.4|0.2|    0|[5.0,3.6,1.4,0.2]|[0.62725289428788...|[0.65186630366082...|       0.0|
| 56|5.7|2.8|4.5|1.3|    1|[5.7,2.8,4.5,1.3]|[-0.3603636410167...|[0.41087154155653...|       1.0|
| 57|5.3|3.3|4.7|1.6|    0|[5.3,3.3,4.7,1.6]|[-0.0653032957563...|[0.48367997540527...|       1.0|
| 58|4.9|2.4|3.3|1.0|    1|[4.9,2.4,3.3,1.0]|[-0.2365991812372...|[0.44112459847584...|       1.0|
| 59|6.6|3.9|4.6|1.3|    1|[6.6,3.9,4.6,1.3]|[0.00192705540901...|[0.50048176370316...|       0.0|
| 60|5.2|2.7|3.9|1.4|    0|[5.2,2.7,3.9,1.4]|[-0.2399585647021...|[0.44029656183585...|       1.0|
+---+---+---+---+---+-----+-----------------+--------------------+--------------------+----------+

CV源码

主要是cv里fit方法的源码~

/**
 * Fits the configured estimator with every param map in `estimatorParamMaps` on each of
 * the `numFolds` training splits, evaluates each fitted model on the matching validation
 * split, averages the metric per param map, and finally refits the estimator on the whole
 * dataset with the best-scoring param map.
 *
 * @param dataset input dataset; it is split into folds via `MLUtils.kFold`
 * @return a `CrossValidatorModel` holding the refit best model, the average metric of every
 *         param map (same order as `estimatorParamMaps`) and, if requested, the sub-models
 */
@Since("2.0.0")
  override def fit(dataset: Dataset[_]): CrossValidatorModel = instrumented { instr =>
    val schema = dataset.schema
    transformSchema(schema, logging = true)
    val sparkSession = dataset.sparkSession
    // Read the three main params once: estimator, evaluator and the param-map grid.
    val est = $(estimator)
    val eval = $(evaluator)
    val epm = $(estimatorParamMaps)

    // Create execution context based on $(parallelism)
    val executionContext = getExecutionContext

    instr.logPipelineStage(this)
    instr.logDataset(dataset)
    instr.logParams(this, numFolds, seed, parallelism)
    logTuningParams(instr)

    val collectSubModelsParam = $(collectSubModels)

    // When sub-model collection is enabled, pre-allocate a numFolds x epm.length table
    // (filled with null) that the training futures write into; otherwise None.
    var subModels: Option[Array[Array[Model[_]]]] = if (collectSubModelsParam) {
      Some(Array.fill($(numFolds))(Array.fill[Model[_]](epm.length)(null)))
    } else None

    // Compute metrics for each model over each split
    val splits = MLUtils.kFold(dataset.toDF.rdd, $(numFolds), $(seed))
    val metrics = splits.zipWithIndex.map { case ((training, validation), splitIndex) =>
      // Turn both RDDs of this split back into DataFrames and cache them, since every
      // param map of the grid reuses them.
      val trainingDataset = sparkSession.createDataFrame(training, schema).cache()
      val validationDataset = sparkSession.createDataFrame(validation, schema).cache()
      instr.logDebug(s"Train split $splitIndex with multiple sets of parameters.")

      // Fit models in a Future for training in parallel
      val foldMetricFutures = epm.zipWithIndex.map { case (paramMap, paramIndex) =>
        Future[Double] {
          val model = est.fit(trainingDataset, paramMap).asInstanceOf[Model[_]]
          if (collectSubModelsParam) {
            subModels.get(splitIndex)(paramIndex) = model
          }
          // TODO: duplicate evaluator to take extra params from input
          val metric = eval.evaluate(model.transform(validationDataset, paramMap))
          instr.logDebug(s"Got metric $metric for model trained with $paramMap.")
          metric
        } (executionContext)
      }

      // Wait for metrics to be calculated
      val foldMetrics = foldMetricFutures.map(ThreadUtils.awaitResult(_, Duration.Inf))

      // Unpersist training & validation set once all metrics have been produced
      trainingDataset.unpersist()
      validationDataset.unpersist()
      foldMetrics
      // The map yields one row of metrics per split; transposing gives one row per param
      // map, which is then averaged over the folds.
    }.transpose.map(_.sum / $(numFolds)) // Calculate average metric over all splits

    instr.logInfo(s"Average cross-validation metrics: ${metrics.toSeq}")
    // Choose the max or the min average metric depending on the evaluator's isLargerBetter.
    val (bestMetric, bestIndex) =
      if (eval.isLargerBetter) metrics.zipWithIndex.maxBy(_._1)
      else metrics.zipWithIndex.minBy(_._1)
    instr.logInfo(s"Best set of parameters:\n${epm(bestIndex)}")
    instr.logInfo(s"Best cross-validation metric: $bestMetric.")
    // Refit on the full dataset with the winning param map; this is the returned best model.
    val bestModel = est.fit(dataset, epm(bestIndex)).asInstanceOf[Model[_]]
    copyValues(new CrossValidatorModel(uid, bestModel, metrics)
      .setSubModels(subModels).setParent(this))
  }

有些scala的魔法我是看不懂，只把主要逻辑说一下。
整体逻辑：

划分数据集
根据参数组合训练模型，输出avgMetric
找到最优的avgMetric，会用该参数组合用全量数据重新训练，输出最终的模型

CV.fit逻辑：

fit中直接通过$调用了估计器、评价器、超参组合三个参数
调用kFold函数，进行数据切分，返回Array，每个元素是一个(训练集RDD, 验证集RDD)的二元组
- kFold函数需要传入数据rdd、折数numFold、随机数种子seed
- 随机数种子可以通过CrossValidator的setSeed方法自己指定；不指定时使用默认的seed（由类名的hashCode得到）
- 切分数据是通过map循环的方式做的，数据返回结果放在Array里
- kFold之后得到的是严格的cv数据集：各折使用同一个seed、互不重叠的采样区间[(i-1)/k, i/k)，所以所有验证集无重复且union等于全集；只是每一折的样本数是随机的，不保证完全相等
- kFold暂时先这样吧，细节得看sample的逻辑
通过zipWithIndex对split之后的数据集加索引，同时map遍历所有的参数组合
- 先做rdd转换成dataset
- 对该分cv的数据，做所有超参数组合的训练
- 最终返回结果：先在每一个验证集上计算所有参数组合的metric，返回一个Array；再迭代跑每个数据集，这样得到的是Array(Array(验证集1上参数组合A的metric, 验证集1上参数组合B的metric), …)。使用transpose方法，转置得Array(Array(参数组合A在验证集1上的metric, 参数组合A在验证集2上的metric), …)
- 对transpose之后的array进行map遍历，计算子元素sum/nfolds之后的结果。也就是各个参数组合在所有验证集上的avgMetric
找到最优的参数组合，并且用全量数据再训练一次模型

2020-03-25 于南京市江宁区九龙湖