数据预处理
scala> import org.apache.spark.ml.feature.LabeledPoint
scala> import org.apache.spark.ml.linalg.Vectors
// Parse each raw record into a LabeledPoint:
//  - strip double quotes from every field,
//  - label = last column, as an Int,
//  - features = columns 4 .. size-2, with missing values "?" mapped to 0.0.
// NOTE(review): assumes columns 0-3 are non-feature metadata — confirm against the dataset schema.
val data = records.map { r =>
  val trimmed = r.map(_.replaceAll("\"", ""))
  val label = trimmed(r.size - 1).toInt
  val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble)
  LabeledPoint(label, Vectors.dense(features))
}
// Create a DataFrame from the RDD of LabeledPoints so it can be used with the spark.ml API.
val df = spark.createDataFrame(data)
// Train the model.
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
// Assemble the feature-indexing stage.
import org.apache.spark.ml.feature.{VectorIndexer, VectorIndexerModel}
// VectorIndexer identifies categorical features in the vector column and indexes them.
val featureIndexer = new VectorIndexer().setInputCol("features").setOutputCol("indexedFeatures").fit(df)
// Logistic regression over the indexed features; 10 iterations, light L2 regularization.
val lr = new LogisticRegression().setLabelCol("label").setFeaturesCol("indexedFeatures").
  setMaxIter(10).setRegParam(0.001)
import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
// Assemble estimator pipelines: feature indexing followed by each classifier.
// NOTE(review): `dt` (presumably a DecisionTreeClassifier) must be defined earlier — not shown in this excerpt.
scala> val lrPipeline = new Pipeline().setStages(Array(featureIndexer, lr))
lrPipeline: org.apache.spark.ml.Pipeline = pipeline_64c542dff42e
scala> val dtPipeline = new Pipeline().setStages(Array(featureIndexer, dt))
dtPipeline: org.apache.spark.ml.Pipeline = pipeline_b9ed2ccc2108
模型优化
分别配置网格参数,使用 ParamGridBuilder 构造一个 parameter grid:
scala> :paste
// Hyper-parameter grid for logistic regression: 3 x 3 = 9 candidate models.
val lrParamGrid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.1, 0.3, 0.5))
  .addGrid(lr.maxIter, Array(10, 20, 30))
  .build()
scala> :paste
// Grid for the decision tree: vary only the maximum tree depth.
val dtParamGrid = new ParamGridBuilder()
  .addGrid(dt.maxDepth, Array(3, 5, 7))
  .build()
2)分别实例化交叉验证模型:
// Binary-classification evaluator (default metric: area under ROC).
val evaluator = new BinaryClassificationEvaluator()
scala> :paste
// Cross-validate the logistic-regression pipeline over its parameter grid.
val lrCV = new CrossValidator()
  .setEstimator(lrPipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(lrParamGrid)
  .setNumFolds(2)
lrCV: org.apache.spark.ml.tuning.CrossValidator = cv_b25c7e0f1be7
scala> :paste
// Cross-validate the decision-tree pipeline over its parameter grid.
val dtCV = new CrossValidator()
  .setEstimator(dtPipeline)
  .setEvaluator(evaluator)
  .setEstimatorParamMaps(dtParamGrid)
  .setNumFolds(2)
dtCV: org.apache.spark.ml.tuning.CrossValidator = cv_5176e642601d
3)通过交叉验证模型,获取最优参数集,并测试模型:
// Fit both cross-validators on the training set; each returns a CrossValidatorModel
// containing the best model found over its parameter grid.
scala> val lrCvModel = lrCV.fit(trainingData)
lrCvModel: org.apache.spark.ml.tuning.CrossValidatorModel = cv_b25c7e0f1be7
scala> val dtCvModel = dtCV.fit(trainingData)
dtCvModel: org.apache.spark.ml.tuning.CrossValidatorModel = cv_5176e642601d