一 模型选择
二 交叉验证
交叉验证
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.ml.linalg.Vector
/**
 * Demonstrates model selection with k-fold cross validation over a
 * text-classification pipeline (Tokenizer -> HashingTF -> LogisticRegression).
 * The CrossValidator searches a parameter grid and picks the model with the
 * best AUC, which is then applied to a small test set.
 */
object CrossValidatorDemo {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("sparksql")
      .master("local[4]")
      .config("spark.some.config.option", "some-value")
      .config("spark.testing.memory", "2147480000")
      .getOrCreate()

    try {
      // Toy labeled corpus: label 1.0 marks documents containing "spark".
      val training = spark.createDataFrame(Seq(
        (0L, "a b c d e spark", 1.0),
        (1L, "b d", 0.0),
        (2L, "spark f g h", 1.0),
        (3L, "hadoop mapreduce", 0.0),
        (4L, "b spark who", 1.0),
        (5L, "g d a y", 0.0),
        (6L, "spark fly", 1.0),
        (7L, "was mapreduce", 0.0),
        (8L, "e spark program", 1.0),
        (9L, "a e c l", 0.0),
        (10L, "spark compile", 1.0),
        (11L, "hadoop software", 0.0)
      )).toDF("id", "text", "label")

      // Build a 3-stage pipeline: tokenizer -> hashingTF -> logistic regression.
      val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
      val hashingTF = new HashingTF().setInputCol(tokenizer.getOutputCol).setOutputCol("features")
      val lr = new LogisticRegression().setMaxIter(10)
      val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

      // Parameter grid: 3 feature sizes x 2 regularization values = 6 candidates.
      val paramGrid = new ParamGridBuilder()
        .addGrid(hashingTF.numFeatures, Array(10, 100, 1000))
        .addGrid(lr.regParam, Array(0.1, 0.01))
        .build()

      // Wrap the pipeline in a CrossValidator so the whole pipeline is tuned
      // over the grid. BinaryClassificationEvaluator's default metric is AUC
      // (areaUnderROC). NOTE(review): numFolds = 2 is only for demo speed;
      // 3+ folds are recommended in practice.
      val cv = new CrossValidator()
        .setEstimator(pipeline)
        .setEvaluator(new BinaryClassificationEvaluator())
        .setEstimatorParamMaps(paramGrid)
        .setNumFolds(2)

      // Run cross validation and keep the model fitted with the best params.
      val cvModel = cv.fit(training)

      // Unlabeled test documents.
      val test = spark.createDataFrame(Seq(
        (4L, "spark i j k"),
        (5L, "l m n"),
        (6L, "mapreduce spark"),
        (7L, "apache hadoop")
      )).toDF("id", "text")

      val prediction = cvModel.transform(test)
      prediction.select("id", "text", "probability", "prediction")
        .collect()
        .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
          println(s"($id,$text)-->prob=$prob,prediction=$prediction")
        }

      // Inspect the winning parameter values on the best fitted pipeline.
      val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel]
      val lrModel = bestModel.stages(2).asInstanceOf[LogisticRegressionModel]
      println(lrModel.getRegParam)
      println(lrModel.numFeatures)
    } finally {
      // Release the SparkContext and its resources even if the job fails.
      spark.stop()
    }
  }
}
训练-验证拆分（TrainValidationSplit）
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.SparkSession
/**
 * Demonstrates hyperparameter tuning with TrainValidationSplit on a
 * linear-regression model. Unlike k-fold cross validation, each candidate
 * parameter combination is evaluated once against a single held-out
 * validation set, which is cheaper but noisier.
 *
 * Usage: an optional first program argument overrides the default
 * libsvm data-file path.
 */
object TrainValidationSplitDemo {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("sparksql")
      .master("local[4]")
      .config("spark.some.config.option", "some-value")
      .config("spark.testing.memory", "2147480000")
      .getOrCreate()

    try {
      // Data path is parameterized; falls back to the original demo location.
      val dataPath = args.headOption.getOrElse(
        "F:/software/spark/spark-2.4.3/data/mllib/sample_linear_regression_data.txt")
      val data = spark.read.format("libsvm").load(dataPath)

      // Fixed seed keeps the 90/10 train/test split reproducible.
      val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)

      val lr = new LinearRegression().setMaxIter(10)

      // Search grid: TrainValidationSplit tries every combination of these
      // values and keeps the best-scoring model.
      val paramGrid = new ParamGridBuilder()
        .addGrid(lr.regParam, Array(0.1, 0.01))
        .addGrid(lr.fitIntercept) // boolean param: tries both true and false
        .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
        .build()

      // TrainValidationSplit needs an estimator (the linear regression),
      // the grid of candidate parameter maps, and an evaluator.
      // trainRatio = 0.8: 80% of `training` fits each candidate, the
      // remaining 20% scores it.
      val trainValidationSplit = new TrainValidationSplit()
        .setEstimator(lr)
        .setEvaluator(new RegressionEvaluator())
        .setEstimatorParamMaps(paramGrid)
        .setTrainRatio(0.8)

      // Fit all candidates and keep the one with the best validation metric.
      val model = trainValidationSplit.fit(training)

      // Predict on the held-out test set with the best model.
      model.transform(test).select("features", "label", "prediction").show()
      // Print the parameter map that produced the best model.
      println(model.parent.extractParamMap())
    } finally {
      // Release the SparkContext and its resources even if the job fails.
      spark.stop()
    }
  }
}