spark机器学习五----模型选择与优化

 

一 模型选择

二 交叉验证

交叉验证

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.ml.linalg.Vector

object CrossValidatorDemo {
  def main(args: Array[String]): Unit = {
    // Local Spark session for the demo; the extra config entries mirror the
    // original tutorial setup (testing.memory works around a local-mode check).
    val spark = SparkSession.builder()
      .appName("sparksql")
      .master("local[4]")
      .config("spark.some.config.option", "some-value")
      .config("spark.testing.memory", "2147480000")
      .getOrCreate()

    // Tiny labeled corpus: label 1.0 marks texts containing the word "spark".
    val trainingRows = Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0),
      (4L, "b spark who", 1.0),
      (5L, "g d a y", 0.0),
      (6L, "spark fly", 1.0),
      (7L, "was mapreduce", 0.0),
      (8L, "e spark program", 1.0),
      (9L, "a e c l", 0.0),
      (10L, "spark compile", 1.0),
      (11L, "hadoop software", 0.0)
    )
    val training = spark.createDataFrame(trainingRows).toDF("id", "text", "label")

    // Three-stage pipeline: tokenize the text, hash tokens into term-frequency
    // feature vectors, then fit a logistic regression classifier.
    val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
    val hashingTF = new HashingTF()
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression().setMaxIter(10)
    val pipeline = new Pipeline().setStages(Array(tokenizer, hashingTF, lr))

    // Parameter grid: 3 feature sizes x 2 regularization strengths = 6 candidates.
    val paramGrid = new ParamGridBuilder()
      .addGrid(hashingTF.numFeatures, Array(10, 100, 1000))
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .build()

    // Wrap the whole pipeline in a CrossValidator so every stage can be tuned
    // through the grid. BinaryClassificationEvaluator's default metric is AUC
    // (areaUnderROC). 2 folds keeps the demo fast.
    val cv = new CrossValidator()
      .setEstimator(pipeline)
      .setEvaluator(new BinaryClassificationEvaluator())
      .setEstimatorParamMaps(paramGrid)
      .setNumFolds(2)

    // Fit: cross-validation picks the best parameter combination and refits on
    // the full training set.
    val cvModel = cv.fit(training)

    // Unlabeled documents to score with the selected model.
    val testRows = Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "mapreduce spark"),
      (7L, "apache hadoop")
    )
    val test = spark.createDataFrame(testRows).toDF("id", "text")

    val predictions = cvModel.transform(test)
    for (row <- predictions.select("id", "text", "probability", "prediction").collect()) {
      row match {
        case Row(docId: Long, docText: String, prob: Vector, pred: Double) =>
          println(s"($docId,$docText)-->prob=$prob,prediction=$pred")
      }
    }

    // Inspect the winning model: stage 2 of the best pipeline is the fitted
    // logistic regression; print the regParam and feature count it was given.
    val bestModel = cvModel.bestModel.asInstanceOf[PipelineModel]
    val lrModel = bestModel.stages(2).asInstanceOf[LogisticRegressionModel]
    println(lrModel.getRegParam)
    println(lrModel.numFeatures)

  }
}

训练-验证切分(TrainValidationSplit)

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.SparkSession

object TrainValidationSplitDemo {
  def main(args: Array[String]): Unit = {
    // Local Spark session, same configuration as the cross-validation demo.
    val spark = SparkSession.builder()
      .appName("sparksql")
      .master("local[4]")
      .config("spark.some.config.option", "some-value")
      .config("spark.testing.memory", "2147480000")
      .getOrCreate()

    // Load the bundled sample regression data (libsvm format) and hold out
    // 10% as a final test set; fixed seed keeps the split reproducible.
    val dataset = spark.read.format("libsvm")
      .load("F:/software/spark/spark-2.4.3/data/mllib/sample_linear_regression_data.txt")
    val splits = dataset.randomSplit(Array(0.9, 0.1), seed = 12345)
    val trainSet = splits(0)
    val testSet = splits(1)

    val lr = new LinearRegression().setMaxIter(10)

    // Search grid: 2 regParam values x 2 fitIntercept flags x 3 elasticNet
    // mixes = 12 candidate settings; TrainValidationSplit evaluates each one
    // exactly once on a single held-out validation slice.
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept)
      .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
      .build()

    // The estimator here is plain linear regression; trainRatio = 0.8 means
    // 80% of `trainSet` is used for fitting and 20% for validation.
    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(lr)
      .setEvaluator(new RegressionEvaluator())
      .setEstimatorParamMaps(paramGrid)
      .setTrainRatio(0.8)

    // Fit: the split picks the best-scoring parameter combination.
    val model = trainValidationSplit.fit(trainSet)

    // Score the held-out test set with the selected model.
    model.transform(testSet).select("features", "label", "prediction").show()

    // NOTE(review): `model.parent` is the TrainValidationSplit estimator, so
    // this prints the tuner's own params, not the chosen grid point — kept
    // as in the original tutorial.
    println(model.parent.extractParamMap())
  }
}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值