import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{Row, SparkSession}
object PipelineDemo {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local[*]")
.config("spark.testing.memory", "2147480000").appName("EstimatorTransformerParamDemo").getOrCreate()
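// spark.testing.memory raises Spark's minimum-memory check so the demo can run with a small
// local heap; it is only a local-testing workaround and can be dropped on a real cluster.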
// Prepare training data from a list of (label, features) tuples.
val training = spark.createDataFrame(Seq(
(1.0,Vectors.dense(0.0,1.1,0.1)),
(0.0,Vectors.dense(2.0,1.0,-1.0)),
(0.0,Vectors.dense(2.0,1.3,1.0)),
(1.0,Vectors.dense(0.0,1.2,-0.5)))).toDF("label","features")
// Create a LogisticRegression instance. This instance is an Estimator.
val lr = new LogisticRegression()
// Print out the parameters, documentation, and any default values.
println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
// Set parameters using setter methods.
lr.setMaxIter(10).setRegParam(0.01)
// Learn a LogisticRegression model. This uses the parameters stored in lr.
val model1 = lr.fit(training)
// Since model1 is a Model (i.e., a Transformer produced by an Estimator), we can view the parameters it used during fit().
println("Model1 was fit using parameters:" + model1.parent.extractParamMap())
// We may alternatively specify parameters using a ParamMap, which supports several methods for specifying parameters.
val paramMap = ParamMap(lr.maxIter->20).put(lr.maxIter,30).put(lr.regParam->0.1,lr.threshold->0.55)
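// Later put() calls overwrite earlier values for the same Param, so maxIter ends up as 30 here.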
// ParamMaps can also be combined.
val paramMap2 = ParamMap(lr.probabilityCol->"myProbability") // Change the output column name.
val paramMapCombined = paramMap ++ paramMap2
// Now learn a new model using the paramMapCombined parameters.
// paramMapCombined overrides all parameters set earlier via lr.set* methods.
val model2 = lr.fit(training,paramMapCombined)
println("Model2 was fit using parameters:" + model2.parent.extractParamMap())
// Prepare test data.
val test = spark.createDataFrame(Seq(
(1.0,Vectors.dense(-1.0,1.5,1.3)),
(0.0,Vectors.dense(3.0,2.0,-0.1)),
(1.0,Vectors.dense(0.0,2.2,-1.5))
)).toDF("label","features")
// Make predictions on the test data using the Transformer.transform() method.
// LogisticRegressionModel.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'myProbability' column instead of the usual
// 'probability' column, since we renamed the lr.probabilityCol parameter previously.
model2.transform(test).select("features","label","myProbability","prediction")
.collect().foreach {case Row(features:Vector,label:Double,prob:Vector,prediction:Double)=>
println(s"($features,$label)->prob=$prob,prediction=$prediction")}
}
}

// A second, separate demo: build, fit, persist, and reload a complete ML Pipeline.
object PipelineExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local[*]").appName("PipelineExample")
.config("spark.testing.memory", "2147480000").getOrCreate()
// Prepare training documents from a list of (id, text, label) tuples.
val training = spark.createDataFrame(Seq(
(0L,"a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
(3L, "hadoop mapreduce", 0.0)
)).toDF("id", "text", "label")
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol(tokenizer.getOutputCol).setOutputCol("features")
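// HashingTF uses the hashing trick to map each word to an index and produces a fixed-length
// (here 1000-dimensional) term-frequency feature vector.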
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.1)
val pipeline = new Pipeline().setStages(Array(tokenizer,hashingTF,lr))
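// A Pipeline is itself an Estimator: fit() runs the stages in order, fitting each Estimator
// stage, and returns a PipelineModel, which is a Transformer.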
// Fit the pipeline to training documents.
val model = pipeline.fit(training)
// Now we can optionally save the fitted pipeline to disk
model.write.overwrite().save("D:/hadoop/spark/mllearn/model")
// We can also save this unfit pipeline to disk
pipeline.write.overwrite().save("D:/hadoop/spark/mllearn/pipe")
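// Persistence works here because every stage (Tokenizer, HashingTF, LogisticRegression)
// supports ML persistence; a pipeline containing a non-writable stage cannot be saved this way.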
// And load it back in during production
val sameModel = PipelineModel.load("D:/hadoop/spark/mllearn/model")
// Prepare test documents, which are unlabeled (id, text) tuples.
val testData = spark.createDataFrame(Seq(
(4L, "spark i j k"),
(5L,"l m n"),
(6L, "spark hadoop spark"),
(7L, "apache hadoop")
)).toDF("id","text")
// Make predictions on test documents.
model.transform(testData).select("id", "text", "probability", "prediction").collect()
.foreach {case Row(id:Long,text:String, prob:Vector,prediction:Double)=>
println(s"($id,$text),--->pron=$prob,prediction=$prediction")
}
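// Sanity check (a sketch): the pipeline reloaded into sameModel should produce the same
// predictions on testData as the original fitted model, e.g.:
// sameModel.transform(testData).select("id", "prediction").show()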
}
}