import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{Row, SparkSession}
object PipelineDemo {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local[*]")
.config("spark.testing.memory", "2147480000").appName("EstimatorTransformerParamDemo").getOrCreate()
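// spark.testing.memory raises Spark's minimum-memory check so the demo can run with a small
// local heap; it is only a local-testing workaround and can be dropped on a real cluster.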
// Prepare training data from a list of (label, features) tuples.
val training = spark.createDataFrame(Seq(
(1.0,Vectors.dense(0.0,1.1,0.1)),
(0.0,Vectors.dense(2.0,1.0,-1.0)),
(0.0,Vectors.dense(2.0,1.3,1.0)),
(1.0,Vectors.dense(0.0,1.2,-0.5)))).toDF("label","features")
// Create a LogisticRegression instance. This instance is an Estimator.
val lr = new LogisticRegression()
// Print out the parameters, documentation, and any default values.
println("LogisticRegression parameters:\n" + lr.explainParams() + "\n")
// Set parameters using setter methods.
lr.setMaxIter(10).setRegParam(0.01)
// Learn a LogisticRegression model. This uses the parameters stored in lr.
val model1 = lr.fit(training)
// Since model1 is a Model (i.e., a Transformer produced by an Estimator), we can view the parameters it used during fit().
println("Model1 was fit using parameters:" + model1.parent.extractParamMap())
// We may alternatively specify parameters using a ParamMap, which supports several methods for specifying parameters.
val paramMap = ParamMap(lr.maxIter->20).put(lr.maxIter,30).put(lr.regParam->0.1,lr.threshold->0.55)
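// Later put() calls overwrite earlier values for the same Param, so maxIter ends up as 30 here.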
// ParamMaps can also be combined.
val paramMap2 = ParamMap(lr.probabilityCol->"myProbability") // Change the output column name.
val paramMapCombined = paramMap ++ paramMap2
// Now learn a new model using the paramMapCombined parameters.
// paramMapCombined overrides all parameters set earlier via lr.set* methods.
val model2 = lr.fit(training,paramMapCombined)
println("Model2 was fit using parameters:" + model2.parent.extractParamMap())
// Prepare test data.
val test = spark.createDataFrame(Seq(
(1.0,Vectors.dense(-1.0,1.5,1.3)),
(0.0,Vectors.dense(3.0,2.0,-0.1)),
(1.0,Vectors.dense(0.0,2.2,-1.5))
)).toDF("label","features")
// Make predictions on the test data using the Transformer.transform() method.
// LogisticRegressionModel.transform will only use the 'features' column.
// Note that model2.transform() outputs a 'myProbability' column instead of the usual
// 'probability' column, since we renamed the lr.probabilityCol parameter previously.
model2.transform(test).select("features","label","myProbability","prediction")
.collect().foreach {case Row(features:Vector,label:Double,prob:Vector,prediction:Double)=>
println(s"($features,$label)->prob=$prob,prediction=$prediction")}
}
}

// A second, separate demo: build, fit, persist, and reload a complete ML Pipeline.
object PipelineExample {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().master("local[*]").appName("PipelineExample")
.config("spark.testing.memory", "2147480000").getOrCreate()
// Prepare training documents from a list of (id, text, label) tuples.
val training = spark.createDataFrame(Seq(
(0L,"a b c d e spark", 1.0),
(1L, "b d", 0.0),
(2L, "spark f g h", 1.0),
(3L, "hadoop mapreduce", 0.0)
)).toDF("id", "text", "label")
// Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val hashingTF = new HashingTF().setNumFeatures(1000).setInputCol(tokenizer.getOutputCol).setOutputCol("features")
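// HashingTF uses the hashing trick to map each word to an index and produces a fixed-length
// (here 1000-dimensional) term-frequency feature vector.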
val lr = new LogisticRegression().setMaxIter(10).setRegParam(0.1)
val pipeline = new Pipeline().setStages(Array(tokenizer,hashingTF,lr))
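// A Pipeline is itself an Estimator: fit() runs the stages in order, fitting each Estimator
// stage, and returns a PipelineModel, which is a Transformer.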
// Fit the pipeline to training documents.
val model = pipeline.fit(training)
// Now we can optionally save the fitted pipeline to disk
model.write.overwrite().save("D:/hadoop/spark/mllearn/model")
// We can also save this unfit pipeline to disk
pipeline.write.overwrite().save("D:/hadoop/spark/mllearn/pipe")
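// Persistence works here because every stage (Tokenizer, HashingTF, LogisticRegression)
// supports ML persistence; a pipeline containing a non-writable stage cannot be saved this way.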
// And load it back in during production
val sameModel = PipelineModel.load("D:/hadoop/spark/mllearn/model")
// Prepare test documents, which are unlabeled (id, text) tuples.
val testData = spark.createDataFrame(Seq(
(4L, "spark i j k"),
(5L,"l m n"),
(6L, "spark hadoop spark"),
(7L, "apache hadoop")
)).toDF("id","text")
// Make predictions on test documents.
model.transform(testData).select("id", "text", "probability", "prediction").collect()
.foreach {case Row(id:Long,text:String, prob:Vector,prediction:Double)=>
println(s"($id,$text),--->pron=$prob,prediction=$prediction")
}
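// Sanity check (a sketch): the pipeline reloaded into sameModel should produce the same
// predictions on testData as the original fitted model, e.g.:
// sameModel.transform(testData).select("id", "prediction").show()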
}
}