MLlib 机器学习算法的标准API可以很方便地把多个算法整合到一个pipeline中,并可以把整个过程形象地比作机器学习算法流。Pipeline包括如下三个阶段:
- Tokenizer将句子分割成单词
- HashingTF将单词转化为特征向量
- LogisticRegression用特征向量和类别标签训练模型
/**
 * Demonstrates a Spark ML Pipeline with three stages:
 *  - Tokenizer: splits each sentence into words
 *  - HashingTF: hashes words into term-frequency feature vectors
 *  - LogisticRegression: fits a classifier on the features and labels
 *
 * Also shows persisting both the fitted PipelineModel and the unfit
 * Pipeline to disk, then reloading and using the fitted model.
 */
def ExamplePipeline(): Unit = {
  import org.apache.spark.ml.classification.LogisticRegression
  import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
  import org.apache.spark.ml.linalg.Vector
  import org.apache.spark.ml.{Pipeline, PipelineModel}
  import org.apache.spark.sql.Row

  val spark: SparkSession = SparkSession.builder()
    .appName("implicits")
    .master("local[2]")
    .getOrCreate()
  try {
    // Prepare training documents from a list of (id, text, label) tuples.
    val training = spark.createDataFrame(Seq(
      (0L, "a b c d e spark", 1.0),
      (1L, "b d", 0.0),
      (2L, "spark f g h", 1.0),
      (3L, "hadoop mapreduce", 0.0)
    )).toDF("id", "text", "label")

    // Configure the three pipeline stages.
    val tokenizer = new Tokenizer()
      .setInputCol("text")
      .setOutputCol("words")
    val hashingTF = new HashingTF()
      .setNumFeatures(1000)
      .setInputCol(tokenizer.getOutputCol)
      .setOutputCol("features")
    val lr = new LogisticRegression()
      .setMaxIter(10)
      .setRegParam(0.001)
    val pipeline = new Pipeline()
      .setStages(Array(tokenizer, hashingTF, lr))

    // Fit the pipeline to the training documents.
    val model = pipeline.fit(training)

    // Persist the fitted model to disk.
    model.write.overwrite().save("/tmp/spark-logistic-regression-model")
    // Persist the unfit pipeline to disk as well.
    pipeline.write.overwrite().save("/tmp/unfit-lr-model")

    // Reload the fitted model from disk; it is used below so the
    // save/load round trip is actually exercised (fix: previously loaded
    // but never used).
    val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model")

    // Prepare unlabeled test documents as (id, text) tuples.
    val test = spark.createDataFrame(Seq(
      (4L, "spark i j k"),
      (5L, "l m n"),
      (6L, "spark hadoop spark"),
      (7L, "apache hadoop")
    )).toDF("id", "text")

    // Make predictions on the test documents with the reloaded model
    // (predictions are identical to using `model` directly).
    sameModel.transform(test)
      .select("id", "text", "probability", "prediction")
      .collect()
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> 概率=$prob, 预测值=$prediction")
      }
  } finally {
    // Fix: the SparkSession was never stopped, leaking the local cluster's
    // resources; always release it, even if an earlier stage throws.
    spark.stop()
  }
}