// Reference: http://spark.apache.org/docs/latest/ml-features.html#tf-idf
import org.apache.spark.ml.feature._
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession
import scala.collection.mutable
import scala.io.Source
/**
* Created by xubc on 2017/6/3.
*/
object TestX {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder
.master("local[5]")
.appName(this.getClass.getName().stripSuffix("$"))
.getOrCreate()
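// Sample corpus: each row is a (label, sentence) pair.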
val sentenceData = spark.createDataFrame(Seq(
(0.0, "Hi I heard about are Spark"),
(1.0, "I wish Java could use case spark classes"),
(2.0, "Logistic regression regression models are neat I")
)).toDF("label", "sentence")
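// Tokenizer lowercases each sentence and splits it on whitespace into a "words" column.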
val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
val wordsData = tokenizer.transform(sentenceData)
// Alternative: HashingTF bag-of-words model (hashing trick, no vocabulary is kept)
// val hashingTF = new HashingTF()
// .setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(100)
// val featurizedData = hashingTF.transform(wordsData)
// CountVectorizer bag-of-words model (builds an explicit vocabulary from the corpus)
val cvModel: CountVectorizerModel = new CountVectorizer()
.setInputCol("words").setOutputCol("rawFeatures")
.fit(wordsData)
val featurizedData = cvModel.transform(wordsData)
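// IDF down-weights terms that appear in many documents; fit() computes per-term document frequencies.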
val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
val rescaledData = idfModel.transform(featurizedData)
rescaledData.printSchema()
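// The learned vocabulary maps the indices of the term-count vectors back to the original terms.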
val vocabulary = cvModel.vocabulary
println(vocabulary.mkString(","))
rescaledData.show(false)
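// For each row print: label, sentence, tokenized words, the raw term-frequency vector
// (with its indices resolved back to terms), and the final TF-IDF vector.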
rescaledData.foreach(e => {
val label = e.getAs[Double]("label")
val str = e.getAs[String]("sentence")
val words = e.getAs[mutable.WrappedArray[String]]("words").mkString(",")
val tf = e.getAs[SparseVector]("rawFeatures")
val originWords = tf.indices.map(i => vocabulary(i)).mkString(",")
val tfidf = e.getAs[SparseVector]("features")
println(
s"""$label $str
| $words
| $tf $originWords
| $tfidf""".stripMargin)
})
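// Stop the SparkSession once processing is done.
spark.stop()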
}
}