import org.apache.spark.mllib.feature.{HashingTF, IDF}
import org.apache.spark.mllib.linalg.{SparseVector => SV}
import org.apache.spark.{SparkConf, SparkContext}
import scala.io.Source
/**
* Created by xiaojun on 2015/10/19.
*/
object TFIDFDemo {
def main(args: Array[String]) {
val conf = new SparkConf().setAppName("TfIdfTest").setMaster("local")
val sc = new SparkContext(conf)
// Load documents (one per line). Each non-empty line is treated as one document; zipWithIndex assigns the line number as the doc id.
val documents = sc.parallelize(Source.fromFile("CHANGELOG").getLines().filter(_.trim.length > 0).toSeq).map(_.split(" ").toSeq).zipWithIndex()
val hashingTF = new HashingTF(Math.pow(2, 18).toInt)
// Use each line's number as the doc id, and build a term-frequency (TF) vector from that line's tokens.
val tf_num_pairs = documents.map {
<