前面已经进行了分词。本文介绍如何进行推荐,核心代码如下:
// Load the raw topic file and turn every text line into a Topic_desc record.
// A line is split on the literal "$," separator. The first piece is split again on "(" and
// the second element of that split is parsed as the integer id; the second piece of the
// line is kept verbatim as the description.
private val rdd: RDD[Topic_desc] = spark.sparkContext
  .textFile(SOURCE_URL)
  .map { rawLine =>
    val fields = rawLine.split("\\$,")
    val topicId = fields(0).split("\\(")(1).toInt
    Topic_desc(topicId, fields(1))
  }
// DataFrame view of the parsed Topic_desc records.
private val pframe: DataFrame = rdd.toDF()
// Expose the records to Spark SQL under the view name "filter".
// createOrReplaceTempView is the replacement for registerTempTable, which has been
// deprecated since Spark 2.0; the registered name is unchanged.
pframe.createOrReplaceTempView("filter")
// Keep only the rows whose description is something other than a lone ")".
// NOTE(review): presumably a lone ")" is what remains of a record with an empty
// description -- confirm against the input file format.
private val filterFrame: DataFrame = spark.sql("select id,desc from filter where desc!=')'")
// Extract the text that is actually needed from every remaining row: the part of the
// description that precedes the first ")" is split on "," into an array of strings, and
// paired with the row's id in a Topic_desc_result.
private val dframerdd: RDD[Topic_desc_result] = filterFrame.rdd.map { row =>
  // The type parameter is given explicitly: with a bare getAs("desc") followed by a member
  // call there is no expected type, so the compiler infers Nothing for it, which is unsafe
  // at runtime.
  val description: String = row.getAs[String]("desc")
  // takeWhile yields the same prefix as split("\\)")(0) but does not throw when the value
  // consists solely of ")" characters (split returns an empty array in that case).
  val descArray: Array[String] = description.takeWhile(_ != ')').split(",")
  // Here the type of the id is inferred from the constructor parameter it is passed to.
  Topic_desc_result(row.getAs("id"), descArray)
}
// Convert the RDD of Topic_desc_result records into a DataFrame.
// NOTE(review): presumably this is the input for the Word2Vec step declared next -- confirm.
private val dframe: DataFrame = dframerdd.toDF()
private val word2Vec: Word2Vec