TextRank Code Implementation
package program
import algorithm.TextRank
import org.apache.spark.sql.SaveMode
import util.{SegmentWordUtil, SparkSessionBase}
import scala.collection.mutable
import scala.collection.mutable.ListBuffer
object ComputeTextRank {
def main(args: Array[String]): Unit = {
//create a Spark session via SparkSessionBase
val session = SparkSessionBase.createSparkSession()
session.sql("use program")
//fetch the program metadata, then segment it into words
// val articleDF = session.sql("select * from item_info limit 20")
val articleDF = session.table("item_info").limit(1000)
val seg = new SegmentWordUtil()
//[itemID,[w1,w2,w3......]]
val wordsRDD = articleDF.rdd.mapPartitions(seg.segeFun)
//compute the TextRank (TR) value of each word in each program
val tralgm = new TextRank()
//transform builds the word co-occurrence graph
val transformGraphRDD = wordsRDD.map(x => (x._1, tralgm.transform(x._2)))
//[itemid,map[word,tr]]
val rankRDD = transformGraphRDD.map(x => (x._1, tralgm.rank(x._2)))
// rankRDD.foreach(println)
/**
 * Multiply each word's TR value by that word's IDF for every program:
 * (1) Load the keyword_idf (word, idf) table into a broadcast variable
 * (2) Traverse rankRDD, look up each word, and compute TR * IDF
 */
val word2IDFMap = mutable.Map[String, Double]()
session.table("tmp_program.keyword_idf").rdd.collect().foreach(row => {
word2IDFMap += ((row.getAs[String]("word"), row.getAs[Double]("idf")))
})
val word2IDFBroad = session.sparkContext.broadcast(word2IDFMap)
//use tr * idf of each word in each article as the criterion for selecting keywords
val keyWordsWithWeightsRDD = rankRDD.map(data => {
val itemID = data._1
val word2TR = data._2
val idfMap = word2IDFBroad.value
//itemID [word1: combined weight1, word2: combined weight2, ...]
val word2Weights = word2TR.map(t => {
val word = t._1
val tr = t._2
//idfMap(word) is the word's idf value; fall back to the raw TR when the word has no IDF entry
val weights = if (idfMap.contains(word)) idfMap(word) * tr else tr
//the combined weight for this word
(word, weights)
})
(itemID, word2Weights)
})
//sort by the combined weight and keep the top K words
val sortByWeightRDD = keyWordsWithWeightsRDD
// .filter(_._2.size > 10)
.map(x => (x._1, sortByWeights(x._2)))
.flatMap(explode)
//convert sortByWeightRDD to a DataFrame
import session.implicits._
val word2WeightsDF = sortByWeightRDD.toDF("item_id", "word", "weight")
session.sql("use tmp_program")
word2WeightsDF.write.mode(SaveMode.Overwrite).saveAsTable("keyword_tr")
/**
 * create table keyword_tr(
 * item_id BIGINT comment "item id",
 * word STRING comment "keyword",
 * weight DOUBLE comment "tr * idf weight"
 * )
 * COMMENT "keyword_tr"
 * row format delimited fields terminated by ','
 * LOCATION '/user/hive/warehouse/tmp_program.db/keyword_tr';
 */
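//Note: saveAsTable with SaveMode.Overwrite recreates the table (schema included), so the DDL
//above only documents the intended layout; insertInto, used in ComputeTFIDF below, would
//instead require the table to already exist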
session.close()
}
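//flatten (itemID, Map(word -> weight)) into (itemID, word, weight) rows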
def explode(data: (Long, Map[String, Double])) = {
val itemID = data._1
val ds = data._2
val list = new ListBuffer[(Long, String, Double)]
for (elem <- ds) {
list += ((itemID, elem._1, elem._2))
}
list.iterator
}
def sortByWeights(doc: mutable.HashMap[String, Double]) = {
//sort words by combined weight in descending order and keep the top 10
doc.toSeq.sortBy(-_._2).take(10).toMap
}
}
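The algorithm.TextRank class imported above is not listed in this section. As a rough sketch of the shape the pipeline assumes, with transform building a word co-occurrence graph from the segmented word list and rank iterating the TextRank update, something like the following would fit; the window size, damping factor, and iteration count are illustrative assumptions, not values taken from the original class.

package algorithm

import scala.collection.mutable

class TextRank extends Serializable {
  private val damping = 0.85   // assumed damping factor
  private val window = 5       // assumed co-occurrence window size
  private val iterations = 20  // assumed number of iterations

  // Build an undirected co-occurrence graph: each word is linked to the words
  // that appear within `window` positions of it.
  def transform(words: Seq[String]): mutable.Map[String, mutable.Set[String]] = {
    val graph = mutable.Map[String, mutable.Set[String]]()
    for (i <- words.indices; j <- (i + 1) until math.min(i + window, words.length)) {
      if (words(i) != words(j)) {
        graph.getOrElseUpdate(words(i), mutable.Set[String]()) += words(j)
        graph.getOrElseUpdate(words(j), mutable.Set[String]()) += words(i)
      }
    }
    graph
  }

  // Iterate TR(w) = (1 - d) + d * sum(TR(v) / degree(v)) over the neighbors v of w.
  def rank(graph: mutable.Map[String, mutable.Set[String]]): mutable.HashMap[String, Double] = {
    val tr = mutable.HashMap[String, Double]()
    graph.keys.foreach(w => tr(w) = 1.0)
    for (_ <- 1 to iterations) {
      val next = graph.map { case (w, neighbors) =>
        (w, (1 - damping) + damping * neighbors.toSeq.map(v => tr(v) / graph(v).size).sum)
      }
      next.foreach { case (w, r) => tr(w) = r }
    }
    tr
  }
}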
PageRank Code Implementation
package util
import org.apache.spark.sql.SparkSession
/**
 * A Spark implementation of PageRank
 */
object SparkPageRank {
def main(args: Array[String]) {
val spark = SparkSession
.builder
.appName("SparkPageRank")
.master("local")
.getOrCreate()
//number of iterations (second CLI argument; defaults to 10)
val iters = if (args.length > 1) args(1).toInt else 10
//edge list in key-value format
val lines = spark.sparkContext.parallelize(List(
//A links to B
("A", "B"),
("A", "C"),
("B", "A"),
("B", "C"),
("C", "A"),
("C", "B"),
("C", "D"),
("D", "C")
))
/**
 * Adjacency lists after groupByKey:
 * A [B,C]
 * B [A,C]
 * ...
 */
val links = lines.groupByKey().cache()
/**
 * Initialize every page's rank (PR) to 1.0:
 * A 1.0
 * B 1.0
 * ...
 */
var ranks = links.mapValues(v => 1.0)
for (i <- 1 to iters) {
val contribs = links.join(ranks).values.flatMap { case (urls, rank) =>
val size = urls.size
urls.map(url => (url, rank / size))
}
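//damped update: new PR(p) = 0.15 + 0.85 * sum(PR(q) / outDegree(q)) over all pages q
//linking to p; 0.85 is the standard damping factor d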
ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _)
}
val output = ranks.collect()
output.foreach(tup => println(s"${tup._1} has rank: ${tup._2}"))
spark.stop()
}
}
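Both ComputeTextRank and ComputeTFIDF create their session through util.SparkSessionBase, which is also not listed here. A minimal sketch, assuming a Hive-enabled session is all it provides (the app name and master are placeholders; in production the master would normally come from spark-submit):

package util

import org.apache.spark.sql.SparkSession

object SparkSessionBase {
  // A Hive-enabled session, so session.sql / session.table can read and write
  // the program and tmp_program Hive databases.
  def createSparkSession(): SparkSession = {
    SparkSession
      .builder
      .appName("recommend-program") // assumed app name
      .master("local[*]")           // assumed master
      .enableHiveSupport()
      .getOrCreate()
  }
}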
TF-IDF Algorithm Implementation
package program
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel, IDF}
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SaveMode
import util.{SegmentWordUtil, SparkSessionBase}
import scala.collection.mutable.ListBuffer
/**
 * Segment the program metadata into words, then compute TF-IDF values
 * hive --service metastore 2>&1 >> /opt/meta.log &
 */
object ComputeTFIDF {
def main(args: Array[String]): Unit = {
//create a Spark session via SparkSessionBase
val session = SparkSessionBase.createSparkSession()
import session.implicits._
/**
 * Two ways to choose which Hive database a query reads from:
 * 1. sql("use database") first
 * 2. qualify the table name: sql("select * from program.item_info")
 */
session.sql("use program")
//fetch the program metadata, then segment it into words
val articleDF = session.sql("select * from item_info limit 100")
// val articleDF = session.table("item_info")
//word segmentation
val seg = new SegmentWordUtil()
val words_df = articleDF.rdd.mapPartitions(seg.segeFun).toDF("item_id", "words")
words_df.show(false)
//create a CountVectorizer to collect every word that appears, forming the vocabulary (bag of words)
val countVectorizer = new CountVectorizer()
countVectorizer.setInputCol("words")
countVectorizer.setOutputCol("features")
countVectorizer.setVocabSize(1000)
//a word must appear in at least 1 document; a value between 0 and 1 is interpreted as a fraction of documents instead
countVectorizer.setMinDF(1.0)
//train the bag-of-words (vocabulary) model
val cvModel = countVectorizer.fit(words_df)
// //save the bag-of-words model to HDFS
// cvModel.write.overwrite().save("hdfs://node01:9000/recommond_program/models/CV.model")
//
// //inspect the saved model contents via Spark SQL
// session.read.parquet("hdfs://node01:9000/recommond_program/models/CV.model/data/*").show()
// //the full vocabulary
// cvModel.vocabulary.foreach(println)
//count each document's words against the vocabulary: the term frequency (TF)
val cv_result = cvModel.transform(words_df)
cv_result.show(10,false)
/**
 * Create the IDF estimator and
 * compute each word's idf from the term counts (cv_result --> TF)
 */
val idf = new IDF()
idf.setInputCol("features")
idf.setOutputCol("features_tfidf")
//compute the inverse document frequency of each word
val idfModel = idf.fit(cv_result)
idfModel.write.overwrite().save("hdfs://node01:9000/recommond_program/models/IDF.model")
/**
 * TF is per-document (e.g. w1:10 in one document, w1:100 in another),
 * whereas IDF is computed over the whole corpus:
 * word : idf value
 */
session.read.parquet("hdfs://node01:9000/recommond_program/models/IDF.model/data").show(10,false)
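//Spark ML's IDF formula: idf(t) = ln((m + 1) / (df(t) + 1)), where m is the number of
//documents and df(t) is the number of documents containing term t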
/**
 * Save each word's IDF (inverse document frequency) into a Hive table
 */
//arrange the rows as (index, word, IDF)
val keywordsWithIDFList = new ListBuffer[(Int, String, Double)]
val words = cvModel.vocabulary
val idfs = idfModel.idf.toArray
for (index <- 0 until (words.length)) {
keywordsWithIDFList += ((index, words(index), idfs(index)))
}
//save the data
session.sql("use tmp_program")
session
.sparkContext
.parallelize(keywordsWithIDFList)
.toDF("index", "keywords", "idf")
.write
.mode(SaveMode.Overwrite)
.insertInto("keyword_idf")
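//insertInto matches columns by position rather than by name, so the DataFrame column names
//above ("index", "keywords", "idf") need not equal the Hive table's columns (index, word, idf);
//only the order and types must line up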
/**
 * Combine the idfModel with the term counts (cv_result, the TF side) to compute TF-IDF
 */
val tfIdfResult = idfModel.transform(cv_result)
tfIdfResult.show()
//pick each document's top keywords by TF-IDF
val keyword2TFIDF = tfIdfResult.rdd.mapPartitions(partition => {
val rest = new ListBuffer[(Long, Int, Double)]
val topN = 10
while (partition.hasNext) {
val row = partition.next()
val tfidfVec = row.getAs[SparseVector]("features_tfidf")
//SparseVector stores vocabulary indices and values in parallel arrays; pairing values
//with their positions in the values array alone would yield wrong word indices
val tmpList = new ListBuffer[(Int, Double)]
for (i <- 0 until tfidfVec.indices.length)
tmpList += ((tfidfVec.indices(i), tfidfVec.values(i)))
//sort descending by TF-IDF and keep the top N words per document
val buffer = tmpList.sortBy(-_._2)
for (item <- buffer.take(topN))
rest += ((row.getAs[Long]("item_id"), item._1, item._2))
}
}
rest.iterator
}).toDF("item_id", "index", "tfidf")
keyword2TFIDF.show(10)
keyword2TFIDF.createGlobalTempView("keywordsByTable")
//map each index back to its word via keyword_idf, then save to a Hive table
session.sql("select * from keyword_idf a join global_temp.keywordsByTable b on a.index = b.index")
.select("item_id", "word", "tfidf")
.write
.mode(SaveMode.Overwrite)
.insertInto("keyword_tfidf")
session.close()
}
}
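util.SegmentWordUtil, whose segeFun drives the segmentation in both programs above, is likewise not listed. A minimal sketch, assuming the text sits in an item_description column and substituting whitespace splitting for a real Chinese segmenter (such as HanLP or ansj), just to show the (item_id, words) contract the rest of the code relies on:

package util

import org.apache.spark.sql.Row

class SegmentWordUtil extends Serializable {
  // Turn each row into an (item_id, words) pair. A real implementation would run
  // a Chinese segmenter and a stop-word filter; whitespace splitting is only a
  // placeholder, and the item_description column name is an assumption.
  def segeFun(partition: Iterator[Row]): Iterator[(Long, List[String])] = {
    partition.map { row =>
      val itemId = row.getAs[Long]("item_id")
      val text = row.getAs[String]("item_description")
      (itemId, text.split("\\s+").filter(_.nonEmpty).toList)
    }
  }
}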