Purpose
Recommend similar movies based on their tags.
Algorithm Design
Recommendation based on UGC (user-generated content, i.e. tags), refined with TF-IDF weighting.
Algorithm Implementation
Extract the tag content to build a content feature vector for each movie, from which a similarity matrix can then be computed;
To keep popular tags from skewing the extracted features, adjust each tag's weight with the TF-IDF algorithm so that the feature vectors match user preferences as closely as possible. The two formulas involved are given below.
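For reference: Spark ML's IDF applies the smoothed variant of TF-IDF shown here (m is the number of movies, df(t) the number of movies whose tag list contains term t), and the similarity between two feature vectors is their cosine:

\mathrm{tfidf}(t, d) = \mathrm{tf}(t, d) \cdot \log\frac{m + 1}{\mathrm{df}(t) + 1}

\mathrm{sim}(\mathbf{p}, \mathbf{q}) = \frac{\mathbf{p} \cdot \mathbf{q}}{\lVert \mathbf{p} \rVert \, \lVert \mathbf{q} \rVert}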
Core Code
val tokenizer = new Tokenizer().setInputCol("genres").setOutputCol("words")
val wordsData: DataFrame = tokenizer.transform(movieTagsDF)

val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(50)
val featurizedData: DataFrame = hashingTF.transform(wordsData)

val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel = idf.fit(featurizedData)
val rescaledData: DataFrame = idfModel.transform(featurizedData)

val movieFeatures = rescaledData.map(
    row => (row.getAs[Int]("mid"), row.getAs[SparseVector]("features").toArray)
  )
  .rdd
  .map(
    x => (x._1, new DoubleMatrix(x._2))
  )
movieFeatures.collect().foreach(println)

val movieRecs = movieFeatures.cartesian(movieFeatures)
  .filter {
    case (a, b) => a._1 != b._1
  }
  .map {
    case (a, b) => {
      val simScore = this.cosineSim(a._2, b._2)
      (a._1, (b._1, simScore))
    }
  }
  .filter(_._2._2 > 0.6)
  .groupByKey()
  .map {
    case (mid, items) => MovieRecs(mid, items.toList.sortWith(_._2 > _._2).map(x => Recommendation(x._1, x._2)))
  }
  .toDF()
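As a quick sanity check of the cosine similarity used above, a minimal sketch with made-up vectors (the values are hypothetical, not taken from the dataset):

import org.jblas.DoubleMatrix

val a = new DoubleMatrix(Array(1.0, 0.0, 2.0))
val b = new DoubleMatrix(Array(1.0, 1.0, 2.0))
// dot = 1*1 + 0*1 + 2*2 = 5; norm2(a) = sqrt(5), norm2(b) = sqrt(6)
val sim = a.dot(b) / (a.norm2() * b.norm2()) // = 5 / sqrt(30) ≈ 0.913
println(sim)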
Complete Code
import org.apache.spark.SparkConf
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.jblas.DoubleMatrix
// Data source: movie content information
case class Movie(mid: Int, name: String, descri: String, timelong: String, issue: String,
                 shoot: String, language: String, genres: String, actors: String, directors: String)

// Named MovieRating to distinguish it from the Rating class in Spark MLlib
case class MovieRating(uid: Int, mid: Int, score: Double, timestamp: Int)

case class MongoConfig(uri: String, db: String)

// Basic recommendation object
case class Recommendation(mid: Int, score: Double)

// Per-movie similarity list, built from the movies' content feature vectors
case class MovieRecs(mid: Int, recs: Seq[Recommendation])
object ContentRecommender {

  val MONGODB_MOVIE_COLLECTION = "Movie"
  val CONTENT_MOVIE_RECS = "ContentMovieRecs"

  def main(args: Array[String]): Unit = {
    val config = Map(
      "spark.cores" -> "local[*]",
      "mongo.uri" -> "mongodb://hadoop100:27017/recommender",
      "mongo.db" -> "recommender"
    )

    val sparkConf = new SparkConf().setMaster(config("spark.cores")).setAppName("ContentRecommender")
    val spark = SparkSession.builder().config(sparkConf).getOrCreate()
    import spark.implicits._

    implicit val mongoConfig = MongoConfig(config("mongo.uri"), config("mongo.db"))

    // Load and preprocess the data
    val movieTagsDF = spark.read
      .option("uri", mongoConfig.uri)
      .option("collection", MONGODB_MOVIE_COLLECTION)
      .format("com.mongodb.spark.sql")
      .load()
      .as[Movie]
      .map(
        // Keep mid, name and genres as the raw content features; the tokenizer splits
        // on whitespace by default, so replace the '|' separators in genres with spaces
        x => (x.mid, x.name, x.genres.map(c => if (c == '|') ' ' else c))
      )
      .toDF("mid", "name", "genres")
      .cache()
    // Set up the tokenizer: it reads the genres column and produces a words column
    val tokenizer = new Tokenizer().setInputCol("genres").setOutputCol("words")

    // Apply the tokenizer to the raw data, adding the new words column
    val wordsData: DataFrame = tokenizer.transform(movieTagsDF)
    // wordsData.show(truncate = false)  // truncate = false prints full column contents
    // +---+----+------+-------+
    // |mid|name|genres|words  |
    // +---+----+------+-------+
    // |...|... |Drama |[drama]|

    // HashingTF turns a sequence of words into its term-frequency vector
    val hashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures").setNumFeatures(50)
    val featurizedData: DataFrame = hashingTF.transform(wordsData)
    // rawFeatures is stored as a sparse vector

    // IDF estimator: fitting it yields an IDF model
    val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
    // Train the IDF model, i.e. compute each term's inverse document frequency
    val idfModel = idf.fit(featurizedData)
    // Apply the model to get each term's TF-IDF value, used as the new feature vector
    val rescaledData: DataFrame = idfModel.transform(featurizedData)
    // Convert the sparse feature vectors to jblas DoubleMatrix (dense double vectors)
    val movieFeatures = rescaledData.map(
        row => (row.getAs[Int]("mid"), row.getAs[SparseVector]("features").toArray)
      )
      .rdd
      .map(
        x => (x._1, new DoubleMatrix(x._2))
      )
    movieFeatures.collect().foreach(println)
    // Compute pairwise similarities between all movies, starting from the Cartesian product
    val movieRecs = movieFeatures.cartesian(movieFeatures)
      .filter {
        // Drop pairs of a movie with itself
        case (a, b) => a._1 != b._1
      }
      .map {
        case (a, b) => {
          val simScore = this.cosineSim(a._2, b._2)
          (a._1, (b._1, simScore))
        }
      }
      .filter(_._2._2 > 0.6) // keep only pairs with similarity above 0.6
      .groupByKey()
      .map {
        case (mid, items) => MovieRecs(mid, items.toList.sortWith(_._2 > _._2).map(x => Recommendation(x._1, x._2)))
      }
      .toDF()
    movieRecs.write
      .option("uri", mongoConfig.uri)
      .option("collection", CONTENT_MOVIE_RECS)
      .mode("overwrite")
      .format("com.mongodb.spark.sql")
      .save()

    spark.stop()
  }
  // Cosine similarity between two vectors
  def cosineSim(movie1: DoubleMatrix, movie2: DoubleMatrix): Double = {
    movie1.dot(movie2) / (movie1.norm2() * movie2.norm2())
  }
}
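After a run, the similarity lists can be spot-checked by reading them back from MongoDB. A minimal sketch, reusing the spark session, mongoConfig and collection name from main (the mid value 1 is only an example):

// Hypothetical check: load ContentMovieRecs and look at one movie's neighbors
val recsDF = spark.read
  .option("uri", mongoConfig.uri)
  .option("collection", CONTENT_MOVIE_RECS)
  .format("com.mongodb.spark.sql")
  .load()
recsDF.filter($"mid" === 1).show(truncate = false)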