集体智慧编程中的电影推荐算法主要分两步:
1. 通过影评者对看过的相同电影的评分,计算影评者两两之间的兴趣相似度(代码中使用皮尔逊相关系数)
2. 根据影评者之间的相似度和对电影的评分,为每位影评者尚未看过的电影计算推荐指数:先对每个相似度大于 0 的其他影评者计算 相似度×电影评分,再求和并除以相似度之和,即 推荐指数 = Σ(相似度×电影评分) / Σ相似度
具体细节还是从代码中体会比较好:
package PCI2
import org.apache.spark.sql.SparkSession
/**
 * User-based movie recommendation on Spark.
 *
 * Step 1: compute the Pearson similarity between every pair of reviewers from
 * the ratings they gave to the movies both of them rated.
 * Step 2: for every reviewer, score each movie they have not rated as the
 * similarity-weighted average of the ratings given by positively-correlated
 * reviewers: sum(similarity * rating) / sum(similarity).
 */
object Recomment {

  /**
   * Reads tab-separated `reviewer<TAB>movie<TAB>rating` lines from the input
   * file, computes the recommendations and prints one line per reviewer.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("recomment").master("local[2]").getOrCreate()
    try {
      import spark.implicits._

      val movies = spark.read.textFile("file:///home/xdk/file/movie.txt")

      // (reviewer, (movie, rating)). Blank lines (e.g. a trailing newline at
      // the end of the file) are skipped instead of failing on fields(1).
      val keyPeople = movies
        .filter(line => line.trim.nonEmpty)
        .map { line =>
          val fields = line.split("\t")
          (fields(0), (fields(1), fields(2)))
        }
        .rdd

      // (movie, (reviewer, rating))
      val keyMovie = keyPeople.map { case (person, (movie, rating)) => (movie, (person, rating)) }

      // Self-join on the movie: every ordered pair of *different* reviewers
      // that rated the same movie.
      val filtered = keyMovie
        .join(keyMovie)
        .filter { case (_, ((p1, _), (p2, _))) => p1 != p2 }

      // Normalise each pair so the lexicographically smaller name comes first;
      // (a, b) and (b, a) then collapse into one key after distinct().
      val peoplePair = filtered.map { case (movie, ((p1, r1), (p2, r2))) =>
        if (p1 < p2) ((p1, p2), (movie, r1, r2))
        else ((p2, p1), (movie, r2, r1))
      }

      // Pairwise similarity between reviewers.
      val sim = peoplePair
        .distinct()
        .groupByKey()
        .map { case (pair, ratings) => (pair, pearson(ratings)) }

      // Only positively-correlated reviewers contribute to recommendations.
      val simulator = sim.filter { case (_, score) => score > 0 }

      // ((target, other), (movie, other's rating)) for movies BOTH have rated.
      val common = filtered.map { case (movie, ((p1, _), (p2, r2))) => ((p1, p2), (movie, r2)) }

      // ((target, other), (movie, other's rating)) for EVERY movie `other` rated.
      // Crossing the distinct reviewers with the ratings yields the same set as
      // crossing ratings with ratings, without the quadratic blow-up in rows.
      val people = keyPeople.map(_._1).distinct()
      val hisMovie = people
        .cartesian(keyPeople)
        .filter { case (target, (other, _)) => target != other }
        .map { case (target, (other, movieRating)) => ((target, other), movieRating) }
        .distinct()

      // Movies `other` rated that `target` has not rated yet.
      val recomm = hisMovie.subtract(common)

      // Similarities are keyed by the normalised pair; emit both directions so
      // they can be joined against the ordered (target, other) key.
      val simulator1 = simulator.flatMap { case ((a, b), score) =>
        Seq(((a, b), score), ((b, a), score))
      }

      val joinRes = recomm.join(simulator1)

      // ((target, movie), (similarity * rating, similarity))
      val recommScore = joinRes.map { case ((target, _), ((movie, rating), score)) =>
        ((target, movie), (rating.toDouble * score, score))
      }
      val recommScoreSum = recommScore.reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))

      // (target, [(movie, weighted average rating)])
      val recommMovies = recommScoreSum
        .map { case ((target, movie), (weighted, simSum)) => (target, (movie, weighted / simSum)) }
        .groupByKey()

      println("======================")
      // collect() brings the rows to the driver so they are printed there.
      recommMovies.collect().foreach(println)
    } finally {
      // Always release the Spark context, even when the job fails.
      spark.stop()
    }
  }

  /**
   * Pearson correlation coefficient of two reviewers over their shared movies.
   *
   * @param arg one `(movie, ratingA, ratingB)` triple per shared movie; both
   *            ratings must be parseable by `String.toDouble`
   * @return the correlation, or 0.0 when it is undefined (no shared movies, or
   *         one side has no variance so the denominator is zero or NaN)
   */
  def pearson(arg: Iterable[(String, String, String)]): Double = {
    val points = arg.iterator.map { case (_, a, b) => (a.toDouble, b.toDouble) }.toVector
    val n = points.size

    val sumX = points.map(_._1).sum
    val sumY = points.map(_._2).sum
    val sumXY = points.map { case (x, y) => x * y }.sum
    val sumXX = points.map { case (x, _) => x * x }.sum
    val sumYY = points.map { case (_, y) => y * y }.sum

    val num = sumXY * n - sumX * sumY
    val den = math.sqrt(n * sumXX - sumX * sumX) * math.sqrt(n * sumYY - sumY * sumY)

    // `den > 0` is false for both 0 and NaN; NaN arises when rounding pushes a
    // variance term slightly below zero before the square root.
    if (den > 0) num / den else 0.0
  }
}
测试数据(每行依次为影评者、电影、评分,字段之间以制表符 \t 分隔):
Lisa Rose The Night Listener 3.0
Lisa Rose You, Me and Dupree 2.5
Lisa Rose Lady in the Water 2.5
Lisa Rose Superman Returns 3.5
Lisa Rose Just My Luck 3.0
Lisa Rose Snakes on a Plane 3.5
Michael Phillips Lady in the Water 2.5
Michael Phillips Superman Returns 3.5
Michael Phillips The Night Listener 4.0
Michael Phillips Snakes on a Plane 3.0
Mick LaSalle The Night Listener 3.0
Mick LaSalle You, Me and Dupree 2.0
Mick LaSalle Lady in the Water 3.0
Mick LaSalle Superman Returns 3.0
Mick LaSalle Just My Luck 2.0
Mick LaSalle Snakes on a Plane 4.0
Toby Superman Returns 4.0
Toby You, Me and Dupree 1.0
Toby Snakes on a Plane 4.5
Gene Seymour The Night Listener 3.0
Gene Seymour You, Me and Dupree 3.5
Gene Seymour Lady in the Water 3.0
Gene Seymour Superman Returns 5.0
Gene Seymour Just My Luck 1.5
Gene Seymour Snakes on a Plane 3.5
Jack Matthews Lady in the Water 3.0
Jack Matthews Superman Returns 5.0
Jack Matthews The Night Listener 3.0
Jack Matthews You, Me and Dupree 3.5
Jack Matthews Snakes on a Plane 4.0
Claudia Puig Superman Returns 4.0
Claudia Puig Just My Luck 3.0
Claudia Puig The Night Listener 4.5
Claudia Puig You, Me and Dupree 2.5
Claudia Puig Snakes on a Plane 3.5