基于spark机器学习---------物品推荐
物品推荐
- import org.apache.spark.SparkContext
- import org.apache.spark.mllib.recommendation.{ALS, Rating}
- import org.jblas.DoubleMatrix
/**
 * Item-to-item recommendation: trains an ALS model on the MovieLens
 * "u.data" ratings file and prints the K items most similar to a given
 * item, ranked by cosine similarity of their latent-factor vectors.
 */
object ItemRecommend {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "ItemRecommend Test")
    try {
      // Each line of u.data is "user \t movie \t rating \t timestamp";
      // only the first three fields are needed.
      val rawData = sc.textFile("testdata/u.data")
      val rawRatings = rawData.map(_.split("\t").take(3))
      val ratings = rawRatings.map {
        case Array(user, movie, rating) =>
          Rating(user.toInt, movie.toInt, rating.toDouble)
      }
      // rank = 50, iterations = 10, lambda = 0.01
      val model = ALS.train(ratings, 50, 10, 0.01)

      val itemId = 567
      val itemFactor = model.productFeatures.lookup(itemId).head
      val itemVector = new DoubleMatrix(itemFactor)
      // Sanity check: an item's cosine similarity with itself should be 1.0.
      val a = cosineSimilarity(itemVector, itemVector)
      println("aaa------->" + a)

      // Cosine similarity between the target item and every item.
      val sims = model.productFeatures.map {
        case (id, factor) =>
          val factorVector = new DoubleMatrix(factor)
          (id, cosineSimilarity(factorVector, itemVector))
      }

      // The K items most similar to item 567 (top(K) already limits
      // the result, so no extra take is needed).
      val K = 10
      val sortedSims = sims.top(K)(Ordering.by[(Int, Double), Double] {
        case (_, similarity) => similarity
      })
      println(sortedSims.mkString("\n"))
    } finally {
      // Always release the local Spark context, even on failure.
      sc.stop()
    }
  }

  /** Cosine similarity of two vectors: dot(v1, v2) / (|v1| * |v2|). */
  def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double =
    vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
}
评价推荐结果:
- package com.bailian.bigdata
- import org.apache.spark.SparkContext
- import org.apache.spark.mllib.recommendation.{ALS, Rating}
- import org.jblas.DoubleMatrix
/**
 * Validates item-to-item recommendations by mapping recommended movie
 * IDs back to human-readable titles (from "u.item"), so the quality of
 * the similar-item list can be inspected by eye.
 */
object CheckItemRecommend {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "CheckItemRecommend Test")
    try {
      // "u.data": user \t movie \t rating \t timestamp
      val rawData = sc.textFile("testdata/u.data")
      val rawRatings = rawData.map(_.split("\t").take(3))
      val ratings = rawRatings.map {
        case Array(user, movie, rating) =>
          Rating(user.toInt, movie.toInt, rating.toDouble)
      }
      // rank = 50, iterations = 10, lambda = 0.01
      val model = ALS.train(ratings, 50, 10, 0.01)

      // "u.item" is pipe-delimited: movieId | title | ... — build an
      // id -> title lookup map on the driver.
      val movies = sc.textFile("testdata/u.item")
      val titles = movies.map(line => line.split("\\|").take(2))
        .map(array => (array(0).toInt, array(1)))
        .collectAsMap()

      val itemId = 567
      println("给定的电影名称为: " + titles(itemId))

      val itemFactor = model.productFeatures.lookup(itemId).head
      val itemVector = new DoubleMatrix(itemFactor)

      // Cosine similarity between the target item and every item.
      val sims = model.productFeatures.map {
        case (id, factor) =>
          val factorVector = new DoubleMatrix(factor)
          (id, cosineSimilarity(factorVector, itemVector))
      }

      // Take K + 1 results because the most similar item is the item
      // itself; drop it with slice(1, K + 1) below.
      val K = 10
      val sortedSims2 = sims.top(K + 1)(Ordering.by[(Int, Double), Double] {
        case (_, similarity) => similarity
      })
      val result = sortedSims2.slice(1, K + 1).map {
        case (id, sim) => (titles(id), sim)
      }.mkString("\n")
      println("被推荐电影为---》 " + result)
    } finally {
      // Always release the local Spark context, even on failure.
      sc.stop()
    }
  }

  /** Cosine similarity of two vectors: dot(v1, v2) / (|v1| * |v2|). */
  def cosineSimilarity(vec1: DoubleMatrix, vec2: DoubleMatrix): Double =
    vec1.dot(vec2) / (vec1.norm2() * vec2.norm2())
}
推荐模型效果的评估:K值平均准确率
- package com.bailian.bigdata
- import org.apache.spark.SparkContext
- import org.apache.spark.mllib.recommendation.{ALS, Rating}
- import org.jblas.DoubleMatrix
/**
 * Model evaluation with Average Precision at K (APK) for a single user
 * and Mean Average Precision at K (MAPK) averaged over all users.
 */
object MAPK {

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "MAPK Test")
    try {
      // "u.data": user \t movie \t rating \t timestamp
      val rawData = sc.textFile("testdata/u.data")
      val rawRatings = rawData.map(_.split("\t").take(3))
      val ratings = rawRatings.map {
        case Array(user, movie, rating) =>
          Rating(user.toInt, movie.toInt, rating.toDouble)
      }
      // rank = 50, iterations = 10, lambda = 0.01
      val model = ALS.train(ratings, 50, 10, 0.01)

      // Ground truth: the movie IDs user 789 actually rated.
      val userId = 789
      val moviesForUser = ratings.keyBy(_.user).lookup(userId)
      val actualMovies = moviesForUser.map(_.product)
      println("actualMovies: " + actualMovies)

      // Top-k model recommendations for the same user.
      val k = 10
      val topKRecs = model.recommendProducts(userId, k)
      val predictedMovies = topKRecs.map(_.product)
      println("predictedMovies: " + predictedMovies.mkString("\n"))

      // APK for this single user.
      val apk10 = avgPrecisionK(actualMovies, predictedMovies, k)
      println("apk10: " + apk10)

      // Collect the full item-factor matrix on the driver and broadcast
      // it, so each task can score every item against a user vector.
      val itemFactors = model.productFeatures.map {
        case (_, factor) => factor
      }.collect()
      val itemMatrix = new DoubleMatrix(itemFactors)
      println(itemMatrix.rows, itemMatrix.columns)
      val imBroadcast = sc.broadcast(itemMatrix)
      println("imBroacast: " + imBroadcast)

      // For every user, rank all items by predicted score.
      // NOTE(review): mapping row index -> product ID via (_._2 + 1)
      // assumes product IDs are contiguous and 1-based (true for
      // MovieLens 100k) — confirm before reusing on other data sets.
      val allRecs = model.userFeatures.map { case (user, array) =>
        val userVector = new DoubleMatrix(array)
        val scores = imBroadcast.value.mmul(userVector)
        val sortedWithId = scores.data.zipWithIndex.sortBy(-_._1)
        val recommendedIds = sortedWithId.map(_._2 + 1).toSeq
        (user, recommendedIds)
      }

      // Ground truth per user: (user, movieId) pairs grouped by user.
      val userMovies = ratings.map {
        case Rating(user, product, _) => (user, product)
      }.groupBy(_._1)

      // Join predictions with ground truth and average APK over users.
      // (Renamed from `MAPK` to avoid shadowing the enclosing object.)
      val meanAPK = allRecs.join(userMovies).map {
        case (_, (predicted, actualWithIds)) =>
          val actual = actualWithIds.map(_._2).toSeq
          avgPrecisionK(actual, predicted, k)
      }.reduce(_ + _) / allRecs.count()
      println("MAPK--------->" + meanAPK)
      // TODO: compare against Spark MLlib's built-in RankingMetrics.
    } finally {
      // Always release the local Spark context, even on failure.
      sc.stop()
    }
  }

  /**
   * Average Precision at K.
   *
   * @param actual    item IDs the user actually interacted with
   * @param predicted item IDs ranked by predicted relevance
   * @param k         cutoff — only the first k predictions are scored
   * @return APK in [0, 1]; 1.0 when `actual` is empty (nothing to miss)
   */
  def avgPrecisionK(actual: Seq[Int], predicted: Seq[Int], k: Int): Double = {
    val predK = predicted.take(k)
    var score = 0.0
    var numHits = 0.0
    // Each hit at 0-based position i contributes precision-so-far,
    // i.e. numHits / (i + 1).
    for ((p, i) <- predK.zipWithIndex) {
      if (actual.contains(p)) {
        numHits += 1.0
        score += numHits / (i.toDouble + 1.0)
      }
    }
    if (actual.isEmpty) {
      1.0
    } else {
      score / math.min(actual.size, k).toDouble
    }
  }
}
注意:以上代码参照《Spark机器学习》一书的例子实现,如有不足之处请指出。