package com.bj.scalacode
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
/**
 * Spark RDD job that reads `::`-delimited movie, user and rating files from `./data`
 * and prints three rankings by average rating:
 *
 *  - the 12 highest-rated movies over all users,
 *  - the 10 highest-rated movies among users whose gender field is "M",
 *  - the 10 highest-rated movies among users whose gender field is "F".
 *
 * Fields are read by position: movies use fields 0 and 1 (id, title), ratings use
 * fields 0 to 2 (user id, movie id, rating), users use fields 0 and 1 (id, gender).
 */
object RDD_Movie_Users_Analyzer6 {
  // Imported locally so the helper signatures below can name the RDD type.
  import org.apache.spark.rdd.RDD

  /** Separator between fields in every input file. */
  private val FieldSeparator = "::"

  /**
   * Splits each line on the field separator and drops lines that do not contain at
   * least `minFields` fields (for example blank or truncated lines), so positional
   * access further down the pipeline cannot go out of bounds.
   *
   * @param lines     raw text lines
   * @param minFields minimum number of fields a line must have to be kept
   * @return one array of fields per retained line
   */
  private def parseFields(lines: RDD[String], minFields: Int): RDD[Array[String]] =
    lines.map(_.split(FieldSeparator)).filter(_.length >= minFields)

  /**
   * Computes the average rating per movie and returns the `n` best ones.
   *
   * @param ratings    (userId, movieId, rating) triples; the rating must parse as a Double
   * @param movieNames (movieId, title) pairs; movies without a matching title are
   *                   dropped by the inner join
   * @param n          number of movies to return
   * @return up to `n` (averageRating, title) pairs in descending order of average;
   *         the relative order of movies with equal averages is unspecified
   */
  private def topRatedMovies(
      ratings: RDD[(String, String, String)],
      movieNames: RDD[(String, String)],
      n: Int): Array[(Double, String)] =
    ratings
      .map { case (_, movieId, score) => (movieId, (score.toDouble, 1)) }
      // Accumulate (sum of ratings, number of ratings) per movie.
      .reduceByKey { case ((sumA, countA), (sumB, countB)) => (sumA + sumB, countA + countB) }
      .map { case (movieId, (sum, count)) => (movieId, sum / count) }
      .join(movieNames)
      // Re-key by the average so the result can be sorted on it.
      .map { case (_, (average, title)) => (average, title) }
      .sortByKey(ascending = false)
      .take(n)

  /**
   * Entry point. Runs the three rankings against the files under `./data` and prints
   * the results to standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Configure Spark to run locally on all available cores.
    val conf = new SparkConf().setMaster("local[*]").setAppName("RDD_Movie_Users_Analyzer6")
    // 2. Create (or reuse) the SparkSession.
    val spark = SparkSession.builder().config(conf).getOrCreate()
    try {
      // 3. Obtain the SparkContext backing the session and reduce log noise.
      val sc = spark.sparkContext
      sc.setLogLevel("warn")

      // 4. Load the raw input files.
      val movieRDD = sc.textFile("./data/movies.dat")
      val userRDD = sc.textFile("./data/users.dat")
      val ratingRDD = sc.textFile("./data/ratings.dat")

      // 5. Project the fields that are needed: (movieId, title) and (userId, movieId, rating).
      val movieinfo = parseFields(movieRDD, 2).map(f => (f(0), f(1)))
      val rating = parseFields(ratingRDD, 3).map(f => (f(0), f(1), f(2)))

      // 6-8. Top 12 movies by average rating over all users.
      println("打印出最受欢迎的前12部电影和平均评分:")
      topRatedMovies(rating, movieinfo, 12)
        .foreach { case (average, title) => println(s"${title} 的平均评分:${average}") }

      // 9. Attach each rating to the gender of the user who gave it. The joined RDD
      //    is cached because both the male and the female report read it.
      val userGender = parseFields(userRDD, 2).map(f => (f(0), f(1)))
      val genderRating = rating.keyBy(_._1).join(userGender).cache()

      // Ratings given by users whose gender field is "M".
      val maleFilterRatings = genderRating.filter(_._2._2.equals("M")).map(_._2._1)
      println("所有电影中最受男生喜欢的电影TOP10:")
      topRatedMovies(maleFilterRatings, movieinfo, 10)
        .foreach { case (average, title) => println(s"${title}平均评分:${average}") }

      // Ratings given by users whose gender field is "F".
      val femaleFilterRatings = genderRating.filter(_._2._2.equals("F")).map(_._2._1)
      println("所有电影中最受女生喜欢的电影TOP10:")
      topRatedMovies(femaleFilterRatings, movieinfo, 10)
        .foreach { case (average, title) => println(s"${title}平均评分:${average}") }
    } finally {
      // Always stop the session (which also stops its SparkContext), even when a
      // stage fails, so local executors and ports are released.
      spark.stop()
    }
  }
}