// Print the top ten highest-rated movies (by mean rating) with their titles and average scores.
package movies

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

/**
 * Computes and prints the ten highest-rated movies (by mean rating) from
 * MovieLens-style data files.
 *
 * Expected input formats (`::`-delimited):
 *   - ratings.dat: userID::movieID::rating::timestamp
 *   - movies.dat:  movieID::title::genres
 *
 * Pipeline:
 *   1. Parse ratings into (movieID, rating) pairs.
 *   2. Aggregate a running (sum, count) per movieID and derive the mean.
 *   3. Join with (movieID, title) pairs from movies.dat.
 *   4. Take the ten movies with the highest average rating and print them.
 */
object RDD_Movie_User_Analyzer {

  def main(args: Array[String]): Unit = {
    // Optional first argument overrides the data directory; the default
    // preserves the original hard-coded HDFS location, so existing callers
    // are unaffected. (users.dat was previously loaded here but never used,
    // so it is no longer read.)
    val dataDir = if (args.nonEmpty) args(0) else "hdfs://10.0.157.167/data"

    // Build the SparkSession and grab the underlying SparkContext.
    val spark = SparkSession.builder()
      .appName("movie")
      .master("local")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext

    // Raw input lines from HDFS.
    val moviesRDD = sc.textFile(s"$dataDir/movies.dat")
    val ratingRDD = sc.textFile(s"$dataDir/ratings.dat")

    // (movieID, averageRating).
    // aggregateByKey keeps only a running (sum, count) per key and combines
    // map-side before the shuffle — unlike groupByKey, which ships every
    // individual rating across the network and materializes the whole
    // per-movie rating list in memory just to sum it.
    val avgByMovie: RDD[(String, Double)] = ratingRDD
      .map(_.split("::"))
      .map(fields => (fields(1), fields(2).toDouble)) // (movieID, rating)
      .aggregateByKey((0.0, 0L))(
        { case ((sum, cnt), r)            => (sum + r, cnt + 1) },   // fold one rating in
        { case ((s1, c1), (s2, c2))       => (s1 + s2, c1 + c2) }    // merge partitions
      )
      .mapValues { case (sum, cnt) => sum / cnt }

    // (movieID, title) lookup pairs.
    val movieNames: RDD[(String, String)] =
      moviesRDD.map(_.split("::")).map(fields => (fields(0), fields(1)))

    // Join on movieID; the joined value is (averageRating, title),
    // reshaped into a (movieID, title, averageRating) triple.
    val res: RDD[(String, String, Double)] =
      avgByMovie.join(movieNames).map { case (id, (rating, title)) =>
        (id, title, rating)
      }

    // top() keeps a bounded 10-element heap per partition and merges on the
    // driver — cheaper than sortBy(...).take(10), which needs a full shuffle
    // sort of the entire RDD.
    val top10: Array[(String, String, Double)] = res.top(10)(Ordering.by(_._3))

    // Print the results (labels kept byte-identical to the original output).
    top10.foreach(x => println("id:" + x._1 + ",电影名:" + x._2 + ",电影评分:" + x._3))

    // Release cluster resources.
    spark.stop()
  }
}