spark-core 分析电影数据
数据下载地址
http://files.grouplens.org/datasets/movielens/ml-1m.zip
数据说明
- users.dat
UserID::Gender::Age::Occupation::Zip-code
用户号::性别::年龄::职业::邮政编码
- movies.dat
MovieID::Title::Genres
电影号::标题::流派
- ratings.dat
UserID::MovieID::Rating::Timestamp
用户号::电影号::评级::时间戳
指定电影的观众年龄和性别分布
- 获取用户数据,拿出需要的信息
- 获取评级数据,对电影过滤,join用户数据。
- 对sex,age做聚合,得出分布。
package com.caiw.movie
import org.apache.spark._
import org.apache.spark.sql.SparkSession
/**
 * Prints the gender/age distribution of the users who rated a given movie.
 *
 * Reads `users.dat` and `ratings.dat` as text whose fields are separated by
 * "::", keeps the ratings for one movie ID, joins them with the users on the
 * user ID, and counts the ratings per (gender, age) pair.
 */
object SparkWithMov {

  /** Movie ID analysed when none is supplied on the command line. */
  private val DefaultMovieId = "999"

  /**
   * Runs the analysis on a local Spark master and prints one line per
   * (gender, age) group.
   *
   * @param args optional; `args(0)`, when present, is the movie ID to analyse
   *             (defaults to "999")
   */
  def main(args: Array[String]): Unit = {
    val ss = SparkSession.builder().appName("movieAnalysis").master("local").getOrCreate()
    // try/finally so the session is stopped even when the job fails.
    try {
      val sc = ss.sparkContext
      val movieId = args.headOption.getOrElse(DefaultMovieId)

      // 1. Load the raw data files.
      val usersRdd = sc.textFile("spark/src/main/resources/ml-1m/users.dat")
      val ratingsRdd = sc.textFile("spark/src/main/resources/ml-1m/ratings.dat")

      // 2. Key each user by ID: userId -> (gender, age).
      val users = usersRdd.map(_.split("::")).map { fields =>
        (fields(0), (fields(1), fields(2)))
      }

      // 3. Keep the ratings of the requested movie, attach the user's
      //    (gender, age) via the join, and count ratings per (gender, age).
      val distribution = ratingsRdd
        .map(_.split("::"))
        .map(fields => (fields(0), fields(1)))
        .filter { case (_, ratedMovieId) => ratedMovieId == movieId }
        .join(users)
        .map { case (_, (_, genderAndAge)) => (genderAndAge, 1) }
        .reduceByKey(_ + _)

      // 4. Bring the (small) aggregated result to the driver and print it.
      distribution.collect().foreach { case ((gender, age), viewers) =>
        println(s"性别:${gender},年龄:${age},观看人数:${viewers}")
      }
    } finally {
      ss.stop()
    }
  }
}
热门电影top10(观众年龄18-27)
- 获取用户数据,排除范围外的用户
- 用户数据join评级数据拿到电影ID
- 通过电影ID拿到电影信息,通过Name聚合并取出top10
package com.caiw.movie
import org.apache.spark.sql.SparkSession
/**
 * Prints the ten most-rated movies among viewers whose age field is in the
 * range 18-27 (inclusive).
 *
 * Pipeline: filter the users by age, join with the ratings to get the movie
 * IDs those users rated, join with the movies to resolve titles, then count
 * the ratings per title and print the ten largest counts.
 */
object SparkWithMov02 {

  /**
   * Runs the analysis on a local 4-thread Spark master and prints
   * "title<TAB>count" for each of the top ten movies.
   *
   * @param args unused
   */
  def main(args: Array[String]): Unit = {
    val ss = SparkSession.builder().master("local[4]").appName("top10").getOrCreate()
    // try/finally so the session is stopped even when the job fails.
    try {
      // Viewers aged 18-27: userId -> age.
      // NOTE(review): in the MovieLens 1M data the age column is presumably a
      // bucket code rather than an exact age, so this range would select whole
      // buckets - confirm against the dataset README.
      val users = ss.read
        .text("spark/src/main/resources/ml-1m/users.dat")
        .rdd
        .map { row =>
          val fields = row.get(0).toString.split("::")
          fields(0) -> fields(2).toInt
        }
        .filter { case (_, age) => 18 <= age && age <= 27 }

      // Movies rated by those viewers: (movieId, age), one element per rating.
      val ratedMovies = ss.read
        .text("spark/src/main/resources/ml-1m/ratings.dat")
        .rdd
        .map { row =>
          val fields = row.get(0).toString.split("::")
          fields(0) -> fields(1)
        }
        .join(users)
        .map { case (_, movieAndAge) => movieAndAge }

      // Resolve each movie ID to its title, count ratings per title, and print
      // the ten titles with the most ratings.
      ss.read
        .text("spark/src/main/resources/ml-1m/movies.dat")
        .rdd
        .map { row =>
          val fields = row.get(0).toString.split("::")
          fields(0) -> fields(1)
        }
        .join(ratedMovies)
        // Key by the title (not the movie ID) so the output shows movie names.
        .map { case (_, (title, _)) => (title, 1) }
        .reduceByKey(_ + _)
        .sortBy(_._2, ascending = false)
        .take(10)
        .foreach { case (title, views) => println(s"${title}\t${views}") }
    } finally {
      ss.stop()
    }
  }
}