spark版本:2.2.0
数据集:这是一个常用于练习协同过滤(CF)的数据集,这里直接拿来使用。
MovieLens 1M Dataset
文本格式:
users.dat
UserID::Gender::Age::Occupation::Zip-code
movies.dat
MovieID::Title::Genres
ratings.dat
UserID::MovieID::Rating::Timestamp
因为Spark 2.2的SparkSession读取器对这种以“::”(多字符)分隔的文本支持得不是很好,所以先用RDD读取文本,再通过样例类将RDD转换为DataFrame来操作。
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
/**
 * One record of `users.dat`, whose line layout is
 * `UserID::Gender::Age::Occupation::Zip-code`.
 *
 * The field names are reused as column names once an RDD of this class is
 * turned into a DataFrame, so they must not be renamed independently of the queries.
 *
 * @param UserID     numeric user identifier
 * @param Gender     gender flag as it appears in the file (the queries compare it with "M")
 * @param Age        age value as stored in the file
 * @param Occupation occupation value as stored in the file
 * @param Zip_code   zip code, kept as text
 */
case class Users(
  UserID: Int,
  Gender: String,
  Age: Int,
  Occupation: Int,
  Zip_code: String
)
/**
 * One record of `movies.dat`, whose line layout is `MovieID::Title::Genres`.
 *
 * The field names are reused as DataFrame column names (`MovieID` is the join key
 * shared with the ratings data).
 *
 * @param MovieID movie identifier, kept as text
 * @param Title   movie title
 * @param Genres  genres field exactly as it appears in the file
 */
case class Movies(
  MovieID: String,
  Title: String,
  Genres: String
)
/**
 * One record of `ratings.dat`, whose line layout is
 * `UserID::MovieID::Rating::Timestamp`.
 *
 * The field names are reused as DataFrame column names (`UserID` and `MovieID`
 * are the join keys towards the users and movies data).
 *
 * @param UserID    numeric identifier of the rating user
 * @param MovieID   identifier of the rated movie, kept as text
 * @param Rating    rating value
 * @param Timestamp timestamp field, kept as text
 */
case class Ratings(
  UserID: Int,
  MovieID: String,
  Rating: Double,
  Timestamp: String
)
/**
 * Exploratory top-10 reports over the MovieLens 1M dataset, built with Spark SQL DataFrames.
 *
 * The three `::`-delimited text files are read as RDDs of lines, split, mapped onto the
 * `Movies` / `Ratings` / `Users` case classes and converted to DataFrames.
 */
object Test {

  /** Number of rows printed by every ranking. */
  private val TopN = 10

  /** Directory containing the MovieLens `.dat` files, relative to the working directory. */
  private val DataDir = "src/file/ml-1m"

  /**
   * Runs the four reports and prints each result to stdout.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .master("local[2]")
      .appName(this.getClass.getName)
      .getOrCreate()
    import session.implicits._
    import org.apache.spark.sql.functions.avg

    try {
      val sc = session.sparkContext

      val movies: DataFrame = sc.textFile(s"$DataDir/movies.dat")
        .map(_.split("::"))
        .map(f => Movies(f(0), f(1), f(2)))
        .toDF()

      // Cached because several of the reports below aggregate over the ratings.
      val ratings: DataFrame = sc.textFile(s"$DataDir/ratings.dat")
        .map(_.split("::"))
        .map(f => Ratings(f(0).toInt, f(1), f(2).toDouble, f(3)))
        .toDF()
        .cache()

      val users: DataFrame = sc.textFile(s"$DataDir/users.dat")
        .map(_.split("::"))
        .map(f => Users(f(0).toInt, f(1), f(2).toInt, f(3).toInt, f(4)))
        .toDF()

      // Report 1: the 10 movies most watched by men in the "18-24" age group.
      // MovieLens 1M stores age as a bracket code: 18 stands for "18-24" and 25 for "25-34".
      // A range test such as 18 <= Age <= 30 would therefore also include the 25-34 bracket,
      // so the bracket code is matched exactly.
      val youngMen = users.filter($"Age" === 18 && $"Gender" === "M").select("UserID")
      val youngMenTop = ratings.join(youngMen, Seq("UserID"))
        .groupBy("MovieID")
        .count()
        .orderBy($"count".desc)
        .limit(TopN)
      showTitles(youngMenTop, movies, "count")

      // Report 2: the 10 movies with the highest average rating.
      // NOTE(review): no minimum number of ratings is required, so a movie with a single
      // 5-star rating can rank first — confirm whether a threshold is wanted.
      val bestRated = ratings.groupBy("MovieID")
        .agg(avg("Rating").as("avgScore"))
        .orderBy($"avgScore".desc)
        .limit(TopN)
      showTitles(bestRated, movies, "avgScore")

      // Report 3: the 10 users who rated the most movies.
      ratings.groupBy("UserID")
        .count()
        .orderBy($"count".desc)
        .limit(TopN)
        .show()

      // Report 4: the 10 movies most watched by men.
      val men = users.filter($"Gender" === "M").select("UserID")
      val menTop = men.join(ratings, Seq("UserID"))
        .groupBy("MovieID")
        .count()
        .withColumnRenamed("count", "manCount")
        .orderBy($"manCount".desc)
        .limit(TopN)
      showTitles(menTop, movies, "manCount")
    } finally {
      // Release the local Spark context even when a report fails.
      session.stop()
    }
  }

  /**
   * Prints the titles of an already limited per-movie ranking, best first.
   *
   * The ranking is re-sorted after the join because a join does not guarantee that
   * the row order of its inputs is preserved.
   *
   * @param ranked   DataFrame with a `MovieID` column and the ranking column `orderCol`
   * @param movies   DataFrame with `MovieID` and `Title` columns
   * @param orderCol name of the column in `ranked` to sort by, descending
   */
  private def showTitles(ranked: DataFrame, movies: DataFrame, orderCol: String): Unit =
    ranked.join(movies, Seq("MovieID"))
      .orderBy(ranked(orderCol).desc)
      .select("Title")
      .show(TopN, truncate = false)
}