学习致谢
https://www.bilibili.com/video/BV1Xz4y1m7cv?p=63
需求:
对电影评分数据进行统计分析,分别使用DSL编程和SQL编程,获取电影平均分Top10,要求电影的评分次数大于200
代码实现
package sql
import java.util.Properties
import org.apache.spark.sql.{DataFrame, Dataset, SaveMode, SparkSession}
/**
* Author itcast
* Desc 演示使用spark-SQL-实现电影数据分析Top10
* */
object Demo07_MovieDataAnalysis {
  /**
   * Entry point: loads the MovieLens 100k rating file and computes the
   * top-10 movies by average score, restricted to movies rated more than
   * 200 times — once with Spark SQL and once with the DataFrame DSL.
   */
  def main(args: Array[String]): Unit = {
    // TODO 0. Prepare the environment.
    // Keep shuffle partitions small for local testing; tune to cluster size in production.
    val spark = SparkSession.builder().appName("sparksql").master("local[*]")
      .config("spark.sql.shuffle.partitions", "4").getOrCreate()
    val sc = spark.sparkContext
    sc.setLogLevel("WARN")
    import spark.implicits._

    // TODO 1. Load the data: one rating per line.
    // NOTE(review): path repeats "input/input" — confirm against the actual data layout.
    val ds: Dataset[String] = spark.read.textFile("data/input/input/rating_100k.data")

    // TODO 2. Parse each line into (movieId, score).
    // MovieLens u.data format: userId \t movieId \t score \t timestamp
    val movieDF: DataFrame = ds.map(line => {
      val arr: Array[String] = line.split("\t")
      (arr(1), arr(2).toInt)
    }).toDF("movieId", "score")
    movieDF.printSchema()
    movieDF.show()

    // Requirement: top-10 average scores among movies with more than 200 ratings.

    // TODO ======= SQL version =======
    movieDF.createOrReplaceTempView("t_movies")
    // BUGFIX: the original query began with "selet", a syntax error that would
    // make spark.sql() throw a ParseException at runtime.
    val sql: String =
      """
        |select movieId, avg(score) as avgscore, count(*) as counts
        |from t_movies
        |group by movieId
        |having counts > 200
        |order by avgscore desc
        |limit 10
        |""".stripMargin
    spark.sql(sql).show()

    // TODO ======= DSL version =======
    import org.apache.spark.sql.functions._
    // BUGFIX: the original built this pipeline but never triggered an action,
    // so the DSL result was silently discarded; show() materializes and prints it.
    movieDF.groupBy('movieId)
      .agg(
        avg('score) as "avgscore",
        count("movieId") as "counts"
      )
      .filter('counts > 200)
      .orderBy('avgscore.desc)
      .limit(10)
      .show()

    // TODO 3. Output: results are printed via show() above.
    // TODO 4. Release resources.
    spark.close()
  }
}