题目:
(1)
// 1) UV (count of distinct visitors) per shop.
// `val` instead of `var`: the query string is never reassigned.
val sql =
  """
    |select shop_name,count(distinct user_id)
    |from t_shop
    |group by shop_name
    |""".stripMargin
(2)
/*var sql =
"""
|select shop_name,user_id,count(*)
|from t_shop
|group by shop_name,user_id
|//as ct
|""".stripMargin*/
//2) Rank each shop's per-user visit counts -> table t2
/* var sql =
"""
|select shop_name,user_id,
|rank() over(partition by shop_name order by ct) rk
|from t1
|as t2
|""".stripMargin*/
//3) Keep only the top three ranks per shop
/* var sql =
"""
|select shop_name,user_id,ct
|from t2
|where rk<=3
|""".stripMargin
*/
//4) Integrated query: for each shop, the top-3 visitors by visit count.
//   BUG FIX: the window must order by ct DESC — with the default ascending
//   order, rank 1..3 selects the LEAST-visited users (bottom 3), not the top 3.
val sql =
  """
    |select shop_name,user_id,ct
    |from(
    |  select shop_name,user_id,ct,rank() over(partition by shop_name order by ct desc) rk
    |  from
    |  (
    |   select shop_name,user_id,count(*) ct
    |   from t_shop
    |   group by shop_name,user_id
    |  )t1
    | )t2
    |where rk<=3
    |""".stripMargin
val result: DataFrame = spark.sql(sql)
result.show()
Complete assignment code
/**
 * Homework driver: reads a shop-visit log from HDFS and answers, via Spark SQL,
 * "for each shop, which three visitors visited it the most often?"
 *
 * Input format: one visit per line, "<user_id><whitespace><shop_name>".
 */
object ShopApplication {
  def main(args: Array[String]): Unit = {
    /* 1. Create the SparkSession (local mode, all available cores). */
    val spark: SparkSession = SparkSession
      .builder()
      .config("spark.app.name", "Shop")
      .config("spark.master", "local[*]")
      .getOrCreate()

    /* 2. Load the raw visit log from HDFS. */
    val rdd: RDD[String] = spark.sparkContext.textFile("hdfs://hadoop10:9000/jd_visit.log")

    /* Parse each line into (user_id, shop_name) and register a temp view for SQL. */
    import spark.implicits._
    val df1: DataFrame = rdd.map(_.split("\\s+"))
      .map(array => (array(0), array(1))).toDF("user_id", "shop_name")
    df1.createTempView("t_shop")

    // Requirement 1: UV (distinct visitors) per shop:
    //   select shop_name, count(distinct user_id) from t_shop group by shop_name
    //
    // Requirement 2: for each shop, the top-3 visitors by visit count.
    // Derivation:
    //   t1: visit count per (shop, user)   -> group by shop_name, user_id
    //   t2: rank counts within each shop   -> rank() over(partition by shop_name order by ct desc)
    //   final: keep ranks 1..3
    //
    // BUG FIX: the window must order by ct DESC — with the default ascending
    // order, rk<=3 selects the LEAST-visited users (bottom 3), not the top 3.
    // `val` instead of `var`: the query string is never reassigned.
    val sql =
      """
        |select shop_name,user_id,ct
        |from(
        |  select shop_name,user_id,ct,rank() over(partition by shop_name order by ct desc) rk
        |  from
        |  (
        |   select shop_name,user_id,count(*) ct
        |   from t_shop
        |   group by shop_name,user_id
        |  )t1
        | )t2
        |where rk<=3
        |""".stripMargin
    val result: DataFrame = spark.sql(sql)
    result.show()

    // Release resources.
    spark.stop()
  }
}