公司代码,年度,1月-------------------------12月的收入金额 burk,year,tsl01,tsl02,tsl03,tsl04,tsl05,tsl06,tsl07,tsl08,tsl09,tsl10,tsl11,tsl12 853101,2010,100200,25002,19440,20550,14990,17227,40990,28778,19088,29889,10990,20990 853101,2011,19446,20556,14996,17233,40996,28784,19094,28779,19089,29890,10991,20991 853101,2012,19447,20557,14997,17234,20560,15000,17237,28780,19090,29891,10992,20992 853101,2013,20560,15000,17237,41000,17234,20560,15000,17237,41000,29892,10993,20993 853101,2014,19449,20559,14999,17236,41000,28788,28786,19096,29897,41000,28788,20994 853101,2015,100205,25007,19445,20555,17236,40999,28787,19097,29898,29894,10995,20995 853101,2016,100206,25008,19446,20556,17237,41000,28788,19098,29899,29895,10996,20996 853101,2017,100207,25009,17234,20560,15000,17237,41000,15000,17237,41000,28788,20997 853101,2018,100208,25010,41000,28788,28786,19096,29897,28786,19096,29897,10998,20998 853101,2019,100209,25011,17236,40999,28787,19097,29898,28787,19097,29898,10999,20999 846271,2010,100210,25012,17237,41000,28788,19098,29899,28788,19098,29899,11000,21000 846271,2011,100211,25013,19451,20561,15001,17238,41001,28789,19099,29900,11001,21001 846271,2012,100212,100213,20190,6484,46495,86506,126518,166529,206540,246551,286562,326573 846271,2013,100213,100214,21297,5008,44466,83924,123382,162839,202297,241755,281213,320671 846271,2014,100214,100215,22405,3531,42436,81341,120245,159150,198055,236959,275864,314769 846271,2015,100215,100216,23512,2055,19096,29897,28786,19096,29897,41000,29892,308866 846271,2016,100216,100217,24620,579,38377,76175,28788,28786,19096,29897,41000,302964 846271,2017,100217,100218,25727,898,36347,73592,40999,28787,19097,29898,29894,297062 846271,2018,100218,100219,26835,2374,34318,71009,41000,28788,19098,29899,29895,291159 846271,2019,100219,100220,27942,3850,32288,68427,17237,41000,15000,17237,41000,285257 1、统计每个公司每年按月累计收入 行转列 --> sum窗口函数 输出结果 公司代码,年度,月份,当月收入,累计收入 2、统计每个公司当月比上年同期增长率 行转列 --> lag窗口函数 公司代码,年度,月度,增长率(当月收入/上年当月收入 - 1)
package com.shujia.sql
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{Column, DataFrame, SparkSession}
object Demo07Burks {
def main(args: Array[String]): Unit = {
//创建SparkSession
val spark: SparkSession = SparkSession
.builder()
.appName("Demo07Burks")
.master("local")
.config("spark.sql.shuffle.partitions",2)//默认200
.getOrCreate()
//导入function以及隐式转换
import org.apache.spark.sql.functions._
import spark.implicits._
//读取数据
val burksDF: DataFrame = spark
.read
.format("csv")
.option("sep", ",")
.schema("burk String,year String,tsl01 String,tsl02 String,tsl03 String,tsl04 String,tsl05 String,tsl06 String,tsl07 String,tsl08 String,tsl09 String,tsl10 String,tsl11 String,tsl12 String")
.load("Spark/data/burks.txt")
burksDF.cache()
burksDF.createOrReplaceTempView("burks")
burksDF.show()
//1、统计每个公司每年按月累计收入
//SQL的方式
spark
.sql(
"""
|SELECT t1.burk
| ,t1.year
| ,t1.month
| ,t1.amount
| ,sum(t1.amount) over(partition by t1.burk,t1.year order by t1.month) as sum_amount
|from(
| SELECT burk
| ,year
| ,month
| ,amount
| from burks lateral view explode(map(1,tsl01,2,tsl02,3,tsl03,4,tsl04,5,tsl05,6,tsl06,7,tsl07,8,tsl08,9,tsl09,10,tsl10,11,tsl11,12,tsl12)) v1 as month,amount
|) t1
|""".stripMargin)
.show(50)
val col: Column = map(
expr("1"), $"tsl01"
, expr("2"), $"tsl02"
, expr("3"), $"tsl03"
, expr("4"), $"tsl04"
, expr("5"), $"tsl05"
, expr("6"), $"tsl06"
, expr("7"), $"tsl07"
, expr("8"), $"tsl08"
, expr("9"), $"tsl09"
, expr("10"), $"tsl10"
, expr("11"), $"tsl11"
, expr("12"), $"tsl12"
)
// DSL的方式
burksDF
.select($"burk"
, $"year"
, explode(col) as Array("month", "amount"))
.select($"burk", $"year", $"month", $"amount",
sum($"amount") over Window.partitionBy($"burk", $"year").orderBy($"month") as "sum_amount")
.show()
// 2、统计每个公司当月比上年同期增长率
// SQL的方式
spark.sql(
"""
|select tt1.burk
| ,tt1.year
| ,tt1.month
| ,tt1.amount
| ,tt1.last_amount
| ,round(tt1.amount / tt1.last_amount,8) -1 as incr
|from (
| SELECT t1.burk
| ,t1.year
| ,t1.month
| ,t1.amount
| ,lag(t1.amount,1) over(partition by burk,month order by year ) as last_amount
| from (
| SELECT burk
| ,year
| ,month
| ,amount
| from burks lateral view explode(map(1,tsl01,2,tsl02,3,tsl03,4,tsl04,5,tsl05,6,tsl06,7,tsl07,8,tsl08,9,tsl09,10,tsl10,11,tsl11,12,tsl12)) v1 as month,amount
| ) t1
|) tt1
|""".stripMargin)
.show()
// DSL
burksDF
.select($"burk"
, $"year"
, explode(col) as Array("month", "amount"))
.select($"burk", $"year", $"month", $"amount",
lag($"amount", 1) over Window.partitionBy($"burk", $"month").orderBy($"year") as "last_amount")
.select($"burk", $"year", $"month", $"amount", $"last_amount", round($"amount" / $"last_amount", 8) - 1 as "incr")
.show()
}
}
+------+----+------+------+-----+-----+-----+-----+------+------+------+------+------+------+
| burk|year| tsl01| tsl02|tsl03|tsl04|tsl05|tsl06| tsl07| tsl08| tsl09| tsl10| tsl11| tsl12|
+------+----+------+------+-----+-----+-----+-----+------+------+------+------+------+------+
|853101|2010|100200| 25002|19440|20550|14990|17227| 40990| 28778| 19088| 29889| 10990| 20990|
|853101|2011| 19446| 20556|14996|17233|40996|28784| 19094| 28779| 19089| 29890| 10991| 20991|
|853101|2012| 19447| 20557|14997|17234|20560|15000| 17237| 28780| 19090| 29891| 10992| 20992|
|853101|2013| 20560| 15000|17237|41000|17234|20560| 15000| 17237| 41000| 29892| 10993| 20993|
|853101|2014| 19449| 20559|14999|17236|41000|28788| 28786| 19096| 29897| 41000| 28788| 20994|
|853101|2015|100205| 25007|19445|20555|17236|40999| 28787| 19097| 29898| 29894| 10995| 20995|
|853101|2016|100206| 25008|19446|20556|17237|41000| 28788| 19098| 29899| 29895| 10996| 20996|
|853101|2017|100207| 25009|17234|20560|15000|17237| 41000| 15000| 17237| 41000| 28788| 20997|
|853101|2018|100208| 25010|41000|28788|28786|19096| 29897| 28786| 19096| 29897| 10998| 20998|
|853101|2019|100209| 25011|17236|40999|28787|19097| 29898| 28787| 19097| 29898| 10999| 20999|
|846271|2010|100210| 25012|17237|41000|28788|19098| 29899| 28788| 19098| 29899| 11000| 21000|
|846271|2011|100211| 25013|19451|20561|15001|17238| 41001| 28789| 19099| 29900| 11001| 21001|
|846271|2012|100212|100213|20190| 6484|46495|86506|126518|166529|206540|246551|286562|326573|
|846271|2013|100213|100214|21297| 5008|44466|83924|123382|162839|202297|241755|281213|320671|
|846271|2014|100214|100215|22405| 3531|42436|81341|120245|159150|198055|236959|275864|314769|
|846271|2015|100215|100216|23512| 2055|19096|29897| 28786| 19096| 29897| 41000| 29892|308866|
|846271|2016|100216|100217|24620| 579|38377|76175| 28788| 28786| 19096| 29897| 41000|302964|
|846271|2017|100217|100218|25727| 898|36347|73592| 40999| 28787| 19097| 29898| 29894|297062|
|846271|2018|100218|100219|26835| 2374|34318|71009| 41000| 28788| 19098| 29899| 29895|291159|
|846271|2019|100219|100220|27942| 3850|32288|68427| 17237| 41000| 15000| 17237| 41000|285257|
+------+----+------+------+-----+-----+-----+-----+------+------+------+------+------+------+
+------+----+-----+------+----------+
| burk|year|month|amount|sum_amount|
+------+----+-----+------+----------+
|846271|2012| 1|100212| 100212.0|
|846271|2012| 2|100213| 200425.0|
|846271|2012| 3| 20190| 220615.0|
|846271|2012| 4| 6484| 227099.0|
|846271|2012| 5| 46495| 273594.0|
|846271|2012| 6| 86506| 360100.0|
|846271|2012| 7|126518| 486618.0|
|846271|2012| 8|166529| 653147.0|
|846271|2012| 9|206540| 859687.0|
|846271|2012| 10|246551| 1106238.0|
|846271|2012| 11|286562| 1392800.0|
|846271|2012| 12|326573| 1719373.0|
|846271|2014| 1|100214| 100214.0|
|846271|2014| 2|100215| 200429.0|
|846271|2014| 3| 22405| 222834.0|
|846271|2014| 4| 3531| 226365.0|
|846271|2014| 5| 42436| 268801.0|
|846271|2014| 6| 81341| 350142.0|
|846271|2014| 7|120245| 470387.0|
|846271|2014| 8|159150| 629537.0|
|846271|2014| 9|198055| 827592.0|
|846271|2014| 10|236959| 1064551.0|
|846271|2014| 11|275864| 1340415.0|
|846271|2014| 12|314769| 1655184.0|
|846271|2015| 1|100215| 100215.0|
|846271|2015| 2|100216| 200431.0|
|846271|2015| 3| 23512| 223943.0|
|846271|2015| 4| 2055| 225998.0|
|846271|2015| 5| 19096| 245094.0|
|846271|2015| 6| 29897| 274991.0|
|846271|2015| 7| 28786| 303777.0|
|846271|2015| 8| 19096| 322873.0|
|846271|2015| 9| 29897| 352770.0|
|846271|2015| 10| 41000| 393770.0|
|846271|2015| 11| 29892| 423662.0|
|846271|2015| 12|308866| 732528.0|
|846271|2016| 1|100216| 100216.0|
|846271|2016| 2|100217| 200433.0|
|846271|2016| 3| 24620| 225053.0|
|846271|2016| 4| 579| 225632.0|
|846271|2016| 5| 38377| 264009.0|
|846271|2016| 6| 76175| 340184.0|
|846271|2016| 7| 28788| 368972.0|
|846271|2016| 8| 28786| 397758.0|
|846271|2016| 9| 19096| 416854.0|
|846271|2016| 10| 29897| 446751.0|
|846271|2016| 11| 41000| 487751.0|
|846271|2016| 12|302964| 790715.0|
|853101|2010| 1|100200| 100200.0|
|853101|2010| 2| 25002| 125202.0|
+------+----+-----+------+----------+
only showing top 50 rows
+------+----+-----+------+----------+
| burk|year|month|amount|sum_amount|
+------+----+-----+------+----------+
|846271|2012| 1|100212| 100212.0|
|846271|2012| 2|100213| 200425.0|
|846271|2012| 3| 20190| 220615.0|
|846271|2012| 4| 6484| 227099.0|
|846271|2012| 5| 46495| 273594.0|
|846271|2012| 6| 86506| 360100.0|
|846271|2012| 7|126518| 486618.0|
|846271|2012| 8|166529| 653147.0|
|846271|2012| 9|206540| 859687.0|
|846271|2012| 10|246551| 1106238.0|
|846271|2012| 11|286562| 1392800.0|
|846271|2012| 12|326573| 1719373.0|
|846271|2014| 1|100214| 100214.0|
|846271|2014| 2|100215| 200429.0|
|846271|2014| 3| 22405| 222834.0|
|846271|2014| 4| 3531| 226365.0|
|846271|2014| 5| 42436| 268801.0|
|846271|2014| 6| 81341| 350142.0|
|846271|2014| 7|120245| 470387.0|
|846271|2014| 8|159150| 629537.0|
+------+----+-----+------+----------+
only showing top 20 rows
+------+----+-----+------+-----------+--------------------+
| burk|year|month|amount|last_amount| incr|
+------+----+-----+------+-----------+--------------------+
|846271|2010| 1|100210| null| null|
|846271|2011| 1|100211| 100210|9.979999999965017E-6|
|846271|2012| 1|100212| 100211|9.979999999965017E-6|
|846271|2013| 1|100213| 100212|9.979999999965017E-6|
|846271|2014| 1|100214| 100213|9.979999999965017E-6|
|846271|2015| 1|100215| 100214|9.979999999965017E-6|
|846271|2016| 1|100216| 100215|9.979999999965017E-6|
|846271|2017| 1|100217| 100216|9.979999999965017E-6|
|846271|2018| 1|100218| 100217|9.979999999965017E-6|
|846271|2019| 1|100219| 100218|9.979999999965017E-6|
|846271|2010| 3| 17237| null| null|
|846271|2011| 3| 19451| 17237| 0.12844462|
|846271|2012| 3| 20190| 19451|0.037992910000000046|
|846271|2013| 3| 21297| 20190| 0.05482911999999995|
|846271|2014| 3| 22405| 21297| 0.0520261099999999|
|846271|2015| 3| 23512| 22405|0.049408609999999964|
|846271|2016| 3| 24620| 23512|0.047124869999999985|
|846271|2017| 3| 25727| 24620| 0.04496344000000008|
|846271|2018| 3| 26835| 25727| 0.04306758999999993|
|846271|2019| 3| 27942| 26835| 0.0412520999999999|
+------+----+-----+------+-----------+--------------------+
only showing top 20 rows
+------+----+-----+------+-----------+--------------------+
| burk|year|month|amount|last_amount| incr|
+------+----+-----+------+-----------+--------------------+
|846271|2010| 1|100210| null| null|
|846271|2011| 1|100211| 100210|9.979999999965017E-6|
|846271|2012| 1|100212| 100211|9.979999999965017E-6|
|846271|2013| 1|100213| 100212|9.979999999965017E-6|
|846271|2014| 1|100214| 100213|9.979999999965017E-6|
|846271|2015| 1|100215| 100214|9.979999999965017E-6|
|846271|2016| 1|100216| 100215|9.979999999965017E-6|
|846271|2017| 1|100217| 100216|9.979999999965017E-6|
|846271|2018| 1|100218| 100217|9.979999999965017E-6|
|846271|2019| 1|100219| 100218|9.979999999965017E-6|
|846271|2010| 3| 17237| null| null|
|846271|2011| 3| 19451| 17237| 0.12844462|
|846271|2012| 3| 20190| 19451|0.037992910000000046|
|846271|2013| 3| 21297| 20190| 0.05482911999999995|
|846271|2014| 3| 22405| 21297| 0.0520261099999999|
|846271|2015| 3| 23512| 22405|0.049408609999999964|
|846271|2016| 3| 24620| 23512|0.047124869999999985|
|846271|2017| 3| 25727| 24620| 0.04496344000000008|
|846271|2018| 3| 26835| 25727| 0.04306758999999993|
|846271|2019| 3| 27942| 26835| 0.0412520999999999|
+------+----+-----+------+-----------+--------------------+
only showing top 20 rows
题目1: 京东上会有很多店铺,每个顾客访客访问任何一个店铺的任何一个商品时都会产生一条访问日志, 访客的用户id为user_id 被访问的店铺名称为shop user_id shop u1 a u2 b u1 b u1 a u3 c u4 b u1 a u2 c u5 b u4 b u6 c u2 c u1 b u2 a u2 a u3 a u5 a u6 a 请统计:(使用DSL完成) 1)每个店铺的UV(访客数),输出店铺名称、访客数 2)每个店铺访问次数top3的访客信息,输出店铺名称、访客id、访问次数 题目2: 有如下的用户访问数据: userID visitDate visitCount u01 2022/01/21 5 u02 2022/01/23 6 u03 2022/01/22 8 u04 2022/01/20 3 u01 2022/01/23 6 u02 2022/01/23 6 u01 2022/02/21 8 u01 2022/02/22 4 u02 2022/02/22 6 u03 2022/02/23 9 u04 2022/02/20 5 u01 2022/03/21 5 u02 2022/03/23 6 u03 2022/03/22 6 u03 2022/03/24 9 u04 2022/04/20 5 u01 2022/04/21 2 u02 2022/04/23 3 u03 2022/04/22 4 u04 2022/05/20 7 要求: 使用Spark SQL以及DSL两种方式,统计出每个用户每个月的访问次数以及累计访问次数 输出格式如下所示: 用户 月份 小计 累计 u01 2022-01 11 11 u01 2022-02 12 23 ...... 解释: 小计为单月访问次数,累计为在原有单月访问次数基础上累加 将计算结果写入到mysql的表中,自己设计对应的表结构
package com.shujia.sql
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
object Demo08JdLogAnalysis {
def main(args: Array[String]): Unit = {
// 创建SparkSession
val spark: SparkSession = SparkSession
.builder()
.appName("Demo08JdLogAnalysis")
.master("local")
.config("spark.sql.shuffle.partitions", 1) // 默认200
.getOrCreate()
import org.apache.spark.sql.functions._
import spark.implicits._
// 读取jd_logs.txt数据
val jdLogsDF: DataFrame = spark
.read
.format("csv")
.option("sep", " ")
.schema("user_id String,shop String")
.load("Spark/data/jd_logs.txt")
// jdLogsDF.show()
/**
* 请统计:(使用DSL完成)
* 1)每个店铺的UV(访客数),输出店铺名称、访客数
* 2)每个店铺访问次数top3的访客信息,输出店铺名称、访客id、访问次数
*/
// 1)每个店铺的UV(访客数),输出店铺名称、访客数
jdLogsDF
.groupBy($"shop")
.agg(countDistinct($"user_id") as "cnt")
.show()
// 2)每个店铺访问次数top3的访客信息,输出店铺名称、访客id、访问次数
jdLogsDF
.groupBy($"shop", $"user_id")
.agg(count("*") as "cnt")
.select($"shop", $"user_id", $"cnt",
row_number() over Window.partitionBy($"shop").orderBy($"cnt".desc) as "rn")
.where($"rn" <= 3)
.select($"shop", $"user_id", $"cnt", $"rn")
.show()
// 读取jd_user_visit_cnt.txt数据
val jdUserVisitCntDF: DataFrame = spark
.read
.format("csv")
.option("sep", " ")
.schema("user_id String,date String,cnt Int")
.load("Spark/data/jd_user_visit_cnt.txt")
jdUserVisitCntDF.show()
// DSL
// 统计出每个用户每个月的访问次数以及累计访问次数
jdUserVisitCntDF
.select($"user_id", $"date".substr(6, 2) as "month", $"cnt")
.groupBy($"user_id", $"month")
.agg(sum($"cnt") as "total_cnt")
.select($"user_id", $"month", $"total_cnt",
sum($"total_cnt") over Window.partitionBy($"user_id").orderBy($"month") as "sum_cnt")
.write
.format("jdbc")
.option("truncate", "true") // 防止表结构发生改变 默认采取drop 加 create table 进行overwrite
.option("url", "jdbc:mysql://rm-bp1h7v927zia3t8iwho.mysql.rds.aliyuncs.com:3306/stu016?useSSL=false")
.option("dbtable", "jd_user_cnt")
.option("user", "shujia016")
.option("password", "123456")
.mode(SaveMode.Overwrite)
.save()
}
}
+----+---+
|shop|cnt|
+----+---+
| a| 5|
| b| 4|
| c| 3|
+----+---+
+----+-------+---+---+
|shop|user_id|cnt| rn|
+----+-------+---+---+
| a| u1| 3| 1|
| a| u2| 2| 2|
| a| u3| 1| 3|
| b| u1| 2| 1|
| b| u4| 2| 2|
| b| u2| 1| 3|
| c| u2| 2| 1|
| c| u3| 1| 2|
| c| u6| 1| 3|
+----+-------+---+---+
+-------+----------+---+
|user_id| date|cnt|
+-------+----------+---+
| u01|2022/01/21| 5|
| u02|2022/01/23| 6|
| u03|2022/01/22| 8|
| u04|2022/01/20| 3|
| u01|2022/01/23| 6|
| u02|2022/01/23| 6|
| u01|2022/02/21| 8|
| u01|2022/02/22| 4|
| u02|2022/02/22| 6|
| u03|2022/02/23| 9|
| u04|2022/02/20| 5|
| u01|2022/03/21| 5|
| u02|2022/03/23| 6|
| u03|2022/03/22| 6|
| u03|2022/03/24| 9|
| u04|2022/04/20| 5|
| u01|2022/04/21| 2|
| u02|2022/04/23| 3|
| u03|2022/04/22| 4|
| u04|2022/05/20| 7|
+-------+----------+---+