第1关:统计撤销订单中撤销理由最多的前10种理由
任务描述
-
使用 Spark 统计撤销订单中撤销理由最多的前 10 种理由(因撤销理由为
未知
的数据过多,统计时不包含撤销理由值未知
的数据)。数据集所在位置:/data/workspace/myshixun/data/canceldata.txt
,数据集文件字段之间以|
分割,文件部分数据展示如下:1200DDCX3307|430104|湖南省长沙市岳麓区|17625076885092|2019-03-07 17:32:27|2019-03-07 17:38:33|2|5|未知 1100YDYC423D|430602|湖南省岳阳市岳阳楼区|6665578474529331090|2019-03-07 17:28:46|2019-03-07 17:29:09|1|1|第三方接口取消 shouyue|430100|湖南省长沙市|P190307171256186000|2019-03-07 17:12:55|2019-03-07 17:13:48|1|1|点击下单120S内没有筛选到司机时, 乘客手动点击取消订单
-
将统计结果存放在 MySQL 数据库
mydb
的cancelreason
表中(表已经提前创建)。
/********** Begin **********/
// Task 1: top-10 cancellation reasons, excluding the over-represented placeholder
// value "未知" (unknown), written to MySQL table mydb.cancelreason.
// NOTE: renamed the dataset from "moviesData" (a copy-paste leftover from another
// exercise) to "cancelData", matching the naming used by the other tasks in this file.
Dataset<Row> cancelData = spark.read()
        .option("delimiter", "|")
        .csv("/data/workspace/myshixun/data/canceldata.txt")
        .toDF("companyid", "address", "districtname", "orderid", "ordertime",
              "canceltime", "operator", "canceltypecode", "cancelreason");
cancelData.registerTempTable("data");
// Count per reason, drop the placeholder, keep the 10 most frequent, append to MySQL.
spark.sql("select cancelreason,count(*) num from data where cancelreason!='未知' group by cancelreason order by num desc limit 10").write()
        .format("jdbc")
        .option("url", "jdbc:mysql://127.0.0.1:3306/mydb?useUnicode=true&characterEncoding=utf-8")
        .option("dbtable", "cancelreason")
        .option("user", "root")
        .option("password", "123123")
        .mode(SaveMode.Append)
        .save();
/********** End **********/
第2关:查询出成功订单最多的10个地区名
任务描述
-
使用 Spark 统计成功订单最多的 10 个行政区名;数据集所在位置:
/data/workspace/myshixun/data/createdata.txt
,数据集文件字段之间以\t
分割,文件部分数据展示如下:1200DDCX3307 431081 湖南省郴州市资兴市 17625036018008 2019-03-07 07:32:20 2019-03-07 07:32:20 S213(旧)|威狮轮胎 113.247606 25.968607 资兴市.|唐洞加油站 113.251180 25.979303 1200DDCX3307 430111 湖南省长沙市雨花区 17625036099910 2019-03-07 07:31:39 2019-03-07 07:31:39 嘉盛华庭3期(西2门) 113.032280 28.162031 长沙东站树木岭货场 113.010107 28.166197 1200DDCX3307 431122 湖南省永州市东安县 35194606833503 2019-03-07 07:32:05 2019-03-07 07:32:06 东安大道.|潇湘第一城南侧 111.327802 26.391911 东安县.人力资源和社会保障局 111.317184 26.395052
-
将统计结果存放在 MySQL 数据库
mydb
的order_district
表中(表已经提前创建)。
/********** Begin **********/
// Task 2: the 10 districts with the most successful orders, written to mydb.order_district.
Dataset<Row> createData = spark.read()
        .option("delimiter", "\t")
        .csv("/data/workspace/myshixun/data/createdata.txt")
        .toDF("companyid", "address", "districtname", "orderid", "departtime", "ordertime",
              "departure", "deplongitude", "deplatitude", "destination", "destlongitude", "destlatitude");
createData.registerTempTable("data");
// Aggregate per district in SQL, then push the top-10 straight to MySQL over JDBC.
Dataset<Row> topDistricts = spark.sql("select districtname,count(*) num from data group by districtname order by num desc limit 10");
topDistricts.write()
        .format("jdbc")
        .option("url", "jdbc:mysql://127.0.0.1:3306/mydb?useUnicode=true&characterEncoding=utf-8")
        .option("dbtable", "order_district")
        .option("user", "root")
        .option("password", "123123")
        .mode(SaveMode.Append)
        .save();
/********** End **********/
第3关:查询订单线路中出行次数最多的五条线路
任务描述
-
使用 Spark 统计成功订单线路中出行次数最多的五条线路。数据集所在位置:
/data/workspace/myshixun/data/createdata.txt
,数据集文件字段之间以\t
分割,文件部分数据展示如下:1200DDCX3307 431081 湖南省郴州市资兴市 17625036018008 2019-03-07 07:32:20 2019-03-07 07:32:20 S213(旧)|威狮轮胎 113.247606 25.968607 资兴市.|唐洞加油站 113.251180 25.979303 1200DDCX3307 430111 湖南省长沙市雨花区 17625036099910 2019-03-07 07:31:39 2019-03-07 07:31:39 嘉盛华庭3期(西2门) 113.032280 28.162031 长沙东站树木岭货场 113.010107 28.166197 1200DDCX3307 431122 湖南省永州市东安县 35194606833503 2019-03-07 07:32:05 2019-03-07 07:32:06 东安大道.|潇湘第一城南侧 111.327802 26.391911 东安县.人力资源和社会保障局 111.317184 26.395052
-
将统计结果存放在 MySQL 数据库
mydb
的orderline
表中(表已经提前创建)。
/********** Begin **********/
// Task 3: the 5 busiest lines (direction-insensitive departure/destination pairs),
// together with their most frequent coordinate variant, written to mydb.orderline.
Dataset<Row> lineData = spark.read()
        .option("delimiter", "\t")
        .csv("/data/workspace/myshixun/data/createdata.txt")
        .toDF("companyid", "address", "districtname", "orderid", "departtime", "ordertime",
              "departure", "deplongitude", "deplatitude", "destination", "destlongitude", "destlatitude");
lineData.registerTempTable("data");
// UDF: canonicalize a '*'-joined key so "A*B" and "B*A" map to the same string.
// Accepts "dep*dest" (2 tokens) or "dep*dest*deplon*deplat*destlon*destlat"
// (6 tokens); when the endpoints are swapped, their coordinate pairs swap with them.
// Any other token count yields "".
// FIX: the original compared tokens [0] and [1] BEFORE checking the length, so any
// input with fewer than 2 tokens (e.g. an empty endpoint) threw
// ArrayIndexOutOfBoundsException; it also re-split the string on every access.
spark.udf().register("compare", (UDF1<String, String>) s -> {
    String[] p = s.split("\\*");
    String result = "";
    if (p.length == 2) {
        if (p[0].compareTo(p[1]) >= 0) {
            result = p[0] + "*" + p[1];
        } else {
            result = p[1] + "*" + p[0];
        }
    } else if (p.length == 6) {
        if (p[0].compareTo(p[1]) >= 0) {
            result = p[0] + "*" + p[1] + "*" + p[2] + "*" + p[3] + "*" + p[4] + "*" + p[5];
        } else {
            // Swapped endpoints: coordinates [4],[5] belong to the new first endpoint.
            result = p[1] + "*" + p[0] + "*" + p[4] + "*" + p[5] + "*" + p[2] + "*" + p[3];
        }
    }
    return result;
}, DataTypes.StringType);
// t1: top-5 canonical lines by trip count.
spark.sql("select compare(concat_ws('*',departure,destination))line,count(*) num from data where departure is not null and destination is not null group by compare(concat_ws('*',departure,destination)) order by num desc limit 5")
        .registerTempTable("t1");
// t2: every canonical line, its full canonical key with coordinates (bb), and the
// count of each coordinate variant of that line.
spark.sql("select concat_ws('*',split(compare(concat_ws('*',departure,destination,deplongitude,deplatitude,destlongitude,destlatitude)),'[*]')[0],split(compare(concat_ws('*',departure,destination,deplongitude,deplatitude,destlongitude,destlatitude)),'[*]')[1])line,compare(concat_ws('*',departure,destination,deplongitude,deplatitude,destlongitude,destlatitude)) bb,count(*) num from data where departure is not null and destination is not null group by compare(concat_ws('*',departure,destination,deplongitude,deplatitude,destlongitude,destlatitude)) order by num desc").registerTempTable("t2");
// For each top-5 line pick its most frequent coordinate variant (rank=1) and unpack
// the canonical key back into individual columns for the MySQL table.
spark.sql("select split(bb,'[*]')[0] departure,split(bb,'[*]')[2] deplongitude,split(bb,'[*]')[3] deplatitude,split(bb,'[*]')[1] destination,split(bb,'[*]')[4] destlongitude,split(bb,'[*]')[5] destlatitude,num from(select t1.line,t2.bb,t2.num count,t1.num, Row_Number() OVER (partition by t1.line ORDER BY t2.num desc) rank from t1 left join t2 on t1.line = t2.line order by t1.num desc) where rank=1")
        .write()
        .format("jdbc")
        .option("url", "jdbc:mysql://127.0.0.1:3306/mydb?useUnicode=true&characterEncoding=utf-8")
        .option("dbtable", "orderline")
        .option("user", "root")
        .option("password", "123123")
        .mode(SaveMode.Append)
        .save();
/********** End **********/
第4关:湖南各个市的所有订单总量
任务描述
- 使用 Spark 统计湖南省各个市的所有订单总量(订单总量=撤销订单+成功订单)。例如“湖南省长沙市岳麓区”与“湖南省长沙市雨花区”这些订单都是属于“长沙市”。数据集所在位置:
/data/workspace/myshixun/data/createdata.txt
,数据集文件字段之间以\t
分割,文件部分数据展示如下:1200DDCX3307 431081 湖南省郴州市资兴市 17625036018008 2019-03-07 07:32:20 2019-03-07 07:32:20 S213(旧)|威狮轮胎 113.247606 25.968607 资兴市.|唐洞加油站 113.251180 25.979303 1200DDCX3307 430111 湖南省长沙市雨花区 17625036099910 2019-03-07 07:31:39 2019-03-07 07:31:39 嘉盛华庭3期(西2门) 113.032280 28.162031 长沙东站树木岭货场 113.010107 28.166197 1200DDCX3307 431122 湖南省永州市东安县 35194606833503 2019-03-07 07:32:05 2019-03-07 07:32:06 东安大道.|潇湘第一城南侧 111.327802 26.391911 东安县.人力资源和社会保障局 111.317184 26.395052
/data/workspace/myshixun/data/canceldata.txt
,数据集文件字段之间以|
分割,文件部分数据展示如下:
1200DDCX3307|430104|湖南省长沙市岳麓区|17625076885092|2019-03-07 17:32:27|2019-03-07 17:38:33|2|5|未知
1100YDYC423D|430602|湖南省岳阳市岳阳楼区|6665578474529331090|2019-03-07 17:28:46|2019-03-07 17:29:09|1|1|第三方接口取消
shouyue|430100|湖南省长沙市|P190307171256186000|2019-03-07 17:12:55|2019-03-07 17:13:48|1|1|点击下单120S内没有筛选到司机时, 乘客手动点击取消订单
- 将统计结果存放在 MySQL 数据库
mydb
的orderbycity
表中(表已经提前创建)。
/********** Begin **********/
// Task 4: total order volume (successful + cancelled) per Hunan prefecture-level
// city, written to mydb.orderbycity.
Dataset<Row> orderData = spark.read()
        .option("delimiter", "\t")
        .csv("/data/workspace/myshixun/data/createdata.txt")
        .toDF("companyid", "address", "districtname", "orderid", "departtime", "ordertime",
              "departure", "deplongitude", "deplatitude", "destination", "destlongitude", "destlatitude");
orderData.registerTempTable("data1");
Dataset<Row> cancelData = spark.read()
        .option("delimiter", "|")
        .csv("/data/workspace/myshixun/data/canceldata.txt")
        .toDF("companyid", "address", "districtname", "orderid", "ordertime",
              "canceltime", "operator", "canceltypecode", "cancelreason");
cancelData.registerTempTable("data2");
// UDF: reduce a full district name to its prefecture-level city, e.g.
// "湖南省长沙市岳麓区" -> "湖南省长沙市". Autonomous prefectures end in "自治州" instead of "市".
spark.udf().register("city", (UDF1<String, String>) s -> {
    if (s.contains("自治州")) {
        return s.split("自治州")[0] + "自治州";
    }
    return s.split("市")[0] + "市";
}, DataTypes.StringType);
// Per-city counts for each source. Temp view renamed from "order" (a reserved
// SQL keyword) to "success_cnt" for safety.
spark.sql("select city(districtname) city,count(*) count from data1 where districtname like '湖南省%' group by city(districtname)").registerTempTable("success_cnt");
spark.sql("select city(districtname) city,count(*) count from data2 where districtname like '湖南省%' group by city(districtname)").registerTempTable("cancel_cnt");
// FIX: the original LEFT JOIN computed order.count + cancel.count directly, which
// (a) produced NULL totals for cities with zero cancellations (x + NULL = NULL) and
// (b) dropped cities that appear only in the cancel data. A full join with
// coalesce keeps every city and treats a missing side as 0.
spark.sql("select coalesce(s.city, c.city) city,(coalesce(s.count,0)+coalesce(c.count,0)) num from success_cnt s full join cancel_cnt c on s.city = c.city order by num desc")
        .write()
        .format("jdbc")
        .option("url", "jdbc:mysql://127.0.0.1:3306/mydb?useUnicode=true&characterEncoding=utf-8")
        .option("dbtable", "orderbycity")
        .option("user", "root")
        .option("password", "123123")
        .mode(SaveMode.Append)
        .save();
/********** End **********/
第5关:统计湖南省当天的各时间段订单总数量与各市级当天各时间段订单总数量
任务描述
- 使用 Spark 查询湖南省 2019-03-07 日的每分钟订单总数量(以订单时间 ordertime 为依据),查询出来的时间格式为 “yyyy-MM-dd HH:mm”如“2019-03-07 15:37”,将统计结果存放在 MySQL 数据库
mydb
的order_quantity_time
表中(表已经提前创建);
MySQL 查询的数据格式示例如下:
time num
2019-03-07 00:00 263
2019-03-07 00:01 285
- 使用 Spark 查询湖南省 2019-03-07 日各市级当天每小时订单总数量(以订单时间 ordertime 为依据),并将查询结果存放在 MySQL 数据库
mydb
的order_city_hour
表中。
时间段处理:
时间段 | 对应的数字 |
---|---|
[0点-1点) | 0 |
[1点-2点) | 1 |
[2点-3点) | 2 |
... | ... |
[22点-23点) | 22 |
[23点-24点) | 23 |
MySQL 查询的数据格式示例如下:
hour city num
0 湖南省娄底市 136
数据集所在位置: /data/workspace/myshixun/data/createdata.txt
,数据集文件字段之间以\t
分割,文件部分数据展示如下:
1200DDCX3307 431081 湖南省郴州市资兴市 17625036018008 2019-03-07 07:32:20 2019-03-07 07:32:20 S213(旧)|威狮轮胎 113.247606 25.968607 资兴市.|唐洞加油站 113.251180 25.979303
1200DDCX3307 430111 湖南省长沙市雨花区 17625036099910 2019-03-07 07:31:39 2019-03-07 07:31:39 嘉盛华庭3期(西2门) 113.032280 28.162031 长沙东站树木岭货场 113.010107 28.166197
1200DDCX3307 431122 湖南省永州市东安县 35194606833503 2019-03-07 07:32:05 2019-03-07 07:32:06 东安大道.|潇湘第一城南侧 111.327802 26.391911 东安县.人力资源和社会保障局 111.317184 26.395052
/data/workspace/myshixun/data/canceldata.txt
,数据集文件字段之间以|
分割,文件部分数据展示如下:
1200DDCX3307|430104|湖南省长沙市岳麓区|17625076885092|2019-03-07 17:32:27|2019-03-07 17:38:33|2|5|未知
1100YDYC423D|430602|湖南省岳阳市岳阳楼区|6665578474529331090|2019-03-07 17:28:46|2019-03-07 17:29:09|1|1|第三方接口取消
shouyue|430100|湖南省长沙市|P190307171256186000|2019-03-07 17:12:55|2019-03-07 17:13:48|1|1|点击下单120S内没有筛选到司机时, 乘客手动点击取消订单
/********** Begin **********/
/* Task 5: (a) per-city, per-hour order totals -> mydb.order_city_hour;
   (b) per-minute order totals -> mydb.order_quantity_time.
   Both totals combine successful orders (createdata, tab-delimited) and
   cancelled orders (canceldata, '|'-delimited), keyed on ordertime.
   NOTE(review): there is no explicit date filter — this assumes the data files
   only contain 2019-03-07; confirm before reusing on multi-day data. */
// Successful orders.
Dataset<Row> orderData = spark.read().option("delimiter", "\t").csv("/data/workspace/myshixun/data/createdata.txt")
.toDF("companyid", "address", "districtname", "orderid","departtime", "ordertime", "departure", "deplongitude", "deplatitude", "destination","destlongitude", "destlatitude");
orderData.registerTempTable("data1");
// Cancelled orders.
Dataset<Row> cancelData = spark.read().option("delimiter", "|").csv("/data/workspace/myshixun/data/canceldata.txt")
.toDF("companyid", "address", "districtname", "orderid", "ordertime", "canceltime", "operator", "canceltypecode", "cancelreason");
cancelData.registerTempTable("data2");
// UDF: reduce a full district name to its prefecture-level city, e.g.
// "湖南省长沙市岳麓区" -> "湖南省长沙市"; autonomous prefectures end with "自治州".
spark.udf().register("city", (UDF1<String, String>) s -> {
String city = "";
if (s.contains("自治州")) {
city = s.split("自治州")[0] + "自治州";
} else {
city = s.split("市")[0] + "市";
}
return city;
}, DataTypes.StringType);
// Hourly counts per city: t1 from cancelled orders, t2 from successful orders.
spark.sql("select hour(ordertime) hour,city(districtname)city,count(*) count from data2 where districtname like '湖南省%' group by hour(ordertime),city(districtname) order by hour").registerTempTable("t1");
spark.sql("select hour(ordertime) hour,city(districtname)city,count(*) count from data1 where districtname like '湖南省%' group by hour(ordertime),city(districtname) order by hour").registerTempTable("t2");
// Full outer join on the composite (hour, city) key; the CASE expressions pick the
// non-null side so a pair present in only one source still appears, and both counts
// are summed when present in both.
spark.sql("select (case when t1.hour is null then t2.hour when t2.hour is null then t1.hour else t2.hour end)hour,(case when t1.city is null then t2.city when t2.city is null then t1.city else t2.city end)city,(case when t1.count is null then t2.count when t2.count is null then t1.count else t2.count+t1.count end)num from t1 full join t2 on concat_ws('*',t1.hour,t1.city) = concat_ws('*',t2.hour,t2.city) order by hour,city")
.write()
.format("jdbc")
.option("url", "jdbc:mysql://127.0.0.1:3306/mydb?useUnicode=true&characterEncoding=utf-8")
.option("dbtable", "order_city_hour")
.option("user", "root")
.option("password", "123123")
.mode(SaveMode.Append)
.save();
// Per-minute totals: bucket both sources by "yyyy-MM-dd HH:mm", full outer join on
// the bucket, and sum the two counts null-safely via CASE (a plain sum would be
// NULL whenever a minute appears in only one source).
spark.sql("select (case when t1 is null then t2 when t2 is null then t1 else t2 end) as time ,(case when count1 is null then count2 when count2 is null then count1 else count2+count1 end) as num from(select * from (SELECT DATE_FORMAT(ordertime,'yyyy-MM-dd HH:mm') as t1,count(DATE_FORMAT(ordertime,'yyyy-MM-dd HH:mm')) as count1 FROM data1 GROUP BY DATE_FORMAT(ordertime,'yyyy-MM-dd HH:mm')) as a FULL OUTER JOIN (SELECT DATE_FORMAT(ordertime,'yyyy-MM-dd HH:mm') as t2,count(DATE_FORMAT(ordertime,'yyyy-MM-dd HH:mm')) as count2 FROM data2 GROUP BY DATE_FORMAT(ordertime,'yyyy-MM-dd HH:mm')) as b on a.t1=b.t2) as c order by time")
.write()
.format("jdbc")
.option("url", "jdbc:mysql://127.0.0.1:3306/mydb?useUnicode=true&characterEncoding=utf-8")
.option("dbtable", "order_quantity_time")
.option("user", "root")
.option("password", "123123")
.mode(SaveMode.Append)
.save();
/********** End **********/