统计用户上网流量,如果两次上网的时间小于10min,就可以rollup到一起
源数据
id start_time end_time flow
1 14:20:30 14:46:30 20
1 14:47:20 15:20:30 30
1 15:37:23 16:05:26 40
1 16:06:27 17:20:49 50
1 17:21:50 18:03:27 60
2 14:18:24 15:01:40 20
2 15:20:49 15:30:24 30
2 16:01:23 16:40:32 40
2 16:44:56 17:40:52 50
3 14:39:58 15:35:53 20
3 15:36:39 15:24:54 30
sparksql的代码实现
// Merge a user's browsing sessions: whenever the gap between the previous
// session's end_time and the next session's start_time is less than 10
// minutes, the sessions are rolled up into one record (min start_time,
// max end_time, summed flow).
object _13_SparkSql_Merge {

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("flow-merge") // explicit name so the job is identifiable in the UI
      .master("local[*]")
      .getOrCreate()

    val frame = spark
      .read
      .option("header", "true")    // first line is the header row
      .option("inferSchema", true) // infer column types from the data
      .option("sep", " ")          // the .dat file is space-separated
      .csv("data/sparksql/merge.dat")

    frame.createTempView("flow_merge")

    // Four-step merge:
    //   tmp_01: lag() pulls the previous row's end_time onto each row
    //   tmp_02: gap in seconds between this start_time and the previous end_time
    //   tmp_03: flag = 1 when the gap exceeds 10 minutes (a new group starts)
    //   tmp_04: running sum of flag yields a group id; group-by collapses groups
    // NOTE: no trailing ';' — spark.sql rejects it on several Spark versions,
    // and the inner ORDER BY was dropped (a subquery sort is not guaranteed to
    // survive the window shuffle and only wastes work).
    val dataFrame = spark.sql(
      """select
        |  id,
        |  min(start_time) as start_time,
        |  max(end_time)   as end_time,
        |  sum(flow)       as flow
        |from (
        |  select
        |    id, start_time, end_time, flow,
        |    -- explicit ROWS frame: the default RANGE frame would sum ties together
        |    sum(flag) over (partition by id order by start_time
        |                    rows between unbounded preceding and current row) as flags
        |  from (
        |    select
        |      id, start_time, end_time, flow,
        |      -- null gap = first session of the id; both stay in the current group
        |      case when seconds is null or seconds <= 10 * 60 then 0 else 1 end as flag
        |    from (
        |      select
        |        id, start_time, end_time, flow,
        |        unix_timestamp(start_time, 'HH:mm:ss')
        |          - unix_timestamp(lag_end_time, 'HH:mm:ss') as seconds
        |      from (
        |        select
        |          id, start_time, end_time, flow,
        |          lag(end_time, 1) over (partition by id order by start_time) as lag_end_time
        |        from flow_merge
        |      ) tmp_01
        |    ) tmp_02
        |  ) tmp_03
        |) tmp_04
        |group by id, flags
        |order by id, start_time
        |""".stripMargin)

    dataFrame.show()
    spark.stop()
  }
}
试想一下,如何使用RDD进行统计???
sql的实现过程
-- select (unix_timestamp('14:50:30','HH:mm:ss') - unix_timestamp('14:50:29','HH:mm:ss') ); 计算相差的秒数
-- 1、利用lag函数,把start_time|end_time的数据压到下一行
-- Step 1: lag() pulls the previous session's start_time / end_time onto the
-- current row (null on each id's first row).
select
    id,
    start_time,
    end_time,
    lag(start_time, 1) over (partition by id order by start_time) as lag_start_time,
    lag(end_time, 1)   over (partition by id order by start_time) as lag_end_time,
    flow
from flow_merge
order by id, start_time;
+---+----------+--------+--------------+------------+----+
| id|start_time|end_time|lag_start_time|lag_end_time|flow|
+---+----------+--------+--------------+------------+----+
|  1|  14:20:30|14:46:30|          null|        null|  20|
|  1|  14:47:20|15:20:30|      14:20:30|    14:46:30|  30|
|  1|  15:37:23|16:05:26|      14:47:20|    15:20:30|  40|
|  1|  16:06:27|17:20:49|      15:37:23|    16:05:26|  50|
|  1|  17:21:50|18:03:27|      16:06:27|    17:20:49|  60|
|  2|  14:18:24|15:01:40|          null|        null|  20|
|  2|  15:20:49|15:30:24|      14:18:24|    15:01:40|  30|
|  2|  16:01:23|16:40:32|      15:20:49|    15:30:24|  40|
|  2|  16:44:56|17:40:52|      16:01:23|    16:40:32|  50|
|  3|  14:39:58|15:35:53|          null|        null|  20|
|  3|  15:36:39|15:24:54|      14:39:58|    15:35:53|  30|
+---+----------+--------+--------------+------------+----+
-- 2、计算差值
-- Step 2: gap in seconds between this row's start_time and the previous row's
-- end_time (null on each id's first row).
-- The ORDER BY was moved from the subquery to the outer query: a sort inside a
-- subquery is not guaranteed to order the outer result and only wastes a shuffle.
select
    id,
    start_time,
    end_time,
    lag_start_time,
    lag_end_time,
    unix_timestamp(start_time, 'HH:mm:ss')
        - unix_timestamp(lag_end_time, 'HH:mm:ss') as seconds,
    flow
from (
    select
        id,
        start_time,
        end_time,
        lag(start_time, 1) over (partition by id order by start_time) as lag_start_time,
        lag(end_time, 1)   over (partition by id order by start_time) as lag_end_time,
        flow
    from flow_merge
) tmp_01
order by id, start_time;
+---+----------+--------+--------------+------------+-------+----+
| id|start_time|end_time|lag_start_time|lag_end_time|seconds|flow|
+---+----------+--------+--------------+------------+-------+----+
| 1| 14:20:30|14:46:30| null| null| null| 20|
| 1| 14:47:20|15:20:30| 14:20:30| 14:46:30| 50| 30|
| 1| 15:37:23|16:05:26| 14:47:20| 15:20:30| 1013| 40|
| 1| 16:06:27|17:20:49| 15:37:23| 16:05:26| 61| 50|
| 1| 17:21:50|18:03:27| 16:06:27| 17:20:49| 61| 60|
| 2| 14:18:24|15:01:40| null| null| null| 20|
| 2| 15:20:49|15:30:24| 14:18:24| 15:01:40| 1149| 30|
| 2| 16:01:23|16:40:32| 15:20:49| 15:30:24| 1859| 40|
| 2| 16:44:56|17:40:52| 16:01:23| 16:40:32| 264| 50|
| 3| 14:39:58|15:35:53| null| null| null| 20|
| 3| 15:36:39|15:24:54| 14:39:58| 15:35:53| 46| 30|
+---+----------+--------+--------------+------------+-------+----+
-- 3、打标记,如果小于10min为0或者是为空 否则为1
-- Step 3: flag each row — 1 when the gap to the previous session exceeds
-- 10 minutes (a new merge group starts), 0 otherwise.
-- The two 0-producing branches (null gap = first row of an id, and gap <= 600s)
-- are merged with OR; the redundant subquery ORDER BY was moved to the outer
-- query, and the unused lag columns are no longer carried through.
select
    id,
    start_time,
    end_time,
    case
        when seconds is null or seconds <= 10 * 60 then 0
        else 1
    end as flag,
    flow
from (
    select
        id,
        start_time,
        end_time,
        unix_timestamp(start_time, 'HH:mm:ss')
            - unix_timestamp(lag_end_time, 'HH:mm:ss') as seconds,
        flow
    from (
        select
            id,
            start_time,
            end_time,
            lag(end_time, 1) over (partition by id order by start_time) as lag_end_time,
            flow
        from flow_merge
    ) tmp_01
) tmp_02
order by id, start_time;
+---+----------+--------+----+----+
| id|start_time|end_time|flag|flow|
+---+----------+--------+----+----+
| 1| 14:20:30|14:46:30| 0| 20|
| 1| 14:47:20|15:20:30| 0| 30|
| 1| 15:37:23|16:05:26| 1| 40|
| 1| 16:06:27|17:20:49| 0| 50|
| 1| 17:21:50|18:03:27| 0| 60|
| 2| 14:18:24|15:01:40| 0| 20|
| 2| 15:20:49|15:30:24| 1| 30|
| 2| 16:01:23|16:40:32| 1| 40|
| 2| 16:44:56|17:40:52| 0| 50|
| 3| 14:39:58|15:35:53| 0| 20|
| 3| 15:36:39|15:24:54| 0| 30|
+---+----------+--------+----+----+
-- 4、以id为分区,再次开窗,用sum计算和
-- Step 4: a running sum of flag per id assigns every row a merge-group id
-- (flags). An explicit ROWS frame is used: the default RANGE frame would sum
-- tied start_time rows together. The outer ORDER BY makes the displayed
-- result deterministic (the original relied on an inner sort that does not
-- survive the window shuffle — note ids 3 and 2 swapped in its output).
select
    id,
    start_time,
    end_time,
    sum(flag) over (
        partition by id
        order by start_time
        rows between unbounded preceding and current row
    ) as flags,
    flow
from (
    select
        id,
        start_time,
        end_time,
        case
            when seconds is null or seconds <= 10 * 60 then 0
            else 1
        end as flag,
        flow
    from (
        select
            id,
            start_time,
            end_time,
            unix_timestamp(start_time, 'HH:mm:ss')
                - unix_timestamp(lag_end_time, 'HH:mm:ss') as seconds,
            flow
        from (
            select
                id,
                start_time,
                end_time,
                lag(end_time, 1) over (partition by id order by start_time) as lag_end_time,
                flow
            from flow_merge
        ) tmp_01
    ) tmp_02
) tmp_03
order by id, start_time;
+---+----------+--------+-----+----+
| id|start_time|end_time|flags|flow|
+---+----------+--------+-----+----+
| 1| 14:20:30|14:46:30| 0| 20|
| 1| 14:47:20|15:20:30| 0| 30|
| 1| 15:37:23|16:05:26| 1| 40|
| 1| 16:06:27|17:20:49| 1| 50|
| 1| 17:21:50|18:03:27| 1| 60|
| 3| 14:39:58|15:35:53| 0| 20|
| 3| 15:36:39|15:24:54| 0| 30|
| 2| 14:18:24|15:01:40| 0| 20|
| 2| 15:20:49|15:30:24| 1| 30|
| 2| 16:01:23|16:40:32| 2| 40|
| 2| 16:44:56|17:40:52| 2| 50|
+---+----------+--------+-----+----+
-- 5、以id和flags分组,聚合得到结果
-- Step 5: group by (id, flags) and aggregate — each merge group collapses to
-- one record: earliest start, latest end, total flow.
-- Fixes over the original: explicit ROWS frame on the running sum (default
-- RANGE frame sums ties together), redundant subquery ORDER BY removed, and
-- the unused lag_start_time/lag_end_time columns are no longer propagated.
select
    id,
    min(start_time) as start_time,
    max(end_time)   as end_time,
    sum(flow)       as flow
from (
    select
        id,
        start_time,
        end_time,
        sum(flag) over (
            partition by id
            order by start_time
            rows between unbounded preceding and current row
        ) as flags,
        flow
    from (
        select
            id,
            start_time,
            end_time,
            case
                when seconds is null or seconds <= 10 * 60 then 0
                else 1
            end as flag,
            flow
        from (
            select
                id,
                start_time,
                end_time,
                unix_timestamp(start_time, 'HH:mm:ss')
                    - unix_timestamp(lag_end_time, 'HH:mm:ss') as seconds,
                flow
            from (
                select
                    id,
                    start_time,
                    end_time,
                    lag(end_time, 1) over (partition by id order by start_time) as lag_end_time,
                    flow
                from flow_merge
            ) tmp_01
        ) tmp_02
    ) tmp_03
) tmp_04
group by id, flags
order by id, start_time;
+---+----------+--------+----+
| id|start_time|end_time|flow|
+---+----------+--------+----+
| 1| 14:20:30|15:20:30| 50|
| 1| 15:37:23|18:03:27| 150|
| 2| 14:18:24|15:01:40| 20|
| 2| 15:20:49|15:30:24| 30|
| 2| 16:01:23|17:40:52| 90|
| 3| 14:39:58|15:35:53| 50|
+---+----------+--------+----+