统计用户上网流量,如果两次上网的时间小于10min,就rollup到一起
数据
id start_time end_time flow
1 14:20:30 14:46:30 20
1 14:47:20 15:20:30 30
1 15:37:23 16:05:26 40
1 16:06:27 17:20:49 50
1 17:21:50 18:03:27 60
2 14:18:24 15:01:40 20
2 15:20:49 15:30:24 30
2 16:01:23 16:40:32 40
2 16:44:56 17:40:52 50
3 14:39:58 15:35:53 20
3 15:36:39 15:24:54 30
DSL实现
object _05_DSL_Merge {

  // Imports required by this snippet (in a full source file these belong at the top).
  import org.apache.spark.sql.{DataFrame, SparkSession}
  import org.apache.spark.sql.expressions.Window

  /**
   * Rolls up a user's browsing records: two consecutive records of the same user
   * are merged into one session when the gap between them (previous end_time to
   * next start_time) is less than 10 minutes; flow is summed per merged session.
   */
  def main(args: Array[String]): Unit = {
    // Build a local SparkSession.
    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()

    // Read the space-separated file; schema types are inferred from the data.
    val dataFrame: DataFrame = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .option("sep", " ")
      .csv("data/sparksql/merge.dat")

    // Implicit conversions ($-columns) and built-in SQL functions.
    import spark.implicits._
    import org.apache.spark.sql.functions._

    val result = dataFrame.select(
        $"id",
        $"start_time",
        $"end_time",
        // Previous row's end_time within the same user, ordered by start_time.
        // The first row has no predecessor, so it defaults to its own start_time
        // (gap = 0, so it never opens a new session by itself).
        expr("lag(end_time, 1, start_time)") over (Window.partitionBy($"id") orderBy $"start_time") as "lag_time",
        $"flow"
      ).select(
        $"id",
        $"start_time",
        $"end_time",
        // flag = 1 marks the start of a new session. The spec merges only when
        // the gap is strictly LESS than 10 minutes, so a gap of exactly 10
        // minutes must open a new session — hence >= rather than > .
        expr("if( ( to_unix_timestamp(start_time,'HH:mm:ss') - to_unix_timestamp(lag_time,'HH:mm:ss') ) >= 10 * 60, 1 , 0)") as "flag",
        $"flow"
      ).select(
        $"id",
        $"start_time",
        $"end_time",
        // Running sum of flags per user yields a session number for each row.
        sum("flag") over (Window.partitionBy($"id") orderBy $"start_time"
          rowsBetween(Window.unboundedPreceding, Window.currentRow)) as "flags",
        $"flow"
      ).groupBy(
        $"id",
        $"flags"
      ).agg(
        // Session boundaries and total traffic of each merged session.
        min($"start_time") as "start_time",
        max($"end_time") as "end_time",
        sum($"flow") as "flow"
      ).drop($"flags")
      .orderBy($"id".asc)

    result.show()
    spark.stop()
  }
}
结果
+---+----------+--------+----+
| id|start_time|end_time|flow|
+---+----------+--------+----+
| 1| 14:20:30|15:20:30| 50|
| 1| 15:37:23|18:03:27| 150|
| 2| 14:18:24|15:01:40| 20|
| 2| 15:20:49|15:30:24| 30|
| 2| 16:01:23|17:40:52| 90|
| 3| 14:39:58|15:35:53| 50|
+---+----------+--------+----+