uid,start_time,end_time,flow
1,2020-02-18 14:20:30,2020-02-18 14:46:30,20
1,2020-02-18 14:47:20,2020-02-18 15:20:30,30
1,2020-02-18 15:37:23,2020-02-18 16:05:26,40
1,2020-02-18 16:06:27,2020-02-18 17:20:49,50
1,2020-02-18 17:21:50,2020-02-18 18:03:27,60
2,2020-02-18 14:18:24,2020-02-18 15:01:40,20
2,2020-02-18 15:20:49,2020-02-18 15:30:24,30
2,2020-02-18 16:01:23,2020-02-18 16:40:32,40
2,2020-02-18 16:44:56,2020-02-18 17:40:52,50
3,2020-02-18 14:39:58,2020-02-18 15:35:53,20
3,2020-02-18 15:36:39,2020-02-18 15:24:54,30
DSL code:
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SparkSession}

object GuildDslDemo {
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("GuildDslDemo")
      .master("local[*]")
      .getOrCreate()

    val df: DataFrame = spark
      .read
      .option("header", "true")
      .csv("data/liuliang.txt")

    import spark.implicits._
    import org.apache.spark.sql.functions._

    df.select(
      $"uid",
      $"start_time",
      $"end_time",
      $"flow",
      // previous end_time within the same uid; the first row falls back to its own start_time
      expr("lag(end_time, 1, start_time)") over (Window.partitionBy($"uid").orderBy($"start_time".asc)) as "lag_time"
    ).select(
      $"uid",
      $"start_time",
      $"end_time",
      $"flow",
      // flag = 1 when the gap to the previous record exceeds 10 minutes (600 seconds)
      expr("if(to_unix_timestamp(start_time,'yyyy-MM-dd HH:mm:ss') - to_unix_timestamp(lag_time,'yyyy-MM-dd HH:mm:ss') > 600, 1, 0)") as "flag"
    ).select(
      $"uid",
      $"start_time",
      $"end_time",
      $"flow",
      // running sum of the flags assigns a session id per uid
      sum($"flag") over (Window.partitionBy($"uid").orderBy($"start_time").rowsBetween(Window.unboundedPreceding, Window.currentRow)) as "sum_flag"
    ).groupBy(
      $"uid",
      $"sum_flag"
    ).agg(
      // uid and sum_flag are grouping columns and are kept in the output automatically, so they are not repeated here
      min($"start_time") as "start_time",
      max($"end_time") as "end_time",
      sum("flow") as "flow"
    ).drop(
      "sum_flag"
    ).show()

    spark.stop()
  }
}
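
For comparison, here is a minimal sketch of the same gap-based session merge written as a single Spark SQL query instead of the DataFrame DSL. It assumes it runs inside the same main method after df is defined; the temp view name liuliang, the subquery aliases t1/t2, and the output alias flow are assumptions, not part of the original code.

    df.createOrReplaceTempView("liuliang")
    spark.sql(
      """
        |SELECT uid,
        |       MIN(start_time) AS start_time,
        |       MAX(end_time)   AS end_time,
        |       SUM(flow)       AS flow
        |FROM (
        |  SELECT uid, start_time, end_time, flow,
        |         -- running sum of the flags = session id per uid
        |         SUM(flag) OVER (PARTITION BY uid ORDER BY start_time
        |                         ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS sum_flag
        |  FROM (
        |    SELECT uid, start_time, end_time, flow,
        |           -- flag = 1 when the gap to the previous record of the same uid exceeds 600 seconds
        |           IF(to_unix_timestamp(start_time, 'yyyy-MM-dd HH:mm:ss')
        |              - to_unix_timestamp(
        |                  lag(end_time, 1, start_time) OVER (PARTITION BY uid ORDER BY start_time),
        |                  'yyyy-MM-dd HH:mm:ss') > 600, 1, 0) AS flag
        |    FROM liuliang
        |  ) t1
        |) t2
        |GROUP BY uid, sum_flag
        |""".stripMargin
    ).show()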