经典案例之统计用户上网流量(DSL实现)

统计用户上网流量,如果两次上网的时间小于10min,就rollup到一起

数据

id start_time end_time flow
1 14:20:30 14:46:30 20
1 14:47:20 15:20:30 30
1 15:37:23 16:05:26 40
1 16:06:27 17:20:49 50
1 17:21:50 18:03:27 60
2 14:18:24 15:01:40 20
2 15:20:49 15:30:24 30
2 16:01:23 16:40:32 40
2 16:44:56 17:40:52 50
3 14:39:58 15:35:53 20
3 15:36:39 15:24:54 30

DSL实现

object _05_DSL_Merge {
  def main(args: Array[String]): Unit = {
    //获取session对象
    val spark: SparkSession = SparkSession.builder().master("local[*]").getOrCreate()
    //从csv文件读数据
    val dataFrame: DataFrame = spark.read
      .option("header", "true")
      .option("inferSchema", true)
      .option("sep", " ")
      .csv("data/sparksql/merge.dat")

    //利用DSL风格进行流量的统计
    //导入隐式转换
    import spark.implicits._
    import  org.apache.spark.sql.functions._

    val value = dataFrame.select(
      $"id",
      $"start_time",
      $"end_time",
      expr("lag(end_time,1,start_time)") over (Window.partitionBy($"id") orderBy ($"start_time")) as "lag_time",
      $"flow"
    ).select(
      $"id",
      $"start_time",
      $"end_time",
      expr("if( ( to_unix_timestamp(start_time,'HH:mm:ss') - to_unix_timestamp(lag_time,'HH:mm:ss') ) > 10 * 60, 1 , 0)") as "flag",
      $"flow"
    ).select(
      $"id",
      $"start_time",
      $"end_time",
      sum("flag") over (Window.partitionBy($"id") orderBy ($"start_time")
        rowsBetween(Window.unboundedPreceding, Window.currentRow)) as "flags",
      $"flow"
    ).groupBy(
      $"id",
      $"flags"
    ).agg(
      min($"start_time") as "start_time",
      max($"end_time") as "end_time",
      sum($"flow") as "flow"
    ).drop($"flags")
      .orderBy($"id" asc)

    value.show()
    spark.stop()
  }
}

结果

+---+----------+--------+----+
| id|start_time|end_time|flow|
+---+----------+--------+----+
|  1|  14:20:30|15:20:30|  50|
|  1|  15:37:23|18:03:27| 150|
|  2|  14:18:24|15:01:40|  20|
|  2|  15:20:49|15:30:24|  30|
|  2|  16:01:23|17:40:52|  90|
|  3|  14:39:58|15:35:53|  50|
+---+----------+--------+----+
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值