Spark SQL Case Study: Aggregating User Internet Traffic with SQL and the DSL

Requirements analysis:

Compute each user's internet traffic, rolling two consecutive sessions up into one whenever the gap between them is less than 10 minutes.
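
The plan, worked out step by step below, is the classic gaps-and-islands pattern: use lag to pull the previous session's end_time onto each row, flag rows whose gap to the previous session is more than 10 minutes, take a running sum of the flags as a per-user session id, and finally group by (uid, session id) and aggregate the flow.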

uid,start_time,end_time,flow
1,2020-02-18 14:20:30,2020-02-18 14:46:30,20
1,2020-02-18 14:47:20,2020-02-18 15:20:30,30
1,2020-02-18 15:37:23,2020-02-18 16:05:26,40
1,2020-02-18 16:06:27,2020-02-18 17:20:49,50
1,2020-02-18 17:21:50,2020-02-18 18:03:27,60
2,2020-02-18 14:18:24,2020-02-18 15:01:40,20
2,2020-02-18 15:20:49,2020-02-18 15:30:24,30
2,2020-02-18 16:01:23,2020-02-18 16:40:32,40
2,2020-02-18 16:44:56,2020-02-18 17:40:52,50
3,2020-02-18 14:39:58,2020-02-18 15:35:53,20
3,2020-02-18 15:36:39,2020-02-18 15:24:54,30
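
To run the SQL snippets below, save the sample data (including the header line) as a CSV file and register it as the temporary view t_int. A minimal sketch, assuming the same file path used by the full programs further down:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("FlowRollup")
  .master("local[*]")
  .getOrCreate()

// read the CSV with its header row; every column comes in as a string
val df = spark.read
  .option("header", "true")
  .csv("src/main/scala/data/data.csv")

// register the view that the SQL below queries
df.createTempView("t_int")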


-- Pull the previous row's end_time down to the current row (lag by one row)


select
  uid,
  start_time,
  end_time,
  flow,
  lag(end_time,1,start_time) over(partition by uid order by start_time) lag_time
from
  t_int
  
+---+-------------------+-------------------+----+-------------------+
|uid|         start_time|           end_time|flow|           lag_time|
+---+-------------------+-------------------+----+-------------------+
|  3|2020-02-18 14:39:58|2020-02-18 15:35:53|  20|               null|
|  3|2020-02-18 15:36:39|2020-02-18 15:24:54|  30|2020-02-18 15:35:53|
|  1|2020-02-18 14:20:30|2020-02-18 14:46:30|  20|               null|
|  1|2020-02-18 14:47:20|2020-02-18 15:20:30|  30|2020-02-18 14:46:30|
|  1|2020-02-18 15:37:23|2020-02-18 16:05:26|  40|2020-02-18 15:20:30|
|  1|2020-02-18 16:06:27|2020-02-18 17:20:49|  50|2020-02-18 16:05:26|
|  1|2020-02-18 17:21:50|2020-02-18 18:03:27|  60|2020-02-18 17:20:49|
|  2|2020-02-18 14:18:24|2020-02-18 15:01:40|  20|               null|
|  2|2020-02-18 15:20:49|2020-02-18 15:30:24|  30|2020-02-18 15:01:40|
|  2|2020-02-18 16:01:23|2020-02-18 16:40:32|  40|2020-02-18 15:30:24|
|  2|2020-02-18 16:44:56|2020-02-18 17:40:52|  50|2020-02-18 16:40:32|
+---+-------------------+-------------------+----+-------------------+
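
Each row now carries lag_time, the end_time of the same user's previous session (null on the first row of each uid), which the next step compares against the current start_time.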

-- Subtract lag_time from start_time; if the difference is more than 10 minutes return 1, otherwise return 0

select
  uid,
  start_time,
  end_time,
  flow,
  if((to_unix_timestamp(start_time)-to_unix_timestamp(lag_time))/60>10,1,0) flag
from
(
  select
    uid,
    start_time,
    end_time,
    flow,
    lag(end_time,1,start_time) over(partition by uid order by start_time) lag_time
  from
    t_int
)t1


+---+-------------------+-------------------+----+----+
|uid|         start_time|           end_time|flow|flag|
+---+-------------------+-------------------+----+----+
|  3|2020-02-18 14:39:58|2020-02-18 15:35:53|  20|   0|
|  3|2020-02-18 15:36:39|2020-02-18 15:24:54|  30|   0|
|  1|2020-02-18 14:20:30|2020-02-18 14:46:30|  20|   0|
|  1|2020-02-18 14:47:20|2020-02-18 15:20:30|  30|   0|
|  1|2020-02-18 15:37:23|2020-02-18 16:05:26|  40|   1|
|  1|2020-02-18 16:06:27|2020-02-18 17:20:49|  50|   0|
|  1|2020-02-18 17:21:50|2020-02-18 18:03:27|  60|   0|
|  2|2020-02-18 14:18:24|2020-02-18 15:01:40|  20|   0| 
|  2|2020-02-18 15:20:49|2020-02-18 15:30:24|  30|   1| 
|  2|2020-02-18 16:01:23|2020-02-18 16:40:32|  40|   1| 
|  2|2020-02-18 16:44:56|2020-02-18 17:40:52|  50|   0| 
+---+-------------------+-------------------+----+----+
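
For example, uid 2's session starting at 15:20:49 follows a session that ended at 15:01:40, a gap of about 19 minutes, so it is flagged 1; uid 1's session starting at 14:47:20 follows one ending at 14:46:30, a gap of only 50 seconds, so it is flagged 0.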

-- Accumulate flag with a window function (running sum per uid)
select
  uid,
  start_time,
  end_time,
  flow,
  sum(flag) over(partition by uid order by start_time) sum_flag
from
(
  select
    uid,
    start_time,
    end_time,
    flow,
    if((to_unix_timestamp(start_time)-to_unix_timestamp(lag_time))/60>10,1,0) flag
  from
  (
    select
      uid,
      start_time,
      end_time,
      flow,
      lag(end_time,1,start_time) over(partition by uid order by start_time) lag_time
    from
      t_int
  )t1
)t2


+---+-------------------+-------------------+----+--------+
|uid|         start_time|           end_time|flow|sum_flag|
+---+-------------------+-------------------+----+--------+
|  3|2020-02-18 14:39:58|2020-02-18 15:35:53|  20|       0|
|  3|2020-02-18 15:36:39|2020-02-18 15:24:54|  30|       0|
|  1|2020-02-18 14:20:30|2020-02-18 14:46:30|  20|       0|
|  1|2020-02-18 14:47:20|2020-02-18 15:20:30|  30|       0|
|  1|2020-02-18 15:37:23|2020-02-18 16:05:26|  40|       1|
|  1|2020-02-18 16:06:27|2020-02-18 17:20:49|  50|       1|
|  1|2020-02-18 17:21:50|2020-02-18 18:03:27|  60|       1|
|  2|2020-02-18 14:18:24|2020-02-18 15:01:40|  20|       0|
|  2|2020-02-18 15:20:49|2020-02-18 15:30:24|  30|       1|
|  2|2020-02-18 16:01:23|2020-02-18 16:40:32|  40|       2|
|  2|2020-02-18 16:44:56|2020-02-18 17:40:52|  50|       2|
+---+-------------------+-------------------+----+--------+
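
The running sum turns the flags into a session id: within each uid, sum_flag stays constant until a gap larger than 10 minutes is hit, so rows sharing (uid, sum_flag) belong to the same merged session and can be grouped together in the final step.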

-- Aggregate the flow within each merged session

select
  uid,
  min(start_time) start_time,
  max(end_time) end_time,
  sum(flow) sum_flow
from
(
  select
    uid,
    start_time,
    end_time,
    flow,
    sum(flag) over(partition by uid order by start_time) sum_flag
  from
  (
    select
      uid,
      start_time,
      end_time,
      flow,
      if((to_unix_timestamp(start_time)-to_unix_timestamp(lag_time))/60>10,1,0) flag
    from
    (
      select
        uid,
        start_time,
        end_time,
        flow,
        lag(end_time,1,start_time) over(partition by uid order by start_time) lag_time
      from
        t_int
    )t1
  )t2
)t3
group by uid,sum_flag

+---+-------------------+-------------------+--------+
|uid|         start_time|           end_time|sum_flow|
+---+-------------------+-------------------+--------+
|  3|2020-02-18 14:39:58|2020-02-18 15:35:53|    50.0|
|  1|2020-02-18 14:20:30|2020-02-18 15:20:30|    50.0|
|  1|2020-02-18 15:37:23|2020-02-18 18:03:27|   150.0|
|  2|2020-02-18 14:18:24|2020-02-18 15:01:40|    20.0|
|  2|2020-02-18 15:20:49|2020-02-18 15:30:24|    30.0|
|  2|2020-02-18 16:01:23|2020-02-18 17:40:52|    90.0|
+---+-------------------+-------------------+--------+
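
Note that sum_flow shows up as a double (50.0 rather than 50): the CSV is read with only the header option, so flow is a string column and Spark casts it to double when summing.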

1. Using SQL

import org.apache.spark.sql.{DataFrame, SparkSession}

object SQLFlowRollupDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    val df: DataFrame = spark.read
      .option("header", "true")
      .csv("src/main/scala/data/data.csv")

    df.createTempView("t_int")

    val res = spark.sql(
      """
        |
        |select
        |  uid,
        |  min(start_time) start_time,
        |  max(end_time) end_time,
        |  sum(flow) sum_flow
        |from
        |(
        |  select
        |    uid,
        |    start_time,
        |    end_time,
        |    flow,
        |    sum(flag) over(partition by uid order by start_time) sum_flag
        |  from
        |  (
        |    select
        |      uid,
        |      start_time,
        |      end_time,
        |      flow,
        |      if((to_unix_timestamp(start_time)-to_unix_timestamp(lag_time))/60>10,1,0) flag
        |    from
        |    (
        |      select
        |        uid,
        |        start_time,
        |        end_time,
        |        flow,
        |        lag(end_time,1,start_time) over(partition by uid order by start_time) lag_time
        |      from
        |        t_int
        |    )t1
        |  )t2
        |)t3
        |group by uid,sum_flag
        |
        |""".stripMargin)

    res.show()

    spark.stop()
  }

}
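
If proper numeric types are wanted, schema inference can be enabled on the read in the program above (an optional tweak, assuming the same file path); flow then comes in as a numeric column instead of being cast from string inside sum:

    // Optional: infer column types instead of reading everything as strings
    val df: DataFrame = spark.read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv("src/main/scala/data/data.csv")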

2. Using the DSL

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.{DataFrame, SparkSession}

object DSLFlowRollupDemo {
  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    val df: DataFrame = spark.read
      .option("header", "true")
      .csv("src/main/scala/data/data.csv")

    import spark.implicits._
    import org.apache.spark.sql.functions._
    val res = df.select('uid,
        'start_time,
        'end_time,
        'flow,
        // end_time of the user's previous session
        expr("lag(end_time,1,start_time)")
          .over(Window.partitionBy('uid).orderBy("start_time")) as "lag_time")
      .select('uid, 'start_time, 'end_time, 'flow,
        // 1 if the gap to the previous session is more than 10 minutes, else 0
        expr("if((to_unix_timestamp(start_time)-to_unix_timestamp(lag_time))/60 > 10,1,0)") as "flag")
      .select('uid, 'start_time, 'end_time, 'flow,
        // running sum of flag = session id within each uid
        sum('flag).over(Window.partitionBy("uid").orderBy("start_time")) as "sum_flag")
      .groupBy("uid", "sum_flag")
      .agg(
        min("start_time") as "start_time",
        max("end_time") as "end_time",
        sum('flow) as "sum_flow")

    res.show()

    spark.stop()
  }
}
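
The expr(...) calls above embed SQL fragments inside the DSL. An alternative sketch of the same pipeline using only native column functions (lag, unix_timestamp, when, sum) — not the original code, just one possible variant assuming the same df as above:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val w = Window.partitionBy("uid").orderBy("start_time")

val merged = df
  // end_time of the previous session within the same uid (null on the first row)
  .withColumn("lag_time", lag(col("end_time"), 1).over(w))
  // gap to the previous session in minutes; null for the first row of each uid
  .withColumn("gap_min", (unix_timestamp(col("start_time")) - unix_timestamp(col("lag_time"))) / 60)
  // 1 marks the start of a new session; null gaps fall into otherwise(0)
  .withColumn("flag", when(col("gap_min") > 10, 1).otherwise(0))
  // running sum of flag = session id per uid
  .withColumn("sum_flag", sum(col("flag")).over(w))
  .groupBy("uid", "sum_flag")
  .agg(
    min("start_time") as "start_time",
    max("end_time") as "end_time",
    sum("flow") as "sum_flow")

merged.show()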

 
