sparksql经典案例之连续登录问题(求每个用户连续登录最大的天数,最少登录3天)

求每个用户连续登录最大的天数,最少登录3天

结果

最终的结果如下:
+------+-----+----------+----------+
|   uid|times|start_date|  end_date|
+------+-----+----------+----------+
|guid01|    4|2018-03-04|2018-03-07|
|guid02|    3|2018-03-01|2018-03-03|
|guid03|    3|2018-03-09|2018-03-11|
|guid04|    3|2018-03-05|2018-03-07|
+------+-----+----------+----------+

源数据

uid,dt
guid01,2018-02-28  
guid01,2018-03-01
guid01,2018-03-01
guid01,2018-03-02
guid01,2018-03-05
guid01,2018-03-04
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-02
guid02,2018-03-03
guid02,2018-03-06
guid03,2018-03-06
guid03,2018-03-07
guid03,2018-03-09
guid03,2018-03-10
guid03,2018-03-11
guid04,2018-03-05
guid04,2018-03-06
guid04,2018-03-07
guid04,2018-03-09

代码实现

object _07_SQL_login {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    val sc: SparkContext = spark.sparkContext
    val rdd1: RDD[String] = sc.textFile("data/sparksql/data1.txt", 2)

    val rowRDD: RDD[Row] = rdd1.map(line => {
      val arr = line.split(",")
      Row(arr(0), arr(1))
    })

    val schema = StructType(
      List(
        StructField("uid", DataTypes.StringType),
        StructField("dt", DataTypes.StringType)
      )
    )

    val dataFrame = spark.createDataFrame(rowRDD, schema)
    //创建视图
    dataFrame.createTempView("login")
    //写sql语句
    val frame = spark.sql(
      """
        |select
        |  uid,
        |  max(times) times,
        |  collect_list(start_date)[0] start_date,
        |  collect_list(end_date)[0] end_date
        |from
        |(
        |  select
        |    uid,
        |    sum(1) times, --连续登录的天数
        |    min(dt) start_date,
        |    max(dt) end_date
        |  from
        |  (
        |    select
        |      uid,
        |      dt,
        |      date_sub(dt,num) diff
        |    from
        |    (
        |      select
        |        uid,
        |        dt,
        |        row_number() over( partition by uid order by dt) num
        |      from
        |      (
        |        select
        |          distinct uid,dt
        |        from
        |          login
        |        order by
        |          uid,dt
        |      ) tmp_01
        |    ) tmp_02
        |  ) tmp_03
        |  group by
        |    uid,diff
        |  having
        |    sum(1) >= 3
        |  order by
        |    uid,times desc
        |) tmp_04
        |group by
        |  uid
        |order by
        |  uid;
        |""".stripMargin)
    frame.show()
    //frame.write.format("parquet").mode(SaveMode.ErrorIfExists).save("data/sparksql/par")
    sc.stop()
    spark.stop()
  }
}

sql语句的实现过程

-- 1、按照用户id,dt去重
select
  distinct uid,dt
from
  login
order by
  uid,dt;
  +------+----------+
|   uid|        dt|
+------+----------+
|guid01|2018-02-28|
|guid01|2018-03-01|
|guid01|2018-03-02|
|guid01|2018-03-04|
|guid01|2018-03-05|
|guid01|2018-03-06|
|guid01|2018-03-07|
|guid02|2018-03-01|
|guid02|2018-03-02|
|guid02|2018-03-03|
|guid02|2018-03-06|
|guid03|2018-03-06|
|guid03|2018-03-07|
|guid03|2018-03-09|
|guid03|2018-03-10|
|guid03|2018-03-11|
|guid04|2018-03-05|
|guid04|2018-03-06|
|guid04|2018-03-07|
|guid04|2018-03-09|
+------+----------+
-- 2、划分窗口,打行号
select
  uid,
  dt,
  row_number() over( partition by uid order by dt) num
from
(
  select
    distinct uid,dt
  from
    login
  order by
    uid,dt
) tmp_01;
+------+----------+---+
|   uid|        dt|num|
+------+----------+---+
|guid02|2018-03-01|  1|
|guid02|2018-03-02|  2|
|guid02|2018-03-03|  3|
|guid02|2018-03-06|  4|
|guid03|2018-03-06|  1|
|guid03|2018-03-07|  2|
|guid03|2018-03-09|  3|
|guid03|2018-03-10|  4|
|guid03|2018-03-11|  5|
|guid04|2018-03-05|  1|
|guid04|2018-03-06|  2|
|guid04|2018-03-07|  3|
|guid04|2018-03-09|  4|
|guid01|2018-02-28|  1|
|guid01|2018-03-01|  2|
|guid01|2018-03-02|  3|
|guid01|2018-03-04|  4|
|guid01|2018-03-05|  5|
|guid01|2018-03-06|  6|
|guid01|2018-03-07|  7|
+------+----------+---+

-- 3、利用data_sub函数,让dt减去num的值,连续登录差值是一样的

select
  uid,
  dt,
  date_sub(dt,num) diff
from
(
  select
    uid,
    dt,
    row_number() over( partition by uid order by dt) num
  from
  (
    select
      distinct uid,dt
    from
      login
    order by
      uid,dt
  ) tmp_01
) tmp_02;
+------+----------+----------+
|   uid|        dt|      diff|
+------+----------+----------+
|guid02|2018-03-01|2018-02-28|
|guid02|2018-03-02|2018-02-28|
|guid02|2018-03-03|2018-02-28|
|guid02|2018-03-06|2018-03-02|
|guid03|2018-03-06|2018-03-05|
|guid03|2018-03-07|2018-03-05|
|guid03|2018-03-09|2018-03-06|
|guid03|2018-03-10|2018-03-06|
|guid03|2018-03-11|2018-03-06|
|guid04|2018-03-05|2018-03-04|
|guid04|2018-03-06|2018-03-04|
|guid04|2018-03-07|2018-03-04|
|guid04|2018-03-09|2018-03-05|
|guid01|2018-02-28|2018-02-27|
|guid01|2018-03-01|2018-02-27|
|guid01|2018-03-02|2018-02-27|
|guid01|2018-03-04|2018-02-28|
|guid01|2018-03-05|2018-02-28|
|guid01|2018-03-06|2018-02-28|
|guid01|2018-03-07|2018-02-28|
+------+----------+----------+
-- 4、依据uid和diff进行分组,过滤连续登录三天以上的uid,并求出开始和结束时间
select
  uid,
  sum(1) times, --连续登录的天数
  min(dt) start_date,
  max(dt) end_date
from
(
  select
    uid,
    dt,
    date_sub(dt,num) diff
  from
  (
    select
      uid,
      dt,
      row_number() over( partition by uid order by dt) num
    from
    (
      select
        distinct uid,dt
      from
        login
      order by
        uid,dt
    ) tmp_01
  ) tmp_02
) tmp_03 
group by
  uid,diff
having 
  sum(1) >= 3
order by
  uid,times desc;
+------+-----+----------+----------+
|   uid|times|start_date|  end_date|
+------+-----+----------+----------+
|guid01|    4|2018-03-04|2018-03-07|
|guid01|    3|2018-02-28|2018-03-02|
|guid02|    3|2018-03-01|2018-03-03|
|guid03|    3|2018-03-09|2018-03-11|
|guid04|    3|2018-03-05|2018-03-07|
+------+-----+----------+----------+

-- 5、对uid再次进行分组,利用collect_list集合拿出第一个元素,就是最大的连续登录次数
select
  uid,
  max(times) times,
  collect_list(start_date)[0] start_date,
  collect_list(end_date)[0] end_date
from
(
  select
    uid,
    sum(1) times, --连续登录的天数
    min(dt) start_date,
    max(dt) end_date
  from
  (
    select
      uid,
      dt,
      date_sub(dt,num) diff
    from
    (
      select
        uid,
        dt,
        row_number() over( partition by uid order by dt) num
      from
      (
        select
          distinct uid,dt
        from
          login
        order by
          uid,dt
      ) tmp_01
    ) tmp_02
  ) tmp_03 
  group by
    uid,diff
  having 
    sum(1) >= 3
  order by
    uid,times desc
) tmp_04
group by
  uid  
order by 
  uid;
+------+-----+----------+----------+
|   uid|times|start_date|  end_date|
+------+-----+----------+----------+
|guid01|    4|2018-03-04|2018-03-07|
|guid02|    3|2018-03-01|2018-03-03|
|guid03|    3|2018-03-09|2018-03-11|
|guid04|    3|2018-03-05|2018-03-07|
+------+-----+----------+----------+
  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值