SparkSQL Example: Finding Users' Consecutive Login Days in Both SQL and DSL Syntax

Test data:

uid,dt
guid01,2018-02-28
guid01,2018-03-01
guid01,2018-03-05
guid01,2018-03-02
guid01,2018-03-04
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-03
guid02,2018-03-02
guid02,2018-03-06

1. SQL style

Notes:

  • Read the data: read the file according to its format to obtain a DataFrame (df).
  • Create a view: build a view from the df with df.createTempView("view_name").
  • Run the subsequent queries against that view.
  • """...""": the SQL can be written as a multi-line string (type three double quotes in English input mode, then press Enter).
  • Parse the header: since the file has a header row, use the option function to treat the first row as field names, which become the DataFrame's schema; data is then read from the second row onward.
object ContinueLoginSQL {

  def main(args: Array[String]): Unit = {

    //create a SparkSession via a project-local Utils helper (a possible sketch follows this listing)
    val spark: SparkSession = Utils.getSparkSession(this.getClass.getSimpleName)

    //read the data; the header option parses the first row as column names
    val df: DataFrame = spark.read.option("header", true).csv("C:\\Users\\logan\\Desktop\\testData\\作业-连续登录3天.csv")

    //SQL style
    //create a view
    df.createTempView("v_uid_dt")

    //run the SQL statement
    val df2: DataFrame = spark.sql(
      """
        |
        |select
        |  uid,
        |  start_dt,
        |  end_dt,
        |  counts
        |from
        |(
        |  select
        |    uid,
        |    min(dt) start_dt,
        |    max(dt) end_dt,
        |    count(date_diff) counts
        |  from
        |  (
        |    select
        |      uid,
        |      dt,
        |      date_sub(dt,flag) date_diff
        |    from
        |    (
        |      select
        |        uid,
        |        dt,
        |        row_number() over(partition by uid order by dt) flag
        |      from
        |        v_uid_dt
        |    )
        |  )
        |  group by
        |    uid,
        |    date_diff
        |)
        |where
        |  counts >= 3
        |
        |""".stripMargin)

    df2.show()
    spark.stop()
    
  }

}
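
The listing above calls a project-local Utils helper that the post does not show. A minimal sketch of what it might look like, assuming only the getSparkSession(appName) signature implied by the call site:

import org.apache.spark.sql.SparkSession

object Utils {

  //hypothetical reconstruction: only the method name and its String parameter
  //are implied by the call site; the builder settings here are assumptions
  def getSparkSession(appName: String): SparkSession = {
    SparkSession.builder()
      .appName(appName)
      .master("local[*]")  //assumption: run locally while experimenting
      .getOrCreate()
  }

}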

Step-by-step walkthrough of the SQL:

uid,dt
guid01,2018-02-28
guid01,2018-03-01
guid01,2018-03-05
guid01,2018-03-02
guid01,2018-03-04
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-03
guid02,2018-03-02
guid02,2018-03-06


-- 1. Partition by uid, sort by date, and assign row numbers

select 
  uid,dt,
  row_number() over(partition by uid order by dt) flag
from
  v_uid_dt


+------+----------+----+
|   uid|        dt|flag|
+------+----------+----+
|guid02|2018-03-01|   1|
|guid02|2018-03-02|   2|
|guid02|2018-03-03|   3|
|guid02|2018-03-06|   4|
|guid01|2018-02-28|   1|
|guid01|2018-03-01|   2|
|guid01|2018-03-02|   3|
|guid01|2018-03-04|   4|
|guid01|2018-03-05|   5|
|guid01|2018-03-06|   6|
|guid01|2018-03-07|   7|
+------+----------+----+


-- 2. Subtract the row number from the date: for consecutive dates,
--    date_sub(dt, flag) stays constant, so each run of consecutive
--    logins shares a single date_diff value

select
  uid,
  dt,
  date_sub(dt,flag) date_diff
from
(
  select 
    uid,
    dt,
    row_number() over(partition by uid order by dt) flag
  from
    v_uid_dt
)


+------+----------+----------+
|   uid|        dt| date_diff|
+------+----------+----------+
|guid02|2018-03-01|2018-02-28|
|guid02|2018-03-02|2018-02-28|
|guid02|2018-03-03|2018-02-28|
|guid02|2018-03-06|2018-03-02|
|guid01|2018-02-28|2018-02-27|
|guid01|2018-03-01|2018-02-27|
|guid01|2018-03-02|2018-02-27|
|guid01|2018-03-04|2018-02-28|
|guid01|2018-03-05|2018-02-28|
|guid01|2018-03-06|2018-02-28|
|guid01|2018-03-07|2018-02-28|
+------+----------+----------+
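
Note that date_sub does real calendar arithmetic, which is why guid01's run crossing the February/March boundary still collapses to a single date_diff value. A quick sanity check that can be run in any Spark session:

spark.sql("select date_sub('2018-03-01', 2) as d").show()

+----------+
|         d|
+----------+
|2018-02-27|
+----------+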


-- 3. Group by uid and date_diff, take the start and end dates of each run,
--    and count the consecutive login days


select
  uid,
  min(dt) start_dt,
  max(dt) end_dt,
  count(date_diff) counts
from
(
  select
    uid,
    dt,
    date_sub(dt,flag) date_diff
  from
  (
    select 
      uid,
      dt,
      row_number() over(partition by uid order by dt) flag
    from
      v_uid_dt
  )
)
group by
  uid,
  date_diff

+------+----------+----------+------+
|   uid|  start_dt|    end_dt|counts|
+------+----------+----------+------+
|guid02|2018-03-01|2018-03-03|     3|
|guid02|2018-03-06|2018-03-06|     1|
|guid01|2018-02-28|2018-03-02|     3|
|guid01|2018-03-04|2018-03-07|     4|
+------+----------+----------+------+


-- 4. Filter: keep records with at least 3 consecutive login days

select
  uid,
  start_dt,
  end_dt,
  counts
from
(
  select
    uid,
    min(dt) start_dt,
    max(dt) end_dt,
    count(date_diff) counts
  from
  (
    select
      uid,
      dt,
      date_sub(dt,flag) date_diff
    from
    (
      select 
        uid,
        dt,
        row_number() over(partition by uid order by dt) flag
      from
        v_uid_dt
    )
  )
  group by
    uid,
    date_diff
)
where
  counts >= 3


+------+----------+----------+------+
|   uid|  start_dt|    end_dt|counts|
+------+----------+----------+------+
|guid02|2018-03-01|2018-03-03|     3|
|guid01|2018-02-28|2018-03-02|     3|
|guid01|2018-03-04|2018-03-07|     4|
+------+----------+----------+------+

2. DSL style

Notes:

  • Import the functions: DSL-style code uses many built-in functions, so import org.apache.spark.sql.functions._
  • Import the SparkSession implicits: import spark.implicits._
  • Functions that cannot be called directly can sometimes be replaced by an expr("...") fragment, a mixed style that embeds SQL in the DSL.
  • When calling window functions in DSL style, use Window (import org.apache.spark.sql.expressions.Window).
object ContinueLoginDSL {

  def main(args: Array[String]): Unit = {
    //create a SparkSession
    val spark: SparkSession = Utils.getSparkSession(this.getClass.getSimpleName)

    //read the data
    val df: DataFrame = spark.read.option("header", true).csv("C:\\Users\\logan\\Desktop\\testData\\作业-连续登录3天.csv")

    //DSL style
    //no view is needed; call methods on the DataFrame directly
    import spark.implicits._
    import org.apache.spark.sql.functions._
    import org.apache.spark.sql.expressions.Window  //needed for the window spec used below
    //select the uid and dt columns, then partition, sort, and tag each row with a row number
    df.select(
      $"uid",
      $"dt",
      row_number() over(Window.partitionBy("uid").orderBy("dt")) as "flag"
    ).select(
      //calling select again is like wrapping the previous query in an outer select: a subquery
      //use the date_sub function to compute the date offset
      $"uid",
      $"dt",
//      date_sub($"dt",$"flag") as "date_diff"
      //in older Spark versions the DSL date_sub only accepts an Int as its
      //second argument, so fall back to expr and mix in a SQL fragment
      expr("date_sub(dt,flag) as date_diff")
    ).groupBy("uid","date_diff")  //group first by uid, then by date_diff
      .agg(    //run the aggregations
        min("dt") as "start_dt",    //first day of the run
        max("dt") as "end_dt",      //last day of the run
        count("*") as "counts"      //number of consecutive days
      ).where($"counts" >= 3)          //keep runs of at least 3 consecutive login days
      .select(   //project the columns to display
        "uid",
        "start_dt",
        "end_dt",
        "counts"
      ).show()

    spark.stop()
  }

}
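
Running the DSL version prints the same result as the final SQL query (row order may vary):

+------+----------+----------+------+
|   uid|  start_dt|    end_dt|counts|
+------+----------+----------+------+
|guid02|2018-03-01|2018-03-03|     3|
|guid01|2018-02-28|2018-03-02|     3|
|guid01|2018-03-04|2018-03-07|     4|
+------+----------+----------+------+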

Personally, I find the DSL style a better fit for how programmers think, and once you are fluent with it, it is very convenient as well.
