案例 求连续登录三天以上的用户--SparkSQL实现(DSL风格)

部分数据展示
uid,dt
guid01,2018-03-01
guid01,2018-02-28
guid01,2018-03-01
guid01,2018-03-04
guid01,2018-03-02
guid01,2018-03-05
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-02
guid02,2018-03-03
guid02,2018-03-06
...
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

/**
 * Finds users who logged in on 3 or more consecutive days (Spark SQL, DSL style).
 *
 * Approach: after de-duplicating (uid, dt) pairs, number each user's login dates
 * in ascending order with row_number(). For a run of consecutive dates, the
 * difference (dt - row_number) is constant, so it serves as a streak identifier;
 * grouping by it and counting yields each streak's length and date range.
 */
object DSLContinuedLogin {
  def main(args: Array[String]): Unit = {
    // Local SparkSession; app name mirrors this object's class name.
    val spark = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    // Enables the $"col" interpolator for building Column references.
    import spark.implicits._

    // Login records with columns (uid, dt); first line of the file is a header.
    val logins = spark.read
      .option("header", true)
      .csv("data/data1.txt")

    // Per-user window, dates ascending, used to assign the sequence number rn.
    val byUserDateAsc = Window.partitionBy($"uid").orderBy($"dt")

    val result: DataFrame = logins
      .distinct() // a user may appear more than once per day; keep one row per (uid, dt)
      .select(
        $"uid",
        $"dt",
        row_number().over(byUserDateAsc).as("rn")
      )
      .select(
        $"uid",
        $"dt",
        // Constant within a run of consecutive dates -> streak key.
        date_sub($"dt", $"rn").as("date_dif")
      )
      .groupBy("uid", "date_dif")
      .agg(
        count("*").as("times"),        // streak length in days
        min("dt").as("start_date"),    // first day of the streak
        max("dt").as("end_date")       // last day of the streak
      )
      .where($"times" >= 3)            // keep streaks of 3+ consecutive days
      .drop("date_dif")                // internal key; not part of the answer

    result.show()
    spark.stop()
  }
}

过程解析

1、
  df.distinct()
      .select(
        $"uid",
        $"dt",
        row_number() over (Window.partitionBy($"uid").orderBy($"dt")) as ("rn")
        //        row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
      )
--执行结果
+------+----------+---+
|   uid|        dt| rn|
+------+----------+---+
|guid02|2018-03-01|  1|
|guid02|2018-03-02|  2|
|guid02|2018-03-03|  3|
|guid02|2018-03-06|  4|
|guid03|2018-03-05|  1|
|guid03|2018-03-06|  2|
|guid03|2018-03-07|  3|
|guid03|2018-03-08|  4|
|guid03|2018-03-09|  5|
|guid03|2018-03-10|  6|

2、
  df.distinct()
      .select(
        $"uid",
        $"dt",
        row_number() over (Window.partitionBy($"uid").orderBy($"dt")) as ("rn")
        //        row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
      ).select(
      $"uid",
      'dt,
      date_sub('dt, 'rn) as ("date_dif")
//      expr("date_sub(dt,rn) as date_dif")
      )
--执行结果
+------+----------+----------+
|   uid|        dt|  date_dif|
+------+----------+----------+
|guid02|2018-03-01|2018-02-28|
|guid02|2018-03-02|2018-02-28|
|guid02|2018-03-03|2018-02-28|
|guid02|2018-03-06|2018-03-02|
|guid03|2018-03-05|2018-03-04|
|guid03|2018-03-06|2018-03-04|
|guid03|2018-03-07|2018-03-04|
|guid03|2018-03-08|2018-03-04|
|guid03|2018-03-09|2018-03-04|

3、  
   df.distinct()
      .select(
        $"uid",
        $"dt",
        row_number() over (Window.partitionBy($"uid").orderBy($"dt")) as ("rn")
        //        row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
      ).select(
      $"uid",
      'dt,
      date_sub('dt, 'rn) as ("date_dif")
//      expr("date_sub(dt,rn) as date_dif")
    ).groupBy("uid", "date_dif")
      .agg(
        count("*") as "times",
        min("dt") as "start_date",
        max("dt") as "end_date"
      ).where($"times" >= 3)  
--执行结果
+------+----------+-----+----------+----------+
|   uid|  date_dif|times|start_date|  end_date|
+------+----------+-----+----------+----------+
|guid02|2018-02-28|    3|2018-03-01|2018-03-03|
|guid03|2018-03-04|    6|2018-03-05|2018-03-10|
|guid04|2018-03-05|    3|2018-03-08|2018-03-10|
|guid01|2018-02-27|    3|2018-02-28|2018-03-02|
|guid01|2018-02-28|    4|2018-03-04|2018-03-07|
+------+----------+-----+----------+----------+

4、
	df.distinct()
      .select(
        $"uid",
        $"dt",
        row_number() over (Window.partitionBy($"uid").orderBy($"dt")) as ("rn")
        //        row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
      ).select(
      $"uid",
      'dt,
      date_sub('dt, 'rn) as ("date_dif")
//      expr("date_sub(dt,rn) as date_dif") //可以传入sql语言中的一部分
    ).groupBy("uid", "date_dif")
      .agg(
        count("*") as "times",
        min("dt") as "start_date",
        max("dt") as "end_date"
      ).where($"times" >= 3)
      .drop("date_dif")
  
+------+-----+----------+----------+
|   uid|times|start_date|  end_date|
+------+-----+----------+----------+
|guid02|    3|2018-03-01|2018-03-03|
|guid03|    6|2018-03-05|2018-03-10|
|guid04|    3|2018-03-08|2018-03-10|
|guid01|    3|2018-02-28|2018-03-02|
|guid01|    4|2018-03-04|2018-03-07|
+------+-----+----------+----------+  
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值