import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
object DSLContinuedLogin {

  /**
   * Finds stretches of at least 3 consecutive login days per user, using the
   * classic "date minus row_number" trick: within each uid, consecutive dates
   * all map to the same anchor date (date_dif), so grouping by (uid, date_dif)
   * collapses each consecutive run into a single row.
   */
  def main(args: Array[String]): Unit = {
    val session = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    val df = session.read.option("header", true).csv("data/data1.txt")

    // Required so that plain strings can be turned into Columns
    // with the $ interpolator or the ' (Symbol) shorthand.
    import session.implicits._

    val res: DataFrame = df.distinct() // drop duplicate (uid, dt) login records
      .select(
        $"uid",
        $"dt",
        // Rank each user's login dates; equivalent spelling:
        // row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
        row_number().over(Window.partitionBy($"uid").orderBy($"dt")).as("rn"))
      .select(
        $"uid", 'dt,
        // Consecutive dates share the same (dt - rn) anchor date.
        // Equivalent: expr("date_sub(dt, rn) as date_dif") — expr() accepts a SQL fragment.
        date_sub('dt, 'rn).as("date_dif"))
      .groupBy("uid", "date_dif")
      .agg(
        count("*") as "times", // length of the consecutive run
        min("dt") as "start_date",
        max("dt") as "end_date")
      .where($"times" >= 3) // keep only runs of 3+ consecutive days
      .drop("date_dif")

    res.show()
    session.stop()
  }
}
过程解析 (step-by-step breakdown)
1、
df.distinct().select(
$"uid",
$"dt",
row_number() over (Window.partitionBy($"uid").orderBy($"dt")) as ("rn") // 或 row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
)
-- 执行结果:
+------+----------+---+
|   uid|        dt| rn|
+------+----------+---+
|guid02|2018-03-01|  1|
|guid02|2018-03-02|  2|
|guid02|2018-03-03|  3|
|guid02|2018-03-06|  4|
|guid03|2018-03-05|  1|
|guid03|2018-03-06|  2|
|guid03|2018-03-07|  3|
|guid03|2018-03-08|  4|
|guid03|2018-03-09|  5|
|guid03|2018-03-10|  6|

2、
df.distinct().select(
$"uid",
$"dt",
row_number() over (Window.partitionBy($"uid").orderBy($"dt")) as ("rn") // 或 row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
).select(
$"uid",'dt,
date_sub('dt, 'rn) as ("date_dif") // 或 expr("date_sub(dt,rn) as date_dif")
)
-- 执行结果:
+------+----------+----------+
|   uid|        dt|  date_dif|
+------+----------+----------+
|guid02|2018-03-01|2018-02-28|
|guid02|2018-03-02|2018-02-28|
|guid02|2018-03-03|2018-02-28|
|guid02|2018-03-06|2018-03-02|
|guid03|2018-03-05|2018-03-04|
|guid03|2018-03-06|2018-03-04|
|guid03|2018-03-07|2018-03-04|
|guid03|2018-03-08|2018-03-04|
|guid03|2018-03-09|2018-03-04|

3、
df.distinct().select(
$"uid",
$"dt",
row_number() over (Window.partitionBy($"uid").orderBy($"dt")) as ("rn") // 或 row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
).select(
$"uid",'dt,
date_sub('dt, 'rn) as ("date_dif") // 或 expr("date_sub(dt,rn) as date_dif")
).groupBy("uid","date_dif").agg(
count("*") as "times",
min("dt") as "start_date",
max("dt") as "end_date").where($"times">=3)
-- 执行结果:
+------+----------+-----+----------+----------+
|   uid|  date_dif|times|start_date|  end_date|
+------+----------+-----+----------+----------+
|guid02|2018-02-28|    3|2018-03-01|2018-03-03|
|guid03|2018-03-04|    6|2018-03-05|2018-03-10|
|guid04|2018-03-05|    3|2018-03-08|2018-03-10|
|guid01|2018-02-27|    3|2018-02-28|2018-03-02|
|guid01|2018-02-28|    4|2018-03-04|2018-03-07|
+------+----------+-----+----------+----------+

4、
df.distinct().select(
$"uid",
$"dt",
row_number() over (Window.partitionBy($"uid").orderBy($"dt")) as ("rn") // 或 row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
).select(
$"uid",'dt,
date_sub('dt, 'rn) as ("date_dif") // 或 expr("date_sub(dt,rn) as date_dif") —— expr() 可以传入 SQL 语言中的一部分
).groupBy("uid","date_dif").agg(
count("*") as "times",
min("dt") as "start_date",
max("dt") as "end_date").where($"times">=3).drop("date_dif")
-- 执行结果:
+------+-----+----------+----------+
|   uid|times|start_date|  end_date|
+------+-----+----------+----------+
|guid02|    3|2018-03-01|2018-03-03|
|guid03|    6|2018-03-05|2018-03-10|
|guid04|    3|2018-03-08|2018-03-10|
|guid01|    3|2018-02-28|2018-03-02|
|guid01|    4|2018-03-04|2018-03-07|
+------+-----+----------+----------+