uid,dt
guid01,2018-02-28
guid01,2018-02-28
guid01,2018-03-01
guid01,2018-03-02
guid01,2018-03-05
guid01,2018-03-05
guid01,2018-03-04
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-02
guid02,2018-03-03
guid02,2018-03-06
guid02,2018-03-02
guid02,2018-03-03
guid02,2018-03-06
Requirement analysis walkthrough
Requirement: for each user, find every streak of at least 3 consecutive login days, together with the streak's start date, end date, and length.
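To follow the SQL steps along in spark-shell (where the spark session is predefined), the sample rows can be registered as the t_user view queried below; a minimal sketch, assuming the data is saved as data/data1.txt, the same path the full programs at the end use:

val df1 = spark.read
  .option("header", "true")   // first line is the uid,dt header
  .csv("data/data1.txt")      // the sample rows shown above

df1.createTempView("t_user")  // the view the SQL below queries as t_user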
-- Step 1: remove duplicate rows from the table
select
  distinct(uid, dt),
  uid,
  dt
from
  t_user

+------------------------------+------+----------+
|named_struct(uid, uid, dt, dt)|   uid|        dt|
+------------------------------+------+----------+
|          [guid02, 2018-03-02]|guid02|2018-03-02|
|          [guid01, 2018-03-06]|guid01|2018-03-06|
|          [guid01, 2018-03-01]|guid01|2018-03-01|
|          [guid01, 2018-02-28]|guid01|2018-02-28|
|          [guid02, 2018-03-01]|guid02|2018-03-01|
|          [guid01, 2018-03-02]|guid01|2018-03-02|
|          [guid01, 2018-03-05]|guid01|2018-03-05|
|          [guid01, 2018-03-07]|guid01|2018-03-07|
|          [guid02, 2018-03-06]|guid02|2018-03-06|
|          [guid02, 2018-03-03]|guid02|2018-03-03|
|          [guid01, 2018-03-04]|guid01|2018-03-04|
+------------------------------+------+----------+
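As an aside, the same deduplication is a one-liner in the DataFrame API; a sketch continuing the spark-shell setup above (uid and dt are the only two columns, so whole-row distinct is equivalent to deduplicating on the pair):

val dedup  = df1.distinct()                    // whole-row distinct
val dedup2 = df1.dropDuplicates("uid", "dt")   // explicit about the key columns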
-- Step 2: use the row_number() window function to attach an ordered sequence number per uid
select
  uid,
  dt,
  row_number() over(partition by uid order by dt) rn
from
(
  select
    distinct(uid, dt),
    uid,
    dt
  from
    t_user
) t1

+------+----------+---+
|   uid|        dt| rn|
+------+----------+---+
|guid02|2018-03-01|  1|
|guid02|2018-03-02|  2|
|guid02|2018-03-03|  3|
|guid02|2018-03-06|  4|
|guid01|2018-02-28|  1|
|guid01|2018-03-01|  2|
|guid01|2018-03-02|  3|
|guid01|2018-03-04|  4|
|guid01|2018-03-05|  5|
|guid01|2018-03-06|  6|
|guid01|2018-03-07|  7|
+------+----------+---+
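Note that the deduplication in step 1 is essential for what follows: a duplicated (uid, dt) row would still receive its own rn while dt stayed put, so the difference computed next would shift partway through a streak and corrupt its boundaries.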
-- Step 3: subtract rn from dt to derive a grouping key. Within a streak of daily
-- logins, dt advances by one day exactly as rn advances by one, so the difference
-- stays constant; a missed day shifts it.
select
  uid,
  dt,
  date_sub(dt, rn) data_dif -- difference between the date and the row number
from
(
  select
    uid,
    dt,
    row_number() over(partition by uid order by dt) rn
  from
  (
    select
      distinct(uid, dt),
      uid,
      dt
    from
      t_user
  ) t1
) t2

+------+----------+----------+
|   uid|        dt|  data_dif|
+------+----------+----------+
|guid02|2018-03-01|2018-02-28|
|guid02|2018-03-02|2018-02-28|
|guid02|2018-03-03|2018-02-28|
|guid02|2018-03-06|2018-03-02|
|guid01|2018-02-28|2018-02-27|
|guid01|2018-03-01|2018-02-27|
|guid01|2018-03-02|2018-02-27|
|guid01|2018-03-04|2018-02-28|
|guid01|2018-03-05|2018-02-28|
|guid01|2018-03-06|2018-02-28|
|guid01|2018-03-07|2018-02-28|
+------+----------+----------+
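The table above can be spot-checked without Spark; a plain-Scala sketch of the same arithmetic, using guid01's first four distinct dates:

import java.time.LocalDate

// Within a run of consecutive days, dt minus rn stays constant;
// a gap advances dt by more than rn and shifts the anchor date.
val dates = Seq("2018-02-28", "2018-03-01", "2018-03-02", "2018-03-04")
dates.map(s => LocalDate.parse(s)).zipWithIndex.foreach { case (dt, i) =>
  val rn = i + 1                                // row_number() is 1-based
  println(s"$dt - $rn day(s) = ${dt.minusDays(rn)}")
}
// 2018-02-28 - 1 day(s) = 2018-02-27
// 2018-03-01 - 2 day(s) = 2018-02-27   <- same anchor: consecutive
// 2018-03-02 - 3 day(s) = 2018-02-27
// 2018-03-04 - 4 day(s) = 2018-02-28   <- the gap on 03-03 shifts the anchor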
-- Step 4: group by uid and data_dif to count the consecutive login days,
-- and take the first and last login date of each streak
select
  uid,
  min(dt) start_time,
  max(dt) end_time,
  count(*) times
from
(
  select
    uid,
    dt,
    date_sub(dt, rn) data_dif -- difference between the date and the row number
  from
  (
    select
      uid,
      dt,
      row_number() over(partition by uid order by dt) rn
    from
    (
      select
        distinct(uid, dt),
        uid,
        dt
      from
        t_user
    ) t1
  ) t2
) t3
group by uid, data_dif having times >= 3
+------+----------+----------+-----+
|   uid|start_time|  end_time|times|
+------+----------+----------+-----+
|guid02|2018-03-01|2018-03-03|    3|
|guid01|2018-02-28|2018-03-02|    3|
|guid01|2018-03-04|2018-03-07|    4|
+------+----------+----------+-----+
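With the walkthrough confirmed, the same query can be packaged as a self-contained Spark application in two styles: plain SQL against a temp view, or the DataFrame DSL.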
1. Using SQL
import org.apache.spark.sql.SparkSession

object SQLContinuedLogin {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    val df1 = spark.read
      .option("header", "true")
      .csv("data/data1.txt")

    df1.createTempView("v_user_login")

    spark.sql(
      s"""
         |select
         |  uid,
         |  min(dt) start_date,
         |  max(dt) end_date,
         |  count(*) times
         |from
         |(
         |  select
         |    uid,
         |    dt,
         |    date_sub(dt, rn) date_dif -- difference between the date and the row number
         |  from
         |  (
         |    select
         |      uid,
         |      dt,
         |      row_number() over(partition by uid order by dt asc) rn
         |    from
         |    (
         |      select
         |        distinct(uid, dt), uid, dt
         |      from
         |        v_user_login
         |    )
         |  )
         |) group by uid, date_dif having times >= 3
         |""".stripMargin).show()

    spark.stop()
  }
}
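Run against the sample file from the setup sketch at the top, this should print the same three streaks as the manual walkthrough (with columns named start_date/end_date rather than start_time/end_time); row order may differ, since no ORDER BY is applied.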
2. Using the DSL
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window

object DSLContinuedLogin {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName(this.getClass.getSimpleName)
      .master("local[*]")
      .getOrCreate()

    val df1 = spark.read
      .option("header", "true")
      .csv("data/data1.txt")

    import spark.implicits._
    import org.apache.spark.sql.functions._

    val res = df1.distinct()
      .select(
        $"uid",
        $"dt",
        row_number().over(Window.partitionBy("uid").orderBy("dt")).as("rn")
      ).select(
        $"uid",
        $"dt",
        // date_sub('dt, 'rn) is not supported in Spark 2.x (it is in Spark 3.x),
        // so the column-valued offset goes through a SQL expression instead:
        expr("date_sub(dt, rn) as date_dif")
      ).groupBy(
        "uid", "date_dif"
      ).agg(
        count("*") as "times",
        min("dt") as "start_date",
        max("dt") as "end_date"
      ).where($"times" >= 3)
      .drop("date_dif")

    res.show()
    spark.stop()
  }
}
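A note on the expr("date_sub(dt, rn)") workaround above: through Spark 2.x, date_sub in org.apache.spark.sql.functions only accepts a literal Int for the day count, so a column-valued offset has to be pushed down into a SQL expression string. Spark 3.x adds a date_sub(Column, Column) overload, so date_sub($"dt", $"rn") works directly there, as the comment in the code suggests.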