uid,dt
guid01,2018-02-28
guid01,2018-03-01
guid01,2018-03-02
guid01,2018-03-05
guid01,2018-03-04
guid01,2018-03-06
guid01,2018-03-07
guid02,2018-03-01
guid02,2018-03-02
guid02,2018-03-03
guid02,2018-03-06
Requirement: find users who logged in on three or more consecutive days.
The idea: deduplicate (uid, dt), number each user's dates with row_number(); for consecutive dates the difference date - row_number is constant, so grouping by that difference isolates each streak.
select
--deduplicate (uid, dt) pairs, since a user may log in several times a day
distinct(uid,dt) dis
from
v_user_login
Result of the previous step (output truncated):
+--------------------+
|                 dis|
+--------------------+
|[guid02, 2018-03-02]|
|[guid01, 2018-03-01]|
|[guid01, 2018-03-06]|
|[guid01, 2018-03-07]|
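An aside on the syntax: Spark parses `distinct(uid,dt)` as DISTINCT applied to the struct `(uid, dt)`, which is why the output above is a single struct-typed column that later needs the `t1.dis.uid` field access. A plain `SELECT DISTINCT uid, dt` keeps two ordinary columns and would make that extraction step unnecessary; a minimal sketch against the same view:

```scala
//deduplicate while keeping uid and dt as separate columns
//(assumes the v_user_login view and an active `spark` session, as in the demos below)
spark.sql("select distinct uid, dt from v_user_login").show()
```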
select
--pull the fields out of the generated struct and give them aliases
t1.dis.uid uid,
t1.dis.dt dt
from(
select
--deduplicate (uid, dt) pairs, since a user may log in several times a day
distinct(uid,dt) dis
from
v_user_login
)t1
Assign row numbers within each window
select
uid,
dt,
--partition by uid, order by dt ascending; row_number() over(...) numbers the rows within each partition
row_number() over(partition by uid order by dt asc) rn
from(
select
--pull the fields out of the generated struct and give them aliases
t1.dis.uid uid,
t1.dis.dt dt
from(
select
--deduplicate (uid, dt) pairs, since a user may log in several times a day
distinct(uid,dt) dis
from
v_user_login
)t1
)
Result of the previous step (only guid02 shown):
+------+----------+---+
|   uid|        dt| rn|
+------+----------+---+
|guid02|2018-03-01| 1|
|guid02|2018-03-02| 2|
|guid02|2018-03-03| 3|
|guid02|2018-03-06| 4|
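To see what row_number() does in isolation, here is a minimal self-contained sketch (the tiny dataset is made up for illustration):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.row_number

val spark = SparkSession.builder().appName("rn-demo").master("local[*]").getOrCreate()
import spark.implicits._

val demo = Seq(("guid02", "2018-03-01"), ("guid02", "2018-03-02"), ("guid02", "2018-03-03")).toDF("uid", "dt")
//within each uid partition, rows ordered by dt are numbered 1, 2, 3, ...
demo.withColumn("rn", row_number().over(Window.partitionBy("uid").orderBy("dt"))).show()
```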
Subtract the row number from the date
select
uid,
dt,
--date_sub(date, n) subtracts n days from the date
date_sub(dt,rn) date_dif -- difference between the date and the row number
from(
select
uid,
dt,
--partition by uid, order by dt ascending; row_number() over(...) numbers the rows within each partition
row_number() over(partition by uid order by dt asc) rn
from(
select
--pull the fields out of the generated struct and give them aliases
t1.dis.uid uid,
t1.dis.dt dt
from(
select
--deduplicate (uid, dt) pairs, since a user may log in several times a day
distinct(uid,dt) dis
from
v_user_login
)t1
)
)
Result of the previous step (only guid02 shown):
+------+----------+----------+
|   uid|        dt|  date_dif|
+------+----------+----------+
|guid02|2018-03-01|2018-02-28|
|guid02|2018-03-02|2018-02-28|
|guid02|2018-03-03|2018-02-28|
|guid02|2018-03-06|2018-03-02|
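date_sub itself is easy to sanity-check with a one-off query (assuming an active `spark` session):

```scala
//date_sub(date, n) shifts the date back by n days
spark.sql("select date_sub('2018-03-01', 1) as d").show() //2018-02-28
```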
select
uid,
min(dt) start_date, -- the min and max dates mark the first and last day of a streak
max(dt) end_date,
count(*) times
from(
select
uid,
dt,
--date_sub(date, n) subtracts n days from the date
date_sub(dt,rn) date_dif -- difference between the date and the row number
from(
select
uid,
dt,
--partition by uid, order by dt ascending; row_number() over(...) numbers the rows within each partition
row_number() over(partition by uid order by dt asc) rn
from(
select
--pull the fields out of the generated struct and give them aliases
t1.dis.uid uid,
t1.dis.dt dt
from(
select
--deduplicate (uid, dt) pairs, since a user may log in several times a day
distinct(uid,dt) dis
from
v_user_login
)t1
)
)--group by uid and date_dif; keep only streaks of three or more days
)group by uid ,date_dif having times >= 3
The final code as written in IDEA:
import org.apache.spark.sql.SparkSession
object SQLContinuedLogin {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName(this.getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
val df1 = spark.read
.option("header", "true")
.csv("D:\\ideawork\\spark\\src\\main\\scala\\cn\\_51doit\\demo05DateFrame\\data1.txt")
//a temp view gives the DataFrame a name so it can be queried with SQL
df1.createTempView("v_user_login")
spark.sql(
s"""
select
|uid,
|min(dt) start_date, -- the min and max dates mark the first and last day of a streak
|max(dt) end_date,
|count(*) times
|from(
| select
| uid,
| dt,
| --date_sub(date, n) subtracts n days from the date
| date_sub(dt,rn) date_dif -- difference between the date and the row number
| from(
| select
| uid,
| dt,
| --partition by uid, order by dt ascending; row_number() numbers the rows within each partition
| row_number() over(partition by uid order by dt asc) rn
| from(
| select
| --pull the fields out of the generated struct and give them aliases
| t1.dis.uid uid,
| t1.dis.dt dt
| from(
| select
| --deduplicate (uid, dt) pairs, since a user may log in several times a day
| distinct(uid,dt) dis
| from
| v_user_login
| )t1
| )
| )--group by uid and date_dif; keep only streaks of three or more days
|)group by uid ,date_dif having times >= 3
|""".stripMargin).show()
spark.stop()
}
}
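For the sample data at the top, this should report three streaks (show() row order may vary): guid01 from 2018-02-28 to 2018-03-02 with times 3, guid01 from 2018-03-04 to 2018-03-07 with times 4, and guid02 from 2018-03-01 to 2018-03-03 with times 3.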
The same problem in DSL style:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
object DSLContinuedLogin {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName(this.getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
val df1 = spark.read
.option("header", "true")
.csv("D:\\ideawork\\spark\\src\\main\\scala\\cn\\_51doit\\demo05DateFrame\\data1.txt")
import spark.implicits._ //two imports: implicit conversions (Symbol/$ column syntax) and all built-in SQL functions
import org.apache.spark.sql.functions._
val res = df1.distinct()
.select(
//the leading single quote (Scala Symbol syntax) turns a name into a column reference
'uid,
'dt,
row_number().over(Window.partitionBy('uid).orderBy('dt)) as('rn)
).select(
'uid,
'dt,
//expr() embeds a SQL expression; handy when a SQL function has no direct DSL counterpart
expr("date_sub(dt,rn) as date_dif")
).groupBy(
"uid","date_dif"
).agg(//agg applies aggregate functions per group: count the rows, take the min/max of dt
count("*") as "times",
min("dt") as "start_date",
max("dt") as "end_date"
).where($"times" >= 3)
.drop("date_dif")
res.show()
spark.stop()
}
}
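A note on the Symbol syntax: newer Spark versions favor $"uid" or col("uid") over 'uid. A sketch of the first select rewritten that way:

```scala
//same step using $-interpolated column references instead of Symbols
df1.distinct().select(
  $"uid",
  $"dt",
  row_number().over(Window.partitionBy($"uid").orderBy($"dt")) as "rn"
)
```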
Another case study
sid,dt,money
shop1,2019-1-18,500
shop1,2019-2-10,500
shop1,2019-2-10,200
shop1,2019-2-11,600
shop1,2019-2-12,400
shop1,2019-2-13,200
shop1,2019-2-15,100
shop1,2019-3-5,180
shop1,2019-4-5,280
shop1,2019-4-6,220
shop2,2019-2-10,100
shop2,2019-2-11,100
shop2,2019-2-13,100
shop2,2019-3-15,100
shop2,2019-4-15,100
--step 1: truncate the date to year-month and cast the amount to double
select
sid,
--substr(str, pos, len) slices a string: argument one is the column, two is the start position, three is the length
substr(dt,0,7) mth,
--cast(expr as type) converts the value before "as" to the type after it
cast(money as double) money
from v_stop
--Result of the previous step:
+-----+-------+-----+
| sid| mth|money|
+-----+-------+-----+
|shop1|2019-01|500.0|
|shop1|2019-02|500.0|
|shop1|2019-02|200.0|
|shop1|2019-02|600.0|
|shop1|2019-02|400.0|
|shop1|2019-02|200.0|
|shop1|2019-02|100.0|
|shop1|2019-03|180.0|
|shop1|2019-04|280.0|
|shop1|2019-04|220.0|
|shop2|2019-02|100.0|
|shop2|2019-02|100.0|
|shop2|2019-02|100.0|
|shop2|2019-03|100.0|
|shop2|2019-04|100.0|
+-----+-------+-----+
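Both helper functions can be checked with one-off queries (assuming an active `spark` session; note that in Spark SQL a start position of 0 or 1 both mean the first character):

```scala
spark.sql("select substr('2019-02-10', 1, 7) as mth").show() //2019-02
spark.sql("select cast('500' as double) as money").show()    //500.0
```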
--aggregate rows with the same sid and mth to get each shop's monthly total
select
sid,
mth,
sum(money) mth_income
from(
select
sid,
--substr(str, pos, len) slices a string: argument one is the column, two is the start position, three is the length
substr(dt,0,7) mth,
--cast(expr as type) converts the value before "as" to the type after it
cast(money as double) money
from v_stop
)t1 group by sid , mth
--Result of the previous step:
+-----+-------+----------+
| sid| mth|mth_income|
+-----+-------+----------+
|shop2|2019-04| 100.0|
|shop2|2019-03| 100.0|
|shop1|2019-01| 500.0|
|shop1|2019-03| 180.0|
|shop2|2019-02| 300.0|
|shop1|2019-04| 500.0|
|shop1|2019-02| 2000.0|
+-----+-------+----------+
select
sid,
mth,
--window per sid ordered by mth; running sum of mth_income from the first row to the current row
sum(mth_income) over(partition by sid order by mth rows between unbounded preceding and current row) total_money
from(
select
sid,
mth,
sum(money) mth_income
from(
select
sid,
--substr(str, pos, len) slices a string: argument one is the column, two is the start position, three is the length
substr(dt,0,7) mth,
--cast(expr as type) converts the value before "as" to the type after it
cast(money as double) money
from v_stop
)t1 group by sid , mth
)t2
+-----+-------+-----------+
| sid| mth|total_money|
+-----+-------+-----------+
|shop2|2019-02| 300.0|
|shop2|2019-03| 400.0|
|shop2|2019-04| 500.0|
|shop1|2019-01| 500.0|
|shop1|2019-02| 2500.0|
|shop1|2019-03| 2680.0|
|shop1|2019-04| 3180.0|
+-----+-------+-----------+
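A side note on the frame clause: when a window has an ORDER BY but no explicit frame, Spark defaults to RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, so dropping the `rows between ...` part should give the same running total here, since each (sid, mth) pair occurs only once after the group by.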
The final code in IDEA:
import org.apache.spark.sql.SparkSession
object SQLShopIncomeDemo{
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName(this.getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
val df = spark.read
.option("header", "true")
.csv("D:\\ideawork\\spark\\src\\main\\scala\\cn\\_51doit\\demo05DateFrame\\shop.csv")
df.createTempView("v_stop")
spark.sql(
s"""
select
|sid,
|mth,
|--window per sid ordered by mth; running sum of mth_income from the first row to the current row
|sum(mth_income) over(partition by sid order by mth rows between unbounded preceding and current row) total_money
|from(
| select
| sid,
| mth,
| sum(money) mth_income
| from(
| select
| sid,
| --substr(str, pos, len) slices a string: column, start position, length
| substr(dt,0,7) mth,
| --cast(expr as type) converts the value to the given type
| cast(money as double) money
| from v_stop
| )t1 group by sid , mth
|)t2
|
|""".stripMargin).show()
spark.stop()
}
}
The DSL-style code:
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.types.DataTypes
object DSLShopIncomeDemo {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName(this.getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
val df = spark.read
.option("header", "true")
.csv("D:\\ideawork\\spark\\src\\main\\scala\\cn\\_51doit\\demo05DateFrame\\shop.csv")
import spark.implicits._
import org.apache.spark.sql.functions._
df.select(
'sid,
//format the date string down to year-month
date_format('dt,"yyyy-MM") as 'mth,
//cast the money column to double; the leading quote turns the name into a column reference
'money.cast(DataTypes.DoubleType) as 'money
).groupBy(
'sid,
$"mth"
//sum money per group, then rename the generated column
).sum("money").withColumnRenamed(
"sum(money)",
"mth_money"
).select(
'sid,
'mth,
'mth_money,
//window per sid ordered by mth; running sum from the first row to the current row
sum("mth_money") over(Window.partitionBy("sid").orderBy("mth").rowsBetween(Window.unboundedPreceding,Window.currentRow)) as "total_money"
).show()
spark.stop()
// val res: DataFrame = df.map(row => {
//   //read each field of the row by index
//   val sid = row.getString(0)
//   val dt = row.getString(1)
//   //take the first 7 characters of the date string (year-month)
//   val mth = dt.substring(0, 7)
//   //convert the amount string to Double
//   val money = row.getString(2).toDouble
//   //return a tuple
//   (sid, mth, money)
// }).toDF("sid", "mth", "money") //turn the tuples into a DataFrame and name the columns
// res.show()
// res.printSchema()
}
}
## DSL-style WordCount
```scala
import org.apache.spark.sql.{Dataset, SparkSession}
object DataSetWordCount {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()
val lines = spark.read.textFile("D:\\bbb\\data\\words.txt")
//import Spark's implicit conversions
import spark.implicits._
val words: Dataset[String] = lines.flatMap(e => e.split(" "))
//a Dataset[String] exposes a single column named "value" by default
val res = words.groupBy("value").count().orderBy($"count".desc).withColumnRenamed("count", "counts")
res.collect() //note: collect() runs an extra job just to pull rows to the driver; show() below is enough for display
res.show()
res.printSchema()
spark.stop()
}
}
```
Internet traffic case: records less than 10 minutes apart count as one online session
uid,start_time,end_time,flow
1,2020-02-18 14:20:30,2020-02-18 14:46:30,20
1,2020-02-18 14:47:20,2020-02-18 15:20:30,30
1,2020-02-18 15:37:23,2020-02-18 16:05:26,40
1,2020-02-18 16:06:27,2020-02-18 17:20:49,50
1,2020-02-18 17:21:50,2020-02-18 18:03:27,60
2,2020-02-18 14:18:24,2020-02-18 15:01:40,20
2,2020-02-18 15:20:49,2020-02-18 15:30:24,30
2,2020-02-18 16:01:23,2020-02-18 16:40:32,40
2,2020-02-18 16:44:56,2020-02-18 17:40:52,50
3,2020-02-18 14:39:58,2020-02-18 15:35:53,20
3,2020-02-18 15:36:39,2020-02-18 15:24:54,30
select
uid,
start_time,
end_time,
flow,
--partition by uid; lag pulls the previous row's end_time down onto the current row
lag(end_time,1) over(partition by uid order by start_time) lag_time
from v_flow
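For reference, a DSL sketch of the same lag step (assuming a DataFrame `df` holding the v_flow columns): each row receives the previous row's end_time within its uid partition, and the first row of each partition gets null.

```scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.lag

//hypothetical `df` stands in for the v_flow data
val withLag = df.withColumn(
  "lag_time",
  lag("end_time", 1).over(Window.partitionBy("uid").orderBy("start_time"))
)
```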
select
uid,
start_time,
end_time,
flow,
--gap = start_time minus the previous end_time (converted to unix timestamps); return 1 if it exceeds 10 minutes, else 0
if((to_unix_timestamp(start_time) - to_unix_timestamp(lag_time)) / 60 >10 ,1,0) flag
from(
select
uid,
start_time,
end_time,
flow,
--partition by uid; lag pulls the previous row's end_time down onto the current row
lag(end_time,1) over(partition by uid order by start_time) lag_time
from v_flow
)t1
+---+-------------------+-------------------+----+----+
|uid| start_time| end_time|flow|flag|
+---+-------------------+-------------------+----+----+
| 3|2020-02-18 14:39:58|2020-02-18 15:35:53| 20| 0|
| 3|2020-02-18 15:36:39|2020-02-18 15:24:54| 30| 0|
| 1|2020-02-18 14:20:30|2020-02-18 14:46:30| 20| 0|
| 1|2020-02-18 14:47:20|2020-02-18 15:20:30| 30| 0|
| 1|2020-02-18 15:37:23|2020-02-18 16:05:26| 40| 1|
| 1|2020-02-18 16:06:27|2020-02-18 17:20:49| 50| 0|
| 1|2020-02-18 17:21:50|2020-02-18 18:03:27| 60| 0|
| 2|2020-02-18 14:18:24|2020-02-18 15:01:40| 20| 0|
| 2|2020-02-18 15:20:49|2020-02-18 15:30:24| 30| 1|
| 2|2020-02-18 16:01:23|2020-02-18 16:40:32| 40| 1|
| 2|2020-02-18 16:44:56|2020-02-18 17:40:52| 50| 0|
+---+-------------------+-------------------+----+----+
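The gap arithmetic can be verified on a pair of literal timestamps; these values come from guid 1's flagged row above (assuming an active `spark` session):

```scala
spark.sql(
  """select (to_unix_timestamp('2020-02-18 15:37:23')
    |      - to_unix_timestamp('2020-02-18 15:20:30')) / 60 as gap_minutes""".stripMargin
).show() //about 16.88 minutes, i.e. > 10, so flag = 1
```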
select
uid,
start_time,
end_time,
flow,
--partition by uid, order by start_time; running sum of flag: rows sharing the same sum belong to one session
sum(flag) over(partition by uid order by start_time asc) sum_flag
from(
select
uid,
start_time,
end_time,
flow,
--gap = start_time minus the previous end_time (converted to unix timestamps); return 1 if it exceeds 10 minutes, else 0
if((to_unix_timestamp(start_time) - to_unix_timestamp(lag_time)) / 60 >10 ,1,0) flag
from(
select
uid,
start_time,
end_time,
flow,
--partition by uid; lag pulls the previous row's end_time down onto the current row
lag(end_time,1) over(partition by uid order by start_time) lag_time
from v_flow
)t1
)t2
+---+-------------------+-------------------+----+--------+
|uid| start_time| end_time|flow|sum_flag|
+---+-------------------+-------------------+----+--------+
| 3|2020-02-18 14:39:58|2020-02-18 15:35:53| 20| 0|
| 3|2020-02-18 15:36:39|2020-02-18 15:24:54| 30| 0|
| 1|2020-02-18 14:20:30|2020-02-18 14:46:30| 20| 0|
| 1|2020-02-18 14:47:20|2020-02-18 15:20:30| 30| 0|
| 1|2020-02-18 15:37:23|2020-02-18 16:05:26| 40| 1|
| 1|2020-02-18 16:06:27|2020-02-18 17:20:49| 50| 1|
| 1|2020-02-18 17:21:50|2020-02-18 18:03:27| 60| 1|
| 2|2020-02-18 14:18:24|2020-02-18 15:01:40| 20| 0|
| 2|2020-02-18 15:20:49|2020-02-18 15:30:24| 30| 1|
| 2|2020-02-18 16:01:23|2020-02-18 16:40:32| 40| 2|
| 2|2020-02-18 16:44:56|2020-02-18 17:40:52| 50| 2|
+---+-------------------+-------------------+----+--------+
select
uid,
min(start_time) start_time,
max(end_time) end_time,
sum(flow) total_flow
from(
select
uid,
start_time,
end_time,
flow,
--partition by uid, order by start_time; running sum of flag: rows sharing the same sum belong to one session
sum(flag) over(partition by uid order by start_time asc) sum_flag
from(
select
uid,
start_time,
end_time,
flow,
--gap = start_time minus the previous end_time (converted to unix timestamps); return 1 if it exceeds 10 minutes, else 0
if((to_unix_timestamp(start_time) - to_unix_timestamp(lag_time)) / 60 >10 ,1,0) flag
from(
select
uid,
start_time,
end_time,
flow,
--partition by uid; lag pulls the previous row's end_time down onto the current row
lag(end_time,1) over(partition by uid order by start_time) lag_time
from v_flow
)t1
)t2
)t3 group by uid ,sum_flag
+---+-------------------+-------------------+----------+
|uid| start_time| end_time|total_flow|
+---+-------------------+-------------------+----------+
| 3|2020-02-18 14:39:58|2020-02-18 15:35:53| 50.0|
| 1|2020-02-18 14:20:30|2020-02-18 15:20:30| 50.0|
| 1|2020-02-18 15:37:23|2020-02-18 18:03:27| 150.0|
| 2|2020-02-18 14:18:24|2020-02-18 15:01:40| 20.0|
| 2|2020-02-18 15:20:49|2020-02-18 15:30:24| 30.0|
| 2|2020-02-18 16:01:23|2020-02-18 17:40:52| 90.0|
+---+-------------------+-------------------+----------+
The complete code in IDEA:
import org.apache.spark.sql.SparkSession
object SQLFlowRollupDemo {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder()
.appName(this.getClass.getSimpleName)
.master("local[*]")
.getOrCreate()
val df = spark.read
.option("header", "true")
.csv("D:\\ideawork\\spark\\src\\main\\scala\\cn\\_51doit\\demo06\\data.csv")
df.createTempView("v_flow")
spark.sql(
s"""
select
|uid,
|min(start_time) start_time,
|max(end_time) end_time,
|sum(flow) total_flow
|from(
| select
| uid,
| start_time,
| end_time,
| flow,
| --partition by uid, order by start_time; running sum of flag: rows sharing the same sum belong to one session
| sum(flag) over(partition by uid order by start_time asc) sum_flag
| from(
| select
| uid,
| start_time,
| end_time,
| flow,
| --gap = start_time minus the previous end_time (as unix timestamps); 1 if over 10 minutes, else 0
| if((to_unix_timestamp(start_time) - to_unix_timestamp(lag_time)) / 60 >10 ,1,0) flag
| from(
| select
| uid,
| start_time,
| end_time,
| flow,
| --partition by uid; lag pulls the previous row's end_time down onto the current row
| lag(end_time,1) over(partition by uid order by start_time) lag_time
| from v_flow
| )t1
| )t2
|)t3 group by uid ,sum_flag
|
|
|""".stripMargin).show()
spark.stop()
}
}
Joining two tables
import org.apache.spark.sql.{DataFrame, SparkSession}
object JoinDemo {
def main(args: Array[String]): Unit = {
val spark = SparkSession.builder().appName(this.getClass.getSimpleName).master("local[*]").getOrCreate()
val orderDF: DataFrame = spark.read.json("D:\\ideawork\\spark\\src\\main\\scala\\cn\\_51doit\\demo06\\ordre.log")
val categoryDF: DataFrame = spark.read
.option("header","true")
.csv("D:\\ideawork\\spark\\src\\main\\scala\\cn\\_51doit\\demo06\\category.txt")
import spark.implicits._
//join the two DataFrames where order.cid equals category.id
val joined = orderDF.join(categoryDF, $"cid" === $"id")
val filtered = joined.where($"id" >= 3)
filtered.show()
spark.stop()
}
}
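One usage note: `$"cid" === $"id"` only resolves unambiguously because the two DataFrames share no column names. If they did overlap, the columns can be qualified through their parent frames, as in this sketch:

```scala
//disambiguate by referencing columns via their source DataFrames
val joined2 = orderDF.join(categoryDF, orderDF("cid") === categoryDF("id"))
```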