over() 指定函数工作的数据窗口大小
partition by ... 按照...分区
rows between ... and ... 指定运算范围
unbounded preceding 起始行
unbounded following 终止行
n preceding 从前n行数据开始
n following 到后n行数据
例如: rows between 1 preceding and 1 following 从前一行数据到后一行数据 总共三行数据
lag(cloumn,n) 前n行数数据
例如: lag(ctime , 1)
1. 列出连续多天有订单的店铺
1.1 数据
店铺名,日期,订单金额
a,2020-02-10,600
a,2020-03-01,200
a,2020-03-02,300
a,2020-03-03,200
a,2020-03-04,400
a,2020-03-05,600
a,2020-02-05,200
a,2020-02-06,300
a,2020-02-07,200
a,2020-02-08,400
b,2020-02-05,200
b,2020-02-06,300
b,2020-02-08,200
b,2020-02-09,400
b,2020-02-10,600
c,2020-01-31,200
c,2020-02-01,300
c,2020-02-02,200
c,2020-02-03,400
c,2020-02-10,600
1.2 需求 :
统计连续三天有订单的店铺,列出店铺名
1.3 步骤:
1. 创建表---加载数据
---创建表
create table tb_shop(
name string,
ctime string,
cost double
)
row format delimited fields terminated by ',';
---加载本地数据到表中
load data local inpath '/doit17/orders.txt' into table tb_shop;
2. 为了判断数据是连续三天的---按照店铺名分区---对日期进行排序并编号
为了得到结果更快 , 本地运行mapreduce程序 , 不提交Yarn
set mapreduce.framework.name=local;
查看所有函数
show functions;
查看具体函数的用法
desc function 函数;
select
*,
row_number() over(partition by name order by ctime)
from
tb_shop;
---row_number() 给每行数据编号
---partition by name 按照店铺名分区
---order by ctime 相同店铺名按照日期排序
3. 使用函数date_sub将日期与编号相减 , 连续几个结果相同即为连续几天有订单
date_sub(start_date, num_days) - Returns the date that is num_days before start_date.
select
*,
date_sub(ctime , rn) diff
from
(
select
*,
row_number() over(partition by name order by ctime) rn ---对编号起别名
from
tb_shop
) t1 ---对子查询起别名
;
4. 按照店铺名和diff进行分组,并统计组内个数---店铺名相同,diff相同为一组---统计连续天数>3的店铺名和天数
select
name,
diff,
count(1) days
from
(
select
*,
date_sub(ctime , rn) diff
from
(
select
*,
row_number() over(partition by name order by ctime) rn
from
tb_shop
) t1
) t2
group by name,diff
having days > 3
;
5. 去除重复数据--得出最终结果
select
distinct name
from
(
select
name,
diff,
count(1) days
from
(
select
*,
date_sub(ctime , rn) diff
from
(
select
*,
row_number() over(partition by name order by ctime) rn
from
tb_shop
) t1
) t2
group by name,diff
having days > 3
) t3
;
2. 列出连续打中地鼠的人名
2.1 数据
人名,打地鼠次数编号,1代表击中
u01,1,1
u01,2,0
u01,3,1
u01,4,1
u01,5,0
u01,6,1
u02,1,1
u02,2,1
u02,3,0
u02,4,1
u02,5,1
u02,6,0
u02,7,0
u02,8,1
u02,9,1
u03,1,1
u03,2,1
u03,3,1
u03,4,1
u03,5,1
u03,6,0
2.2 创建表-加载数据
create table tb_hit_game(
name string,
num int,
ifhit int
)
row format delimited fields terminated by ',';
load data local inpath '/doit17/game.txt' into table tb_hit_game;
2.3 sql语句
select
distinct name
from
(
select
name,
diff,
count(1) c
from
(
select
*,
(num-bh) diff ---num和行编号相减---按照人名和diff分组,组内结果相同的就是连续打中的
from
(
select
*,
row_number() over(partition by name order by num) bh ---按照人名分区,区内排序,对每行数据编号
from
(
select
*
from
tb_hit_game
where ifhit=1 ---排除没有打中的数据
) t1
) t2
) t3
group by name,diff
having c > 3
) t4
;