窗口函数
一. 什么是窗口函数
基本含义
窗口限定一个范围,它可以理解为满足某些条件的记录集合,窗口函数也就是在窗口范围内执行的函数
基本语法
窗口函数有over关键字,指定函数执行的范围,可分为三部分:分组子句(partition by), 排序子句(order by), 窗口子句(rows)
<函数名> over (partition by <分组的列> order by <排序的列> rows between <起始行> and <终止行>)
注意:Mysql8才支持窗口函数
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-px9leBH9-1680593139140)(file:///Users/zhangyi/Library/Application%20Support/marktext/images/2023-03-22-11-56-43-image.png?msec=1680075248844)]
建表SQL语句
CREATE TABLE student(
cid varchar(4),
sname varchar(4),
score int
);
INSERT INTO student (cid, sname, score)
VALUES
('001', '张三', 78),
('001', '李四', 82),
('002', '小明', 90),
('001', '王五', 67),
('002', '小红', 85),
('002', '小刚', 62)
二. 窗口的确定
select *
sum(score)
over (partition by cid order by score rows between unbounded preceding and unbounded following)
as '班级总分'
from student;
分组子句(partition by)
不分组可以写成partition by null 或者直接不写
后面可以跟多个列,如 partition by cid, sname
注意 partition by与group by的区别
-
前者不会压缩行数但是后者会
-
后者只能选取分组的列和聚合的列
也就是说 group by后生成的结果集与原表的行数和列数都不同
排序子句(order by)
不排序可以写成order by null 或者直接不写
asc或不写代表升序,desc表示降序
后面可以跟多个列,如 order by cid, sname
窗口子句(rows)
窗口子句的描述
-
起始行:N preceding / unbounded preceding
-
当前行: current row
-
终止行: N following / unbounded following
举例:
rows between unbounded preceding and current row 从之前所有的行到当前行
rows between 2 preceding and current row 从前面两行到当前行
rows between current row and unbounded following 从当前行到之后所有的行
rows between current row and 1 following 从当前行到后面一行
注意:
排序子句后面缺少窗口子句,窗口规范默认是 rows between unbounded preceding and current row
排序子句和窗口子句都缺失,窗口规范默认是 rows between unbounded preceding and unbounded following
执行流程
-
通过partition by 和 order by 子句确定大窗口 (定义出上界 unbounded preceding 和下界 unbounded following)
-
通过row子句针对每一行数据确定小窗口(滑动窗口)
-
对每行的小窗口内的数据执行函数并生成新的列
三. 函数分类
排序类
rank, dense_rank, row_number
---- 按班级分组后打上序号 不考虑并列
select *, row_number() over (partition by cid order by score desc) as '不可并列排名' from student;
-- 按班级分组后作跳跃排名 考虑并列
select *, rank() over (partition by cid order by score desc ) as '跳跃🉑️排名' from student;
-- 按班级分组后作连续排名 考虑并列
select *, dense_rank() over (partition by cid order by score desc ) as '连续🉑️并列排名' from student;
-- 合并起来对比
select *, row_number() over (partition by cid order by score desc ) as '不可并列排名',
rank() over (partition by cid order by score desc ) as '跳跃可并列排名',
dense_rank() over (partition by cid order by score desc ) as '连续可并列排名'
from student; [排序类]
聚合类
sum, avg, count, max, min
# 聚合类
-- 容同一班级每个学生都知道班级总分是多少
select *, sum(score) over(partition by cid) as '班级总分' from student;
-- 计算同一班级,每个同学和比他分数低的同学的累计总分是多少
select *, sum(score) over(partition by cid order by score) as '累加分数' from student;
跨行类
lag, lead
# [跨行类]
-- lag/lead 函数 参数1:比较的列;参数2:偏移量;参数3:找不到的默认值
-- 同一班级内,成绩比自己低一名的分数是多少
select *, lag(score, 1) over (partition by cid order by score) as '低一名的分数' from student;
-- 有默认值的写法
select *, lag(score, 1, 0) over (partition by cid order by score) as '低一名的分数' from student;
-- 同一班级内,成绩比自己高2名的分数是多少
select *, lead(score, 2, 0) over (partition by cid order by score) as '高两名的分数' from student;
相关题目
表格
001,张三,语文,78
002,小刚,语文,71
001,李四,数学,56
001,王五,数学,97
002,小明,数学,54
002,小刚,数学,67
002,小红,数学,82
001,王五,语文,80
001,张三,数学,77
002,小明,语文,58
002,小红,语文,87
001,李四,语文,60
001,张三,英语,66
002,小刚,英语,50
001,李四,地理,59
001,王五,地理,88
002,小明,地理,45
002,小刚,地理,66
002,小红,地理,82
001,王五,英语,81
001,张三,地理,77
002,小明,英语,55
002,小红,英语,87
001,李四,英语,61
脚本
# 创建表格
create table transcript (
cid varchar(4),
sname varchar(4),
course varchar(10),
score int
);
insert into transcript (cid, sname, course, score) values ('001', '张三', '语文', 78);
insert into transcript (cid, sname, course, score) values ('002', '小刚', '语文', 71);
insert into transcript (cid, sname, course, score) values ('001', '李四', '数学', 56);
insert into transcript (cid, sname, course, score) values ('001', '王五', '数学', 97);
insert into transcript (cid, sname, course, score) values ('002', '小明', '数学', 54);
insert into transcript (cid, sname, course, score) values ('002', '小刚', '数学', 67);
insert into transcript (cid, sname, course, score) values ('002', '小红', '数学', 82);
insert into transcript (cid, sname, course, score) values ('001', '王五', '语文', 80);
insert into transcript (cid, sname, course, score) values ('001', '张三', '数学', 77);
insert into transcript (cid, sname, course, score) values ('002', '小明', '语文', 58);
insert into transcript (cid, sname, course, score) values ('002', '小红', '语文', 87);
insert into transcript (cid, sname, course, score) values ('001', '李四', '语文', 60);
insert into transcript (cid, sname, course, score) values ('001', '张三', '英语', 66);
insert into transcript (cid, sname, course, score) values ('002', '小刚', '英语', 50);
insert into transcript (cid, sname, course, score) values ('001', '李四', '地理', 59);
insert into transcript (cid, sname, course, score) values ('001', '王五', '地理', 88);
insert into transcript (cid, sname, course, score) values ('002', '小明', '地理', 45);
insert into transcript (cid, sname, course, score) values ('002', '小刚', '地理', 66);
insert into transcript (cid, sname, course, score) values ('002', '小红', '地理', 82);
insert into transcript (cid, sname, course, score) values ('001', '王五', '英语', 81);
insert into transcript (cid, sname, course, score) values ('001', '张三', '地理', 77);
insert into transcript (cid, sname, course, score) values ('002', '小明', '英语', 55);
insert into transcript (cid, sname, course, score) values ('002', '小红', '英语', 87);
insert into transcript (cid, sname, course, score) values ('001', '李四', '英语', 61);
题目
分组内topN
问题1: 求出每个学生成绩最高的三条记录
分析:
(1)按每个学生分组并按成绩排序
(2)从上面已经分好的组中去3条记录
# 先求出每个学生每科成绩的排序
select *, rank() over(partition by sname order by score desc) as '排名' from transcript;
# 再从里面选3条记录
select *
from (
select *, rank() over(partition by sname order by score desc) as rn from transcript
) temp
where rn <= 3;
公式
select *
from (
select *,row_number() over(partition by 分组列 order by 比较列 desc) as rn from table
) temp
where rn <= N;
汇总分析
问题2:找出每门课程都高于班级课程平均分的学生
可以拆解成以下几个问题
(1)求出每个班级,每门课程的平均分
(2)找出每门课程中高于平均分的学生
(3)每门课程都大于平均分 ===> 这名学生每门成绩的del都是大于0
# (1) 分别计算出每门课程的平均分
select *, avg(score) over(partition by course ) as '课程平均分' from transcript;
# (2) 计算出每个学生每门课程与平均分的差值
select *, score - avg as del from (select *, avg(score) over(partition by course ) as 'avg' from transcript) as t1;
# (3) 筛选 del > 0
with t1 as (
select *, avg(score) over(partition by course ) as avg from transcript
),
t2 as (
select *, score - avg as del from t1
)
select sname from t2
group by sname
having min(del) >= 0;
脚本
# 创建员工表
create table employee (
empno varchar(4),
ename varchar(10),
hire_date varchar(10),
salary int,
dept_no varchar(2)
);
insert into employee(empno, ename, hire_date, salary, dept_no) values ('001', 'Adam', '2018-03-01', 1000, 'A');
insert into employee(empno, ename, hire_date, salary, dept_no) values ('002', 'Bill', '2021-03-01', 1200, 'A');
insert into employee(empno, ename, hire_date, salary, dept_no) values ('003', 'Cindy', '2016-03-01', 1500, 'A');
insert into employee(empno, ename, hire_date, salary, dept_no) values ('004', 'Danny', '2020-03-01', 5000, 'A');
insert into employee(empno, ename, hire_date, salary, dept_no) values ('005', 'Eason', '2020-03-01', 4000, 'B');
insert into employee(empno, ename, hire_date, salary, dept_no) values ('006', 'Fred', '2018-03-01', 3500, 'B');
insert into employee(empno, ename, hire_date, salary, dept_no) values ('007', 'Gary', '2017-03-01', 1800, 'B');
insert into employee(empno, ename, hire_date, salary, dept_no) values ('008', 'Hugo', '2020-03-01', 4500, 'B');
分组内topN
问题一:求出每个部门工资最高的前三名员工
select * from (
select *, row_number() over (partition by dept_no order by salary) as rn from employee
) as temp
where rn <= 3;
问题二:计算这些员工的工资占所属部门总工资的百分比
select *, round((salary * 100) / total_salary, 2) as percentage from (
select *, sum(salary) over (partition by dept_no) as total_salary from employee
) temp
问题三:对各部门员工的工资进行从小到大排序,排名前30%为底层,30%-80% 为中层,高于80%为高层,并打上标签
with t1 as (
select *, row_number() over (partition by dept_no order by salary) as rn,
count(*) over (partition by dept_no) as total from employee
),
t2 as (
select *, round((rn * 100/ total), 2) as percentage from t1
)
select *, case when percentage <= 30 then '低层'
when percentage <= 80 then '中层'
when percentage <= 100 then '高层'
end as label from t2;
问题四:统计每年入职总数以及截至本年累计入职总人数(本年总入职人数 + 本年之前所有年的总入职人数之和)PS: 假设今年是2020年
# 分别统计每年的入职总数
select hire_date, count(empno) as each_count from employee
group by hire_date
order by hire_date;
# 截止到2020年之前的入职总人数
select sum(each_count) as total_emp from (
select hire_date, count(empno) as each_count from employee
group by hire_date
order by hire_date
) temp
where year(hire_date) >= 2020;
分析:❌不可以使用窗口函数的原因
窗口函数会把分别的年都罗列到表格中,最后使用SUM()聚合函数时会出错
SQL窗口函数(二)—— 连续问题
题目一
表格
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Szl3xZP2-1680593139142)(file:///Users/zhangyi/Library/Application%20Support/marktext/images/2023-03-30-20-02-51-image.png?msec=1680177771637)]
脚本
create table user (
user_id varchar(2),
login_date date
);
insert into user (user_id, login_date) VALUES
('A', '2022-09-02'),
('A', '2022-09-03'),
('A', '2022-09-04'),
('B', '2021-11-25'),
('B', '2021-12-31'),
('C', '2022-01-01'),
('C', '2022-04-04'),
('C', '2022-09-03'),
('C', '2022-09-05'),
('C', '2022-09-04'),
('A', '2022-09-03'),
('D', '2022-10-20'),
('D', '2022-10-21'),
('A', '2022-10-03'),
('D', '2022-10-22'),
('D', '2022-10-23');
问题
找出这张表中所有的连续3天登陆用户
分析
连续N天登陆用户,要求数据行满足以下条件:
-
userid要相同,表示同一用户
-
用同一用户每行记录以登陆时间从小到大排序
-
后一行记录比前一行记录的登陆时间多一天
-
数据行数大于等于N
with t0 as (
select distinct user_id, login_date from user
),
t1 as (
select *, rank() over (partition by user_id order by login_date) as rn from t0
),
t2 as(
select *, subdate(login_date, interval rn day) as sub from t1
)
select distinct user_id from t2
group by user_id, sub
having count(user_id) >= 3; # count(1)
方法II
窗口函数:lag, lead函数
# 表去重
select distinct user_id, login_date from user;
# 窗口函数 - lag ---> 构建新的列(比login_date少1天)
select *, lag(login_date, 1 ) over (partition by user_id order by login_date) as last_login_date from user;
# login_date last_login_date 两列相减取diff
with t0 as (
select distinct user_id, login_date from user
),
t1 as(
select *, lag(login_date, 1) over (partition by user_id order by login_date) as last_login_date from t0
)
select *, datediff(login_date, last_login_date) as diff from t1;
# 从 diff中筛选 === 1 且按 user_id 分组后
with t0 as (
select distinct user_id, login_date from user
),
t1 as(
select *, lag(login_date, 1) over (partition by user_id order by login_date) as last_login_date from t0
),
t2 as (
select *, datediff(login_date, last_login_date) as diff from t1
)
select user_id from t2
where diff = 1
group by user_id
having count(1) >= 2
题目二
表格
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-WKcLNiZg-1680593139142)(file:///Users/zhangyi/Library/Application%20Support/marktext/images/2023-03-31-15-15-24-image.png?msec=1680246924399)]
脚本
# 创建玩家表
create table player (
player_id varchar(2),
score int,
score_time datetime
);
insert into player (player_id, score, score_time)
VALUES ('B3', 1 , '2022-09-20 19:00:14'),
('A2', 1, '2022-09-20 19:01:04'),
('A2', 3, '2022-09-20 19:01:16'),
('A2', 3, '2022-09-20 19:02:05'),
('A2', 2, '2022-09-20 19:02:25'),
('B3', 2, '2022-09-20 19:02:54'),
('A4', 3, '2022-09-20 19:03:10'),
('B1', 2, '2022-09-20 19:03:34'),
('B1', 2, '2022-09-20 19:03:58'),
('B1', 3, '2022-09-20 19:04:07'),
('A2', 1, '2022-09-20 19:04:19'),
('B3', 2, '2022-09-20 19:04:31');
问题
统计出连续三次及以上为球队得分的球员名单
分析
连续N次以上为球队得分,要求数据行满足以下条件:
-
player_id 要相同表示同一队员
-
每行记录以得分时间从小到大排序
-
数据行数大于等于N
类似于上一题中的lag方法,构建第三列 - 上一位得分的player_id, 如果player_id 和 last_player_id都相同,则说明此球员是连续得分
with t0 as (
select distinct player_id, score_time from player
),
t1 as (
select *, lag(player_id, 1) over(order by score_time) as last_player_id from t0
)
select player_id from t1
where player_id = last_player_id
group by player_id
having count(1) >= 2; # 这里写2 因为有两个B1已经说明连续三次得分了
题目三
表格
log_id |
---|
1 |
2 |
3 |
7 |
8 |
10 |
脚本
# 创建login表格
create table login (
log_id int
);
insert into login(log_id) values (1), (2), (3), (7), (8), (10);
问题
编写SQL查询得到Logs表中的连续区间的开始数字和结束数字,按照start_id 排序,查询结果格式如下
start_id | end_id |
---|---|
1 | 3 |
7 | 8 |
10 | 10 |
解答
# (1) 构建第三行 rank 进行排名
select *, rank() over (order by log_id) as rn from login;
# (2) log_id - rn 形成差值
with t1 as (
select *, rank() over (order by log_id) as rn from login
),
t2 as (
select *, (log_id - rn) as diff from t1
)
select * from t2;
# (3) diff相同代表两个数字是递增的, start_id ==> 分区中最小的id, end_id ==> 分区中最大的id
with t1 as (
select *, rank() over (order by log_id) as rn from login
),
t2 as (
select *, (log_id - rn) as diff from t1
)
select min(log_id) as start_id,
max(log_id) as end_id from t2
group by diff
order by diff
技巧
如何求连续区间?
- 行号过滤法
通过row_number()
生成连续行号,与区间列进行差值运算,得到的临时结果如果相同表示为同一连续区间
- 错位比较法
通过row_number()/ row_number() + 1
分别生成原生的和错位的连续行号列,进行连表操作也可以通过lag/lead()
函数直接生成错位列