窗口函数和排名函数
窗口函数
格式
# 函数的格式
函数 over( partition by 字段 ,order by 字段 window_clause )
# window_clause的格式
(rows | range) between (unbounded | [num]) preceding and ([num] preceding | current row | (unbounded | [num]) following)
(rows | range) between current row and (current row | (unbounded | [num]) following)
(rows | range) between [num] following and (unbounded | [num]) following
# 窗口函数
rank: 允许并列,一旦有并列跳号!
row_number: 行号! 连续的,每个号之间差1!
dense_rank: 允许并列,一旦有并列不跳号!
cume_dist: 从排序后的第一行到当前值之间数据 占整个数据集的百分比!
precent_rank: rank-1/ 总数据量-1
ntile(x): 将数据集均分到X个组中,返回每条记录所在的组号
数据集
name orderdate cost
jack 2017-01-01 10
tony 2017-01-02 15
jack 2017-02-03 23
tony 2017-01-04 29
jack 2017-01-05 46
jack 2017-04-06 42
tony 2017-01-07 50
jack 2017-01-08 55
mart 2017-04-08 62
mart 2017-04-09 68
neil 2017-05-10 12
mart 2017-04-11 75
neil 2017-06-12 80
mart 2017-04-13 94
小知识
- 排名函数和LAG,LEAD不支持指定窗口大小
- 在over()中既没有出现windows_clause,也没有出现order by,窗口默认为rows between UNBOUNDED PRECEDING and UNBOUNDED FOLLOWING,即从上边界到下边界
- 在over()中(没有出现windows_clause),指定了order by,窗口默认为rows between UNBOUNDED PRECEDING and CURRENT ROW,从上边界到当前记录
- 窗口函数和分组的区别
- 如果是分组操作,select后只能写分组后的字段
- 如果是窗口函数,窗口函数是在指定的窗口内,对每条记录都执行一次函数
- 如果是分组操作,有去重效果,而partition不去重!
聚合过程
-
聚合函数
# 输出每个人的信息以及总共有多少个人 select name,orderdate,cost, count(*) over() from business
数据集
name orderdate cost
jack 2017-01-01 10
tony 2017-01-02 15
jack 2017-02-03 23
tony 2017-01-04 29
jack 2017-01-05 46
jack 2017-04-06 42
tony 2017-01-07 50
jack 2017-01-08 55
mart 2017-04-08 62
mart 2017-04-09 68
neil 2017-05-10 12
mart 2017-04-11 75
neil 2017-06-12 80
mart 2017-04-13 94
例题
(1)查询在2017年4月份购买过的顾客及该顾客购买的次数
select name, count(*) over()
from business
where substring(orderdate,1,7) = '2017-04'
group by name;
# 等价于
select name, count(*) over(rows between unbounded preceding and unbounded following)
from business
where substring(orderdate, 1, 7)='2017-04'
group by name;
(2)查询顾客的购买明细及每个月的购买总额
select name, orderdate, cost, sum(cost) over(partition by name, substring(orderdate, 1, 7))
from business;
如果只需要查询每个人每个月的购买总额
select name, substring(orderdate, 1, 7), sum(cost)
from business
group by name, substring(orderdate, 1, 7);
(3)查询顾客的购买明细要将cost按照日期进行累加(over中有order by时默认是从窗口顶部到当前行)
select name, orderdate, cost, sum(cost) over(partition by name order by orderdate)
from business;
(4)查询顾客的购买明细及顾客上次的购买时间
select name, orderdate, cost, lag(orderdate, 1, '无数据') over(partition by name order by orderdate)
from business;
(5) 查询顾客的购买明细及顾客下次的购买时间
select name, orderdate, cost, lead(orderdate, 1, '无数据') over(partition by name order by orderdate)
from business;
(6) 查询顾客的购买明细及顾客本月第一次购买的时间
select name, orderdate, cost, first_value(orderdate, true) over(partition by name, substring(orderdate, 1, 7) order by orderdate)
from business;
(7) 查询顾客的购买明细及顾客本月最后一次购买的时间
select name, orderdate, cost, last_value(orderdate, true) over(partition by name, substring(orderdate, 1, 7) order by orderdate rows between current ROW and unbounded following)
from business;
(8)查询顾客的购买明细及顾客最近三次cost花费
当前行的和前两次的和
select name, orderdate, cost, sum(cost) over(partition by name order by orderdate rows between 2 preceding and current row)
from business;
当前行和前一行和后一行的和
select name, orderdate, cost, sum(cost) over(partition by name order by orderdate rows between 1 preceding and 1 following)
from business;
(9) 查询前20%时间的订单信息
select *
from
(
select name, orderdate, cost, cume_dist() over(order by orderdate
) cdnum
from business) tmp
where cdnum <= 0.2;
数据集
name subject score
孙悟空 数学 95
宋宋 数学 86
婷婷 数学 85
大海 数学 56
宋宋 英语 84
大海 英语 84
婷婷 英语 78
孙悟空 英语 68
大海 语文 94
孙悟空 语文 87
婷婷 语文 65
宋宋 语文 64
例题
(1)将每个版本进行排名
select name, subject, score, rank() over(partition by subject order by score desc)
from score;
(2)给每个学生计算总分后进行排名
select name, sumscore, rank() over(order by sumscore desc)
from
(
select name, sum(score) sumscore
from score
group by name
) t;
(3)求每个学生的成绩明细及对应学生的总分和总分排名
select name, subject, score, dense_rank() over(order by tmp.sumscore desc)
from
(
select name, subject, score, sum(score) over(partition by name) sumscore
from score
) tmp;
(4)查询每个科目的前2名
select name, subject, score
from
(
select name, subject, score, rank() over(partition by subject order by score desc) rn
from score
) tmp
where rn <= 2;
(5)查询每个学生成绩明细,并显示当前科目的最高分
select name, subject, score, max(score) over(partition by subject)
from score;
select name, subject, score, first_value(score) over(partition by subject order by score desc)
from score;
(6)查询学生成绩,并显示当前科目最低分
select name, subject, score, min(score) over(partition by subject)
from score
select name, subject, score, first_value(score) over(partition by subject order by score )
from score