HIVESQL窗口函数还是很好用的。MARK一下基本概念和语法。
1.累计计算:sum()、avg()
2.排序:row_number(),rank(),dense_rank()
3.前百分之多少:ntile()
4.偏移分析:lag() ,lead()
1.累计计算:sum()、avg()
1.1sum()累计
a)开始时间为0 第一条数据开始
select a.month,
a.pay_amount_1, --结果,非原始字段
sum(a.pay_amount_1) over(order by a.month)--排序
from (select month(dt) month, --分类,按月提取值
sum(pay_amount) pay_amount_1 --汇总
from user_trade
where year(dt)=2018
group by month(dt)) a;
b)开始时间为当年(partition by 分片)
select a.year,
a.month,
a.pay_amount_1,
sum(a.pay_amount_1) over(partition by a.year order by a.month ASC)--partition by
from (select year(dt) year,
month(dt) month,
sum(pay_amount) pay_amount_1
from user_trade
where year(dt) in(2017,2018) --两个年份
group by year(dt),
month(dt)) a;
c)容易出现的错误
select a.year,
a.month,
a.pay_amount_1,
sum(a.pay_amount_1) over(partition by a.year ,a.month order by a.month ASC)
from (select year(dt) year, -- a.month没有意义
month(dt) month,
sum(pay_amount) pay_amount_1
from user_trade
where year(dt) in(2017,2018) --两个年份
group by year(dt),
month(dt)) a;
1.2avg()平均
3日平均,或者3月平均,一般就有移动平均值(每三日,每三月的平均)
-- 求每个月的近三月移动平均支出金额--
--1:month1
--2:(month1+month2)/2
--3.(month1+month2+month3)/3
--4.(month2+month3+month4)/3
select a.month,
a.pay_amount_1, --结果,非原始字段
avg(a.pay_amount_1) over(order by a.month rows between 2 preceding and current row)--2行之前 和当前行
from (select month(dt) month, --分类,按月提取值
sum(pay_amount) pay_amount_1
from user_trade
where year(dt)=2018
group by month(dt)) a;
2.排序:row_number(),rank(),dense_rank()
row_number():123
rank():113
dense_rank():112
2.1基础排序
-- 1.1基础排序
-- 购买商品品类数量的排名
select user_name, --
count(distinct goods_category),--按goods_category排序
row_number() over(order by count(distinct goods_category)),--1234
rank() over(order by count(distinct goods_category)), -- 113
dense_rank() over(order by count(distinct goods_category)),--112
from user_trade
where substr(dt,1,7)='2019-01' --
Group by user_name; --用户名
2.2 选出10.20.30 排名的用户
--where a.rank in(10,20,30)
select a.user_name, --1.显示字段
a.pay_amount,
a.rank
from (select user_name, --2.子查询表
sum(pay_amount) pay_amount,
rank() over(order by sum(pay_amount)) rank
from user_trade
where year(dt)=2019 --时间
Group by user_name)a -- a表选择
where a.rank in(10,20,30); --3.选择
3.前百分之多少:ntile()
3.1 注意点:整体分组 a)存在不能被整除的情况 b)整除的情况 c)不可以只分前25%,partition by的分组,不能用row between。
--2019年1月,按支付金额分5组
select user_name,
sum(pay_amount) pay_amount,
ntile(5) over(order by sum(pay_amount)desc) level --分成5份
from user_trade
where year(dt,1,7)='2019-01'
Group by user_name;
3.2退款金额前10%的用户
10%分10组以后的第一组
--选出2019年退款金额排名前10%的用户
select a.user_name
from
(select user_name,
sum(refund_amount) as refund_amount,
ntile(10) over(order by sum(refund_amount)desc) level
from user_trade
where year(dt)=2019
group by user_name)a
where a.level=1; --level=1
4.偏移分析:lag() ,lead()
同一字段的前N行(lag),后N行(lead)
exp_str:原始字段
offset:偏移量
defval:默认值
--偏移 前一个
select user_name,
dt, --partition分组
lag(dt,1,dt) over(partition by user_name order by dt),--前1个
lag(dt) over(partition by user_name order by dt), --前1个空值时返回dt
lag(dt,2,dt) over(partition by user_name order by dt),--前2个
lag(dt,2) over(partition by user_name order by dt)--前2个空值时返回dt
from user_trade
where dt>'0'
and user_name in('ALice','Alexander');-- 按姓名自己排序
-- 偏移 下一个
select user_name,
dt,
lead(dt,1,dt) over(partition by user_name order by dt),
lead(dt) over(partition by user_name order by dt), --后1个空值时返回空值
lead(dt,2,dt) over(partition by user_name order by dt),
lead(dt,2) over(partition by user_name order by dt)
from user_trade
where dt>'0'
and user_name in('ALice','Alexander');