Hive窗口函数使用案例

1.数据源

vim /opt/data/user_purchase_detail.txt

Semi,2021-09-01,500,10.10.10.9
Edward,2021-09-02,3500,10.10.10.10
Semi,2021-02-03,46,10.10.10.9
Edward,2021-09-04,578,10.10.10.10
Semi,2021-09-05,345,10.10.10.9
Semi,2021-04-06,235,10.10.10.9
Edward,2021-09-07,78,10.10.10.10
Semi,2021-09-08,55,10.10.10.9
zhangsan,2021-04-08,783,10.10.10.11
zhangsan,2021-04-09,123,10.10.10.11
lisi,2021-05-10,150,10.10.10.12
zhangsan,2021-04-11,456,10.10.10.11
lisi,2021-06-12,234,10.10.10.12
zhangsan,2021-04-13,99,10.10.10.11

2.初始化数据

create table if not exists user_purchase_detail(
name string ,
date string,
amount int,
ip_address string
)
row format delimited fields terminated by ',';

load data local inpath '/opt/data/user_purchase_detail.txt' overwrite into table  user_purchase_detail;

3.指标统计

(1)用户购买明细及上次的购买时间

# SQL:

select name,date,amount,ip_address,
lag(date) over(partition by name order by date) last_date from user_purchase_detail;

# 结果:
name		date		amount	ip_address	last_date
lisi		2021-05-10	150		10.10.10.12	NULL
lisi		2021-06-12	234		10.10.10.12	2021-05-10
Semi			2021-02-03	46		10.10.10.9	NULL
Semi			2021-04-06	235		10.10.10.9	2021-02-03
Semi			2021-09-01	500		10.10.10.9	2021-04-06
Semi			2021-09-05	345		10.10.10.9	2021-09-01
Semi			2021-09-08	55		10.10.10.9	2021-09-05
Edward	2021-09-02	3500	10.10.10.10	NULL
Edward	2021-09-04	578		10.10.10.10	2021-09-02
Edward	2021-09-07	78		10.10.10.10	2021-09-04
zhangsan	2021-04-08	783		10.10.10.11	NULL
zhangsan	2021-04-09	123		10.10.10.11	2021-04-08
zhangsan	2021-04-11	456		10.10.10.11	2021-04-09
zhangsan	2021-04-13	99		10.10.10.11	2021-04-11
Time taken: 6.126 seconds, Fetched: 14 row(s)

(2)用户购买明细及下次的购买时间

#SQL:
select name,date,amount,ip_address,
lead(date) over(partition by name order by date) next_date from user_purchase_detail;

#结果:
name		date		amount	ip_address	next_date
lisi		2021-05-10	150		10.10.10.12	2021-06-12
lisi		2021-06-12	234		10.10.10.12	NULL
Semi			2021-02-03	46		10.10.10.9	2021-04-06
Semi			2021-04-06	235		10.10.10.9	2021-09-01
Semi			2021-09-01	500		10.10.10.9	2021-09-05
Semi			2021-09-05	345		10.10.10.9	2021-09-08
Semi			2021-09-08	55		10.10.10.9	NULL
Edward	2021-09-02	3500	10.10.10.10	2021-09-04
Edward	2021-09-04	578		10.10.10.10	2021-09-07
Edward	2021-09-07	78		10.10.10.10	NULL
zhangsan	2021-04-08	783		10.10.10.11	2021-04-09
zhangsan	2021-04-09	123		10.10.10.11	2021-04-11
zhangsan	2021-04-11	456		10.10.10.11	2021-04-13
zhangsan	2021-04-13	99		10.10.10.11	NULL
Time taken: 8.26 seconds, Fetched: 14 row(s)

(3)用户购买明细及本月第一次购买的时间

# SQL:
select name,date,amount,ip_address,
first_value(date) over (partition by name,substr(date,6,2) order by date) first_purchase from user_purchase_detail;

# 结果:
name		date		amount	ip_address	first_purchase
lisi		2021-05-10	150		10.10.10.12	2021-05-10
lisi		2021-06-12	234		10.10.10.12	2021-06-12
Semi			2021-02-03	46		10.10.10.9	2021-02-03
Semi			2021-04-06	235		10.10.10.9	2021-04-06
Semi			2021-09-01	500		10.10.10.9	2021-09-01
Semi			2021-09-05	345		10.10.10.9	2021-09-01
Semi			2021-09-08	55		10.10.10.9	2021-09-01
Edward	2021-09-02	3500	10.10.10.10	2021-09-02
Edward	2021-09-04	578		10.10.10.10	2021-09-02
Edward	2021-09-07	78		10.10.10.10	2021-09-02
zhangsan	2021-04-08	783		10.10.10.11	2021-04-08
zhangsan	2021-04-09	123		10.10.10.11	2021-04-08
zhangsan	2021-04-11	456		10.10.10.11	2021-04-08
zhangsan	2021-04-13	99		10.10.10.11	2021-04-08
Time taken: 72.55 seconds, Fetched: 14 row(s)

(4)用户购买明细及本月最后一次购买的时间

# SQL:
select name,date,amount,ip_address,
first_value(date) over (partition by name,substr(date,6,2) order by date desc) last_purchase from user_purchase_detail;

# 结果:

name		date		amount	ip_address	last_purchase
lisi		2021-05-10	150		10.10.10.12	2021-05-10
lisi		2021-06-12	234		10.10.10.12	2021-06-12
Semi			2021-02-03	46		10.10.10.9	2021-02-03
Semi			2021-04-06	235		10.10.10.9	2021-04-06
Semi			2021-09-08	55		10.10.10.9	2021-09-08
Semi			2021-09-05	345		10.10.10.9	2021-09-08
Semi			2021-09-01	500		10.10.10.9	2021-09-08
Edward	2021-09-07	78		10.10.10.10	2021-09-07
Edward	2021-09-04	578		10.10.10.10	2021-09-07
Edward	2021-09-02	3500	10.10.10.10	2021-09-07
zhangsan	2021-04-13	99		10.10.10.11	2021-04-13
zhangsan	2021-04-11	456		10.10.10.11	2021-04-13
zhangsan	2021-04-09	123		10.10.10.11	2021-04-13
zhangsan	2021-04-08	783		10.10.10.11	2021-04-13

# last_value是默认到当前行,给一个到最后一行的窗口尺寸即可.
select name,date,amount,ip_address,last_value(date) over (partition by name,month(date) order by date  ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING)  last_purchase from user_purchase_detail;

name		date		amount	ip_address	last_purchase
lisi		2021-05-10	150		10.10.10.12	2021-05-10
lisi		2021-06-12	234		10.10.10.12	2021-06-12
Semi			2021-02-03	46		10.10.10.9	2021-02-03
Semi			2021-04-06	235		10.10.10.9	2021-04-06
Semi			2021-09-01	500		10.10.10.9	2021-09-08
Semi			2021-09-05	345		10.10.10.9	2021-09-08
Semi			2021-09-08	55		10.10.10.9	2021-09-08
Edward	2021-09-02	3500	10.10.10.10	2021-09-07
Edward	2021-09-04	578		10.10.10.10	2021-09-07
Edward	2021-09-07	78		10.10.10.10	2021-09-07
zhangsan	2021-04-08	783		10.10.10.11	2021-04-13
zhangsan	2021-04-09	123		10.10.10.11	2021-04-13
zhangsan	2021-04-11	456		10.10.10.11	2021-04-13
zhangsan	2021-04-13	99		10.10.10.11	2021-04-13
Time taken: 20.816 seconds, Fetched: 14 row(s)

(5)用户购买明细及每月总额

#SQL:
select name,date,amount,ip_address,sum(amount) over(partition by name,substr(date,6,2)) sum_amount from user_purchase_detail;

#结果:
name		date		amount	ip_address	sum_amount
lisi		2021-05-10	150		10.10.10.12	150
lisi		2021-06-12	234		10.10.10.12	234
Semi			2021-02-03	46		10.10.10.9	46
Semi			2021-04-06	235		10.10.10.9	235
Semi			2021-09-01	500		10.10.10.9	900
Semi			2021-09-05	345		10.10.10.9	900
Semi			2021-09-08	55		10.10.10.9	900
Edward	2021-09-07	78		10.10.10.10	4156
Edward	2021-09-04	578		10.10.10.10	4156
Edward	2021-09-02	3500	10.10.10.10	4156
zhangsan	2021-04-13	99		10.10.10.11	1461
zhangsan	2021-04-11	456		10.10.10.11	1461
zhangsan	2021-04-08	783		10.10.10.11	1461
zhangsan	2021-04-09	123		10.10.10.11	1461
Time taken: 4.091 seconds, Fetched: 14 row(s)

(6)用户购买明细及金额按日期累加

#SQL:
select name,date,amount,ip_address,sum(amount) over(partition by name order by date) sum_amount from user_purchase_detail;

#结果:
name		date		amount	ip_address	sum_amount
lisi		2021-05-10	150		10.10.10.12	150
lisi		2021-06-12	234		10.10.10.12	384
Semi			2021-02-03	46		10.10.10.9	46
Semi			2021-04-06	235		10.10.10.9	281
Semi			2021-09-01	500		10.10.10.9	781
Semi			2021-09-05	345		10.10.10.9	1126
Semi			2021-09-08	55		10.10.10.9	1181
Edward	2021-09-02	3500	10.10.10.10	3500
Edward	2021-09-04	578		10.10.10.10	4078
Edward	2021-09-07	78		10.10.10.10	4156
zhangsan	2021-04-08	783		10.10.10.11	783
zhangsan	2021-04-09	123		10.10.10.11	906
zhangsan	2021-04-11	456		10.10.10.11	1362
zhangsan	2021-04-13	99		10.10.10.11	1461
Time taken: 6.109 seconds, Fetched: 14 row(s)

(7)用户购买明细及最近三次的总额

#思路:
// 第一:分组排序 partition by name order by date
// 第二:计算总额
// 		 默认 当前行+上一行+下一行
// 		 如果 上一行为null 则为 当前行+下两行
// 	     如果 下一行为null 则为 当前行+上两行

#SQL:
select name,date,amount,ip_address,
sum(amount+if(lag_amount == 0,lead2_amount,lag_amount)+if(lead_amount == 0,lag2_amount,lead_amount)) total_count
from
(select name,date,amount,ip_address,
lag(amount,1,0) over(partition by name order by date) lag_amount,
lead(amount,1,0) over(partition by name order by date)  lead_amount,
lag(amount,2,0) over(partition by name order by date) lag2_amount,
lead(amount,2,0) over(partition by name order by date)  lead2_amount
from user_purchase_detail) 
a group by name,date,amount,ip_address;

#结果:
name		date		amount	ip_address	total_count
lisi		2021-05-10	150		10.10.10.12	384
lisi		2021-06-12	234		10.10.10.12	384
Semi			2021-02-03	46		10.10.10.9	781
Semi			2021-04-06	235		10.10.10.9	781
Semi			2021-09-01	500		10.10.10.9	1080
Semi			2021-09-05	345		10.10.10.9	900
Semi			2021-09-08	55		10.10.10.9	900
Edward	2021-09-02	3500	10.10.10.10	4156
Edward	2021-09-04	578		10.10.10.10	4156
Edward	2021-09-07	78		10.10.10.10	4156
zhangsan	2021-04-08	783		10.10.10.11	1362
zhangsan	2021-04-09	123		10.10.10.11	1362
zhangsan	2021-04-11	456		10.10.10.11	678
zhangsan	2021-04-13	99		10.10.10.11	678

(8)查询前30%时间的订单信息

#SQL:
select name,date,amount,ip_address from
(select name,date,amount,ip_address,
round(CUME_DIST () over(order by date),2) percent
from user_purchase_detail
) a where percent <=0.3;


#结果:
name		date		amount	ip_address	percent
Semi			2021-02-03	46		10.10.10.9	0.07
Semi			2021-04-06	235		10.10.10.9	0.14
zhangsan	2021-04-08	783		10.10.10.11	0.21
zhangsan	2021-04-09	123		10.10.10.11	0.29
Time taken: 4.073 seconds, Fetched: 4 row(s)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值