1. 开窗函数
hive开窗函数总结
1. ROW_NUMBER()、RANK()、DENSE_RANK()
#ROW_NUMBER() 从1开始,按照顺序,生成分组内记录的序列 12345
#RANK() 生成数据项在分组中的排名,排名相等会在名次中留下空位,12335
#DENSE_RANK() 生成数据项在分组中的排名,排名相等会在名次中不会留下空位12334
#按cookieid分组,按pv降序排序
SELECT cookieid,createtime,pv,
ROW_NUMBER() OVER(PARTITION BY cookieid ORDER BY pv DESC) AS rn3
RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS rank1,
DENSE_RANK() OVER(PARTITION BY cookieid ORDER BY pv desc) AS d_rank2
FROM test1
2. sum()、max()、min()、avg()
#avg()同sum(),min()、max()不管加不加 ORDER BY,均为改组内的最大值(最小值)
# 按cookieid分组,按createtime升序,做pv累加
select cookieid,createtime,pv,
sum(pv) over(PARTITION BY cookieid ORDER BY createtime) as pv1
FROM test1
# 若无order by ,按cookieid分组,做分组内的pv总和
select cookieid,createtime,pv,
sum(pv) over(PARTITION BY cookieid ) as pv1
FROM test1
3. LAG 和 LEAD 函数
#LAG(col,n,DEFAULT) 用于统计窗口内往上第n行值
# LEAD (col,n,DEFAULT) 用于统计窗口内往下第n行值
SELECT cookieid,createtime,pv,
LAG(createtime,1,'1970-01-01') OVER(PARTITION BY cookieid ORDER BY createtime) AS lag1,
LAG(createtime,2) OVER(PARTITION BY cookieid ORDER BY createtime) AS lag2
FROM test1
4. FIRST_VALUE 和 LAST_VALUE 函数
#FIRST_VALUE 取分组内排序后,截止到当前行,第一个值
#LAST_VALUE 取分组内排序后,截止到当前行,最后一个值
SELECT cookieid,createtime,pv,
FIRST_VALUE(pv) OVER(PARTITION BY cookieid ORDER BY createtime) AS first
FROM test1