窗口函数
-
给聚合函数开操作的窗口(在明细查询中,展示汇总结果)
-- 数据准备 create table business( name string, -- 顾客 orderdate string, -- 下单日期 cost int -- 购买金额 ) row format delimited fields terminated by ','; load data local inpath "/opt/module/hive/datas/business.txt" into table business;
-- 查询2017年4月购买过的顾客和总人数 -- substring(str, pos, len) 返回str字符产从pos(下标从1开始)位置开始长度为len的子串 -- 如果想查询2017年4月购买过的人 select distinct name from business where substring(orderdate, 1,7) = '2017-04'; -- 如果想求2017年4月购买过的人数 select count(distinct name) from business where substring(orderdate, 1,7) = '2017-04'; -- 如果希望将两个结果放在一张表里面展示 select distinct name, count(distinct name) over() from business where substring(orderdate, 1, 7)='2017-04'; -- 查询顾客的购买明细及月购买总额 select name, orderdate, cost, sum(cost) over(partition by substring(orderdate, 1, 7)) month_total from business; -- 查询每个顾客的截止到当日的累计消费 select name, orderdate, cost, sum(cost) over(partition by name order by orderdate rows between UNBOUNDED preceding and current row), -- 累加 sum(cost) over(partition by name order by orderdate rows between 1 preceding and current row) -- 最近两次购买 from business; -- 购买明细和截至当日来过店的顾客 select name, orderdate, cost, collect_set(name) over(order by orderdate rows between UNBOUNDED preceding and current row) from business;
-
其他结合有序窗口使用的函数
-
lag/lead
-- lag(col, n, default_value) over(有序窗口) -- 显示col这一列n行之前的数据,如果没有,展示default_value -- lead(col, n, default_value) over(有序窗口) -- 显示col这一列n行之后的数据,如果没有,展示default_value
-- 查询购买明细和每个人上一次的到店时间 select name, orderdate, cost, lag(orderdate, 1, '1970-01-01') over(partition by name order by orderdate) last_order, lead(orderdate, 1, '2050-01-01') over(partition by name order by orderdate) next_order from business;
-
ntile
-- ntile(n) over(有序窗口) -- 将数据分为n组,返回当前行的组号
-- 订单明细按照下单时间分五组 select name,orderdate,cost, ntile(5) over(order by orderdate) zu from business; -- 如果想查询前20%的订单,怎么查 -- 在上表基础上 select name, orderdate, cost from t1 where zu=1;
-
rank/dense_rank/row_number
-- rank() over (有序窗口) -- rank 是我们常见的排序 -- dense_rank 不会跳过排名 -- row_number 是行号
-- 数据准备 create table score( name string, -- 姓名 subject string, -- 学科 score int -- 分数 ) row format delimited fields terminated by "\t"; load data local inpath '/opt/module/hive/datas/score.txt' into table score;
-- 各科成绩排名 select name,subject,score, rank() over(partition by subject order by score desc) `rank`, dense_rank() over(partition by subject order by score desc) `dense_rank`, row_number() over(partition by subject order by score desc) `row_number` from score; -- 查询各科前三名? -- 在上表基础上,查询rank<=3的 select name,subject,score from t1 where `rank`<=3;
-
first_value/last_value
-