我们经常会遇到这样的分析需求,要求按时间(日、月、季、半年、年)+ 地域(大区、分公司)多维度组合来汇总分析结果。 通常情况下需要编写繁琐的sql来实现,比如之前我们是这样做的:
insert into table tmp.tmp_t1
select '月' AS total_flag_name ,base.comp_id ,base.comp_name ,base.area_id from tmp.tbl_a group by .......;
union all
select '季度' AS total_flag_name ,base.comp_id ,base.comp_name ,base.area_id from tmp.tbl_a group by .......;
union all
select '半年' AS total_flag_name ,base.comp_id ,base.comp_name ,base.area_id from tmp.tbl_a group by .......;
union all
select '年' AS total_flag_name ,base.comp_id ,base.comp_name ,base.area_id from tmp.tbl_a group by .......;
union all
select '月' AS total_flag_name ,base.comp_id ,base.comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '季度' AS total_flag_name ,base.comp_id ,base.comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '半年' AS total_flag_name ,base.comp_id ,base.comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '年' AS total_flag_name ,base.comp_id ,base.comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '月' AS total_flag_name ,base.comp_id ,null as comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '季度' AS total_flag_name ,base.comp_id ,null as comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '半年' AS total_flag_name ,base.comp_id ,null as comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '年' AS total_flag_name ,base.comp_id ,null as comp_name ,null as area_id from tmp.tbl_a group by .......;
现在可以通过hive 提供的grouping set窗口函数来进行多维度汇总,下面是个实际例子,按日、月、季、半年、年的维度+组织架构层级维度计算客户的数量。
1.数据准备工作
-- 先用工作记录关联日期维度表获取出 年、季度、月份信息
-- 再关联员工信息获取出区域、大区
drop table if exists dw_tmp.tmp_month_emp_work_friend_detail;
create table dw_tmp.tmp_month_emp_work_friend_detail as
select substr(b.day_key,1,10) as date_day,
a.record_id,
b.first_day_m,
b.first_day_q,
b.first_day_hy,
b.first_day_y,
e.area_id,
e.area_name,
e.comp_id,
e.comp_name,
e.emp_id,
e.emp_erp_name
from (
select emp_id,
contact_time,
record_id
from ods.ods_work_record
where dt>=date_sub('${hivevar:dt}',365)
) a
inner join dim.dim_date b on substr(a.contact_time, 1, 10)=substr(b.day_key,1,10)
inner join ( select * from mds.mds_emp where dt='${hivevar:dt}') e on a.emp_id=e.emp_id
;
-- 2.通过GROUPING SET进行多层次、多维度聚合
select GROUPING__ID,
bin(cast(GROUPING__ID as int)) as bin_grouping_id,
date_day,
first_day_m,
first_day_q,
first_day_hy,
first_day_y,
--时间层级
(case when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00001' then 'D'
when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00010' then 'M'
when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00100' then 'Q'
when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='01000' then 'H'
when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='10000' then 'Y'
else '0' end) as total_flag_id,
(case when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00001' then '天'
when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00010' then '月'
when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00100' then '季'
when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='01000' then '半年'
when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='10000' then '年'
else '0' end) as total_flag_name,
(case when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='1' then 'AREA'
when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='11' then 'COMP'
when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='111' then 'DEPT'
when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='1111' then 'TEAM'
when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='11111' then 'VR_DEPT'
when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='111111' then 'VR_DEPT_FLAG'
when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='1111111' then 'TEAM1'
when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='11111111' then 'TEAM2'
when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='111111111' then 'EMP'
else '0' end) as dim_level,
area_id,
max(area_name) as area_name,
comp_id,
(case when comp_id is not null then max(comp_name) else null end ) as comp_name,
(case when team_id2 is not null then max(team_name2) else null end ) as team_name2,
emp_id,
(case when emp_id is not null then max(emp_erp_name) else null end ) as emp_name,
count(record_id)
from dw_tmp.tmp_month_emp_work_friend_detail wk
group by date_day,
first_day_m,
first_day_q,
first_day_hy,
first_day_y,
area_id,
comp_id,
emp_id
-- grouping sets进行聚合,我们只需要把各种要聚合的维度排列组合写好就行
-- 使用时会有grouping_id列,此列就是所统计聚合维度的二进制和
grouping sets (
(date_day, area_id),
(date_day, area_id, comp_id),
(date_day, area_id, comp_id, emp_id),
(first_day_m, area_id),
(first_day_m, area_id, comp_id),
(first_day_m, area_id, comp_id, emp_id),
(first_day_q, area_id),
(first_day_q, area_id, comp_id),
(first_day_q, area_id, comp_id, emp_id),
(first_day_hy, area_id),
(first_day_hy, area_id, comp_id),
(first_day_hy, area_id, comp_id, emp_id),
(first_day_y, area_id),
(first_day_y, area_id, comp_id),
(first_day_y, area_id, comp_id, emp_id) )
order by date_day,first_day_m,first_day_q,first_day_hy,first_day_y