【hive】grouping sets函数 多维度数据生成

我们经常会遇到这样的分析需求,要求按时间(日、月、季、半年、年)+ 地域(大区、分公司)多维度组合来汇总分析结果。 通常情况下需要编写繁琐的sql来实现,比如之前我们是这样做的:

insert into table tmp.tmp_t1
select '月' AS total_flag_name ,base.comp_id ,base.comp_name ,base.area_id from tmp.tbl_a group by .......;
union all
select '季度' AS total_flag_name ,base.comp_id ,base.comp_name ,base.area_id from tmp.tbl_a group by .......;
union all
select '半年' AS total_flag_name ,base.comp_id ,base.comp_name ,base.area_id from tmp.tbl_a group by .......;
union all
select '年' AS total_flag_name ,base.comp_id ,base.comp_name ,base.area_id from tmp.tbl_a group by .......;

union all

select '月' AS total_flag_name ,base.comp_id ,base.comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '季度' AS total_flag_name ,base.comp_id ,base.comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '半年' AS total_flag_name ,base.comp_id ,base.comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '年' AS total_flag_name ,base.comp_id ,base.comp_name ,null as area_id from tmp.tbl_a group by .......;

union all

select '月' AS total_flag_name ,base.comp_id ,null as comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '季度' AS total_flag_name ,base.comp_id ,null as comp_name ,null as area_id from tmp.tbl_a group by .......;
union all
select '半年' AS total_flag_name ,base.comp_id ,null as comp_name ,null as area_id  from tmp.tbl_a group by .......;
union all
select '年' AS total_flag_name ,base.comp_id ,null as comp_name ,null as area_id from tmp.tbl_a group by .......;

 现在可以通过hive 提供的grouping set窗口函数来进行多维度汇总,下面是个实际例子,按日、月、季、半年、年的维度+组织架构层级维度计算客户的数量。

1.数据准备工作
--   先用工作记录关联日期维度表获取出 年、季度、月份信息
--   再关联员工信息获取出区域、大区
drop table if exists dw_tmp.tmp_month_emp_work_friend_detail;
create table dw_tmp.tmp_month_emp_work_friend_detail as
     select substr(b.day_key,1,10) as date_day,
            a.record_id,
            b.first_day_m,
            b.first_day_q,
            b.first_day_hy,
            b.first_day_y,
            e.area_id,
            e.area_name,
            e.comp_id,
            e.comp_name,
            e.emp_id,
            e.emp_erp_name
     from (
            select emp_id,
                     contact_time,
                     record_id 
            from ods.ods_work_record 
            where dt>=date_sub('${hivevar:dt}',365)
            ) a
        inner join dim.dim_date b on substr(a.contact_time, 1, 10)=substr(b.day_key,1,10)
        inner join ( select * from mds.mds_emp where dt='${hivevar:dt}') e on a.emp_id=e.emp_id
;

-- 2.通过GROUPING SET进行多层次、多维度聚合

select GROUPING__ID,
        bin(cast(GROUPING__ID as int)) as bin_grouping_id,
        date_day,
        first_day_m,
        first_day_q,
        first_day_hy,
        first_day_y,
        --时间层级
        (case when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00001' then 'D'
             when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00010' then 'M'
             when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00100' then 'Q'
             when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='01000' then 'H'
             when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='10000' then 'Y'
        else '0' end) as total_flag_id,

        (case when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00001' then '天'
             when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00010' then '月'
             when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='00100' then '季'
             when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='01000' then '半年'
             when substr(bin(cast(GROUPING__ID as int)),length(bin(cast(GROUPING__ID as int)))-4)='10000' then '年'
        else '0' end) as total_flag_name,

        (case when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='1' then 'AREA'
             when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='11' then 'COMP'
             when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='111' then 'DEPT'
             when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='1111' then 'TEAM'
             when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='11111' then 'VR_DEPT'
             when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='111111' then 'VR_DEPT_FLAG'
             when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='1111111' then 'TEAM1'
             when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='11111111' then 'TEAM2'
             when substr(bin(cast(GROUPING__ID as int)),1,length(bin(cast(GROUPING__ID as int)))-5)='111111111' then 'EMP'
        else '0' end) as dim_level,
        area_id,
        max(area_name) as area_name,
        comp_id,
        (case when comp_id is not null then max(comp_name) else null end ) as comp_name,
        (case when team_id2 is not null then max(team_name2) else null end ) as team_name2,
        emp_id,
        (case when emp_id is not null then max(emp_erp_name) else null end ) as emp_name,
        count(record_id)
from dw_tmp.tmp_month_emp_work_friend_detail wk
group by date_day,
        first_day_m,
        first_day_q,
        first_day_hy,
        first_day_y,
        area_id,
        comp_id,
        emp_id

-- grouping sets进行聚合,我们只需要把各种要聚合的维度排列组合写好就行
-- 使用时会有grouping_id列,此列就是所统计聚合维度的二进制和
grouping sets  (
        (date_day, area_id),
        (date_day, area_id, comp_id),
        (date_day, area_id, comp_id, emp_id),

        (first_day_m, area_id),
        (first_day_m, area_id, comp_id),
        (first_day_m, area_id, comp_id, emp_id),

        (first_day_q, area_id),
        (first_day_q, area_id, comp_id),
        (first_day_q, area_id, comp_id, emp_id),

        (first_day_hy, area_id),
        (first_day_hy, area_id, comp_id),
        (first_day_hy, area_id, comp_id, emp_id),

        (first_day_y, area_id),
        (first_day_y, area_id, comp_id),
        (first_day_y, area_id, comp_id, emp_id) )
order by date_day,first_day_m,first_day_q,first_day_hy,first_day_y

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值