转载: http://wolfskin.blog.163.com/blog/static/2081731282013812104016406/
目前的HIve版本已经支持mult-distinct的特性,这个在使用的使用的会比较方便,平时经常同时统计PV,UV,VV之类,不过一般都只统计一两天的数据,虽然每天的数据都是上亿条的但也特别感觉有啥不妥。
但是这样就涉及到multi-distinct,当数据特大的时候就可能带来性能的恶化,解决方案就是去掉multi-distinct,最后全部用sum(1)来达到目的select sum(1) as pv
, count(distinct pvid) as vv
, count(distinct cookie_id) as uv
from sample_tb;
第二步以空间换时间,扩充原数据,借用union all把需要做distinct的字段给扩充起来,并使用rownumber=1来达到去重的目的,如果不计算PV的话则可以直接用group by,而绕开rownumber=1的去重目的。create table sample_step_1
as
select pvid, cookie_id
, sum(1) as pv
from sample_tb
group by pvid, cookie_id;
第三步就是使用sum()来替换count(distinct()),而计算各种值了create table sample_step_2 as select type, type_value, pv, rownumber(type,type_value) as rn from ( select type, type_value, pv from ( select 'pvid'as type, pvid as type_value, pv from sample_step_1 union all select 'cookie_id'as type, cookie_id as type_value, pv from sample_step_1 ) a distribute by type, type_value sort by type, type_value ) a;
select sum(case when type='pvid' then pv else cast(0 as bigint) end) as pv, sum(case when type='pvid' and rn=1 then 1 else 0 end) as vv, sum(case when type='cookie_id' and rn=1 then 1 else 0 end) as uv from sample_step_2;