有如下统计的hql:
select *,
row_number() over(partition by stay_time_concat order by time asc) as stay_time_rn
from (
select *,
row_number() over(partition by event_id) as event_id_rn,
row_number() over(partition by concat(j_client_id,buy_adslot_id,event_type) order by time asc) as j_client_id_rn,
row_number() over(partition by concat(j_visitor_id,page_id,event_type) order by event_type) as j_visitor_id_rn,
row_number() over(partition by concat(j_redirect_id,page_id,event_type) order by event_type) as j_redirect_id_rn,
case
when cast(stay_time AS int)>=0 and cast(stay_time AS int) <= 4999 and event_type in ('stay_time', 'page_view')
then concat(j_client_id,buy_adslot_id,'event_type','0s_day')
when cast(stay_time AS int)>=5000 and event_type in ('stay_time', 'page_view') and cast(stay_time AS int) <= 9999
then concat(j_client_id,buy_adslot_id,'event_type','5s_day')
when cast(stay_time AS int)>=10000 and event_type in ('stay_time', 'page_view')
then concat(j_client_id,buy_adslot_id,'event_type','10s_day')
else null
end as stay_time_concat
from tmp.stay_time
where data_date>=${bizStartDate} and data_date<=${bizEndDate}
and from_unixtime(floor(time/1000),'yyyyMMdd')=${etlDate}
)t1 where event_id_rn=1 and page_id is not null
统计时候,由于使用了concat函数,字段长度过长,row_number() over(partition by 字段)会导致一些细微统计误差。
使用concat函数后,stay_time_concat的字符串长度过长,导致统计有误差。
18005e6f2b67f9ff24a2e724222fd8b17885565386692209event_type0s_day
改造后的统计代码如下:
select *,
row_number() over(partition by j_client_id,buy_adslot_id,stay_time_concat order by time asc) as stay_time_rn
from (
select *,
row_number() over(partition by event_id) as event_id_rn,
row_number() over(partition by j_client_id,buy_adslot_id,event_type order by time asc) as j_client_id_rn,
row_number() over(partition by j_visitor_id,page_id,event_type order by event_type) as j_visitor_id_rn,
row_number() over(partition by j_redirect_id,page_id,event_type order by event_type) as j_redirect_id_rn,
case
when cast(stay_time AS int)>=0 and cast(stay_time AS int) <= 4999 and event_type in ('stay_time', 'page_view') then '0s_day'
when cast(stay_time AS int)>=5000 and event_type in ('stay_time', 'page_view') and cast(stay_time AS int) <= 9999 then '5s_day'
when cast(stay_time AS int)>=10000 and event_type in ('stay_time', 'page_view') then '10s_day'
else null
end as stay_time_concat
from tmp.stay_time
where data_date>=${bizStartDate} and data_date<=${bizEndDate}
and from_unixtime(floor(time/1000),'yyyyMMdd')=${etlDate}
)t1 where event_id_rn=1 and page_id is not null