最近微博有一个任务需要使用日曝光表做统计,但该任务很容易在 stage1 的 reduce 100% 阶段卡住不动。任务负责人发邮件请我们帮忙排查问题或进行优化,我和同事一起分析了原因,以下是分析过程:
1、分析 HQL
-- Register the helper scripts as Hive session resources; the query below
-- invokes each of them by name in its transform ... using clauses.
add files get_recept_mid.sh get_last_expo_mid.sh get_real_recept.py;
insert overwrite directory '/user/liangjun/hive-liangjun/'
select transform(t2.uid,t7.mid,t2.expo_mid) using 'python get_real_recept.py' as uid,real_recept
from (
select transform(t6.*) using 'sh get_recept_mid.sh' as uid,mid
from ( select t3.fans_uid,t4.mid from ( select fans_uid,uid from mds_user_fanslist where dt=20140926 ) t3 join (
select transform(t5.*) using 'sh get_recept_mid.sh' as uid,mid
from ( select uid,mid from mds_bhv_pubblog where dt=20140926 distribute by uid sort by uid ) t5
) t4 on t3.uid=t4.uid distribute by t3.fans_uid sort by t3.fans_uid ) t6 ) t7 join (
select transform(t1.*) using 'sh get_recept_mid.sh' as uid,expo_mid
from ( select t8.* from (
select transform(ods_tblog_expo.uid,ods_tblog_expo.mid_list)
using 'sh get_last_expo_mid.sh' as uid,mid
from ods_tblog_expo where dt=20140926 and interface_id in ('1','5') ) t8 distribute by t8.uid sort
by t8.uid )