集群182个节点,一天的数据量20亿条,查询网站一天的流量数据:uv、pv、ip、cookie、onlinetime,其中uv、ip、cookie 需要distinct去重。reduce到达99%的时候,就卡死了,由于多个distinct 加上数据倾斜造成的。
优化之前的sql:
select sum(case when d.pv_flag=1 then 1 else 0 end) as pv,count(distinct id) as uv,count(distinct ip) as ip,sum(d.otime),count(distinct cookie),'$STA_TYPE', '$STA_TYPE' from access_dap d where log_date='$YESTERDAY' ;
优化之后的sql:
1.去重汇总
2.以空间换时间,借用union all的把数据根据distinct的字段扩充起来,假如有8个distinct,相当于数据扩充8倍,用rownumber=1来达到间接去重的目的,如果这里不计算整体pv的话,可以直接进行Group by效果一样。这里的unionall只走一个job,不会因为job多拖后腿(hadoop不怕数据量大【一定范围内】,就怕job多和数据倾斜)。
3.得到最终结果,没有一个distinct,全部走的是普通sum,可以在mapper端提前聚合,会很快
完整的sql:
create temporary function rownumber as 'com.renren.acorn.udf.RowNumber';
drop table if exists tmp_site_global_access_distinct_1_$DATE;
drop table if exists tmp_site_global_access_distinct_2_$DATE;
create table tmp_site_global_access_distinct_1_$DATE as select id,ip,cookie,idis_zero,sum(case when pv_flag=1 then 1 else 0 end) as pv,sum(otime) as onlinetime from ${TEMP_ACCESS_TABLE}${DATE} group by id,ip,cookie,idis_zero;
drop table if exists tmp_site_global_access_distinct_2_$DATE;
create table tmp_site_global_access_distinct_2_$DATE as select type,type_value,rownumber(type,type_value) as rn,pv,onlinetime from
(
select type,type_value,pv,onlinetime from
(
select 'id' as type,cast(id as string) as type_value,pv,onlinetime from tmp_site_global_access_distinct_1_$DATE where idis_zero=0 union all select 'ip' as type,ip as type_value,pv,onlinetime from tmp_site_global_access_distinct_1_$DATE union all
select 'cookie' as type,case when cookie='null' then 'acorn_cookie' else cookie end as type_value,pv,onlinetime from tmp_site_global_access_distinct_1_$DATE
) t1 cluster by type,type_value
) t2;
select sum(case when type='ip' then pv else cast(0 as bigint) end) as pv,
sum(case when type='id' and rn=1 then 1 else 0 end) as uv,
sum(case when type='ip' and rn=1 then 1 else 0 end) as ip,
sum(case when type='ip' then onlinetime else cast('0' as bigint) end) as onlinetime,
sum(case when type='cookie' and rn=1 then 1 else 0 end) as cookie,
'$STA_TYPE','$STA_TYPE'
from tmp_site_global_access_distinct_2_$DATE;
drop table if exists tmp_site_global_access_distinct_1_$DATE;
drop table if exists tmp_site_global_access_distinct_2_$DATE;
优化之前整个过程需要1个小时,而且有可能在99%的时候卡死,优化之后只需要不到10分钟;
RowNumber代码为:
public class RowNumber extends UDF {
private static int MAX_VALUE = 50;
private static String comparedColumn[] = new String[MAX_VALUE];
private static int rowNum = 1;
public int evaluate(Object... args) {
String columnValue[] = new String[args.length];
for (int i = 0; i < args.length; i++){
if (null == args[i]) {
columnValue[i] = "acorn_default";
} else {
columnValue[i] = args[i].toString();
}
}
if (rowNum == 1) {
for (int i = 0; i < columnValue.length; i++)
comparedColumn[i] = columnValue[i];
}
for (int i = 0; i < columnValue.length; i++) {
if (!comparedColumn[i].equals(columnValue[i])) {
for (int j = 0; j < columnValue.length; j++) {
comparedColumn[j] = columnValue[j];
}
rowNum = 1;
return rowNum++;
}
}
return rowNum++;
}
}