lzo的设置:
set mapred.output.compress=true;
set hive.exec.compress.output=true;set mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec;
set io.compression.codecs=com.hadoop.compression.lzo.LzopCodec;
执行脚本如下:
#!/bin/bash
day=$1
bash="./"
running="首焦统计上游数据click_orc is not ready"
partitionNumsOfDay=`hive -e "show partitions click_orc" | grep day=$day/hour= | wc -l`
sql="
set mapred.output.compress=true;
set hive.exec.compress.output=true;
set mapred.output.compression.codec=com.hadoop.compression.lzo.LzopCodec;
set io.compression.codecs=com.hadoop.compression.lzo.LzopCodec;
INSERT OVERWRITE TABLE app.clicks_statis partition (dt='$day')
SELECT
advertise_pin,
CASE WHEN mobile_type == 0 THEN 'PC'
ELSE '无线'
END as mobile_type,
pos_id,
ad_position as pos_name,
to_url,
to_url_clicks
FROM (SELECT advertise_pin,
pos_id,
mobile_type,
regexp_extract(to_url, '(.*)(\\\?|\\\&){1}(jj_pop)', 1) as to_url,
count(to_url) as to_url_clicks
FROM click_orc
where day = '$day'
AND pos_id in (1,2,3)
group by advertise_pin, pos_id, mobile_type,regexp_extract(to_url, '(.*)(\\\?|\\\&){1}(jj_pop)', 1)) click_orc
JOIN (SELECT id, ad_position
FROM media_chain
WHERE dp = 'ACTIVE'
AND id in (1,2,3)) media
ON click_orc.pos_id = media.id;"
if [ $partitionNumsOfDay -eq 24 ]; then
hive -e "$sql"
else
println("$running")
fi
建表语句如下:
create external table clicks_statis(
advertise_pin string comment '广告主pin '
,mobile_type string comment '点击设备类型'
,pos_id bigint comment '广告位id'
,pos_name string comment '广告位名称'
,to_url string comment ''
,to_url_clicks bigint comment '点击次数'
) comment '首焦落地页点击次数统计'
partitioned by(dt string)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES
('field.delim' = '\t',
'serialization.null.format' = ''
)STORED AS INPUTFORMAT "com.hadoop.mapred.DeprecatedLzoTextInputFormat"
OUTPUTFORMAT "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat"
;