最近写了一个 Pig 脚本,用来批量统计处理日志文件,具体来看看吧:
/*
 Usage:
   nohup pig -p INPUT=/staging/tracking/incoming/rtb.BJ.2015082516* -p SEID=9480 CheckIncomingData.pig &

 Sanity-checks a batch of incoming tracking logs: loads the files matched
 by $INPUT and reports record counts.
*/
-- Project jars providing the getField UDF and job classes.
REGISTER pig-ext-1.0-SNAPSHOT.jar;
REGISTER buzzads-bidding-jobs-0.1-SNAPSHOT.jar;
-- Elephant Bird (Twitter's Hadoop/Pig serialization helpers).
REGISTER elephant-bird-hadoop-compat-4.1.jar;
REGISTER elephant-bird-core-4.1.jar;
REGISTER elephant-bird-pig-4.1.jar;
-- Default reducer parallelism and the YARN queue to run on.
SET default_parallel 300;
SET mapreduce.job.queuename data;
-- getField(json, keys...) appears to extract a (possibly nested) field
-- from the JSON column. NOTE(review): semantics inferred from the usage
-- in the commented-out pipelines below — confirm against the UDF source.
DEFINE getField com.buzzinate.pig.udf.util.getField();
-- Input lines are tab-separated: timestamp, event name, JSON payload.
log = load '$INPUT' using PigStorage('\t') as (date:chararray, name:chararray, json:chararray);
-- Alternative pipelines, kept disabled. Enable one (plus the STORE at the
-- bottom) to filter a specific slice instead of counting everything.
-- For RTB logs: keep records whose rtb_hash.opxseid == '10060'.
--log_info = FOREACH log GENERATE getField(json, 'rtb_hash', 'opxseid') as seid;
--log_fil = FILTER log_info BY (chararray)seid == '10060';
-- For adgroup logs: keep records whose query_hash.opxseid == '10009'.
--log_info = FOREACH log GENERATE getField(json, 'query_hash', 'opxseid') as seid, json;
--log_fil = FILTER log_info BY (chararray)seid == '10009';
-- For image logs: keep records matching the $CREID / $TYPE parameters.
--log_info = FOREACH log GENERATE getField(json, 'query_hash', 'opxcreativeid') as creid, getField(json, 'query_hash', 'opxtype') as type, getField(json, 'query_hash', 'opxage') as age, getField(json, 'query_hash', 'opxgender') as gender;
--log_fil = FILTER log_info BY (chararray)creid == '$CREID' and (chararray)type == '$TYPE';
-- For event logs: deduplicate by uuid, keeping one record per event id.
/*
log_info = FOREACH log GENERATE getField(json, 'uuid') as event_id, json;
grp = GROUP log_info BY event_id;
log_fil = FOREACH grp{
log = LIMIT log_info 1;
GENERATE group as id, FLATTEN(log);
}
*/
--STORE log_fil INTO '$OUTPUT' USING PigStorage(',');
-- Sanity check: total record count over everything matched by $INPUT.
-- GROUP ... ALL collapses the relation into a single group, so the
-- result is exactly one row: (all, <total record count>).
grp = GROUP log ALL;
grped = FOREACH grp GENERATE group, COUNT(log) as cnt;
-- The former ORDER BY cnt DESC / LIMIT 20 were no-ops here: a GROUP ALL
-- result has only one row, so ordering and limiting it cannot change the
-- output. Dump the single summary row directly (identical output,
-- without scheduling two extra pointless MapReduce steps).
DUMP grped;