Pig 脚本记录:使用 Pig 脚本跑批处理

最近写了一个 Pig 脚本,用来对生成的日志文件做批处理统计,具体内容如下:

-- Example invocation (background job that keeps running after logout):
--   nohup pig -p INPUT=/staging/tracking/incoming/rtb.BJ.2015082516* -p SEID=9480 CheckIncomingData.pig &
-- INPUT is the path or glob read by the LOAD statement below.
-- NOTE(review): SEID is passed in the example, but no statement visible in this
-- script references it - confirm whether it is still needed.

-- Jars that must be on the classpath before the script body runs.
REGISTER pig-ext-1.0-SNAPSHOT.jar;
REGISTER buzzads-bidding-jobs-0.1-SNAPSHOT.jar;
REGISTER elephant-bird-hadoop-compat-4.1.jar;
REGISTER elephant-bird-core-4.1.jar;
REGISTER elephant-bird-pig-4.1.jar;

-- Job-level settings: default reducer parallelism and the target scheduler queue.
SET default_parallel 300;
SET mapreduce.job.queuename data;

-- Short alias for the getField UDF class.
DEFINE getField com.buzzinate.pig.udf.util.getField();

-- Read the tab-separated input as three chararray columns.
log = LOAD '$INPUT'
      USING PigStorage('\t')
      AS (date:chararray, name:chararray, json:chararray);

-- Optional filter variants, all commented out. Each variant, when enabled, defines
-- a relation named log_fil. The commented STORE statement at the end of this section
-- writes log_fil, comma-separated, to the path given by the OUTPUT parameter.
-- NOTE(review): getField is an external UDF. The descriptions of what it extracts
-- are inferred from its arguments - confirm against the UDF source.

--for rtb
-- Presumably extracts opxseid from rtb_hash in the json column, then keeps rows
-- whose seid equals the hard-coded value 10060.
--log_info = FOREACH log GENERATE getField(json, 'rtb_hash', 'opxseid') as seid;
--log_fil = FILTER  log_info BY (chararray)seid == '10060';

--for adgroup
-- Same shape as the rtb variant but reads from query_hash, carries the json column
-- along, and filters on the hard-coded value 10009.
--log_info = FOREACH log GENERATE getField(json, 'query_hash', 'opxseid') as seid, json;
--log_fil = FILTER  log_info BY (chararray)seid == '10009';

--for image
-- Projects creative id, type, age and gender from query_hash and filters on the
-- CREID and TYPE parameters, which must be supplied when this variant is enabled.
--log_info = FOREACH log GENERATE getField(json, 'query_hash', 'opxcreativeid') as creid, getField(json, 'query_hash', 'opxtype') as type, getField(json, 'query_hash', 'opxage') as age, getField(json, 'query_hash', 'opxgender') as gender;
--log_fil = FILTER  log_info BY (chararray)creid == '$CREID' and (chararray)type == '$TYPE';

--for event
-- Groups rows by the value getField returns for uuid and keeps at most one row per
-- group (nested LIMIT 1), so each event id appears once in log_fil.
/*
log_info = FOREACH log GENERATE getField(json, 'uuid') as event_id, json;

grp = GROUP log_info BY event_id;

log_fil = FOREACH grp{
	log = LIMIT log_info 1;
	GENERATE group as id, FLATTEN(log);
}
*/

-- Enable together with exactly one variant above to persist its log_fil relation.
--STORE log_fil INTO '$OUTPUT' USING PigStorage(',');

-- Put every loaded row into a single group so one total can be computed.
all_rows = GROUP log ALL;

-- Emit the group key together with the COUNT of the grouped bag.
-- NOTE: per the Pig built-in function docs, COUNT skips tuples whose first field
-- is null, whereas COUNT_STAR counts every tuple.
totals = FOREACH all_rows GENERATE group, COUNT(log) AS total;

-- Sort by the count, largest first, and keep at most 20 rows.
-- NOTE(review): GROUP ... ALL yields a single group, so these two steps presumably
-- only matter if the grouping key is changed - confirm before removing them.
ranked = ORDER totals BY total DESC;
top20 = LIMIT ranked 20;

-- Print the result to the console.
DUMP top20;






  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值