以Apache 日志为例。
创建加载器
@Override
public Tuple getNext() throws IOException {
    // Pre-fill the backing list with 11 null slots, one per Common Log
    // Format field; setTuple() overwrites them positionally.
    tuple = new ArrayList<Object>(11);
    for (int i = 0; i < 11; i++) {
        tuple.add(null);
    }
    try {
        // The RecordReader returns false at the end of the input split;
        // Pig's contract is to return null to signal end-of-data.
        if (!in.nextKeyValue()) {
            return null;
        }
        setTuple(in.getCurrentValue());
        return factory.newTupleNoCopy(tuple);
    } catch (InterruptedException e) {
        // Restore the interrupt status so callers up the stack can still
        // observe the interruption, then surface it as a Pig ExecException.
        Thread.currentThread().interrupt();
        int errCode = 6018;
        String errMsg = "Error while reading input";
        throw new ExecException(errMsg, errCode,
                PigException.REMOTE_ENVIRONMENT, e);
    }
}
// Copy the parsed log-entry fields into the pre-sized tuple, in schema order.
private void setTuple(CommonLogWritable entry) throws IOException {
    Object[] values = {
            entry.getRemoteAddress(),
            entry.getRemoteLogname(),
            entry.getUserId(),
            entry.getTime(),
            entry.getRequestLine(),
            entry.getStatusCode(),
            entry.getObjSize(),
            entry.getMethod(),
            entry.getResource(),
            entry.getProtocol(),
            entry.getEpoch()
    };
    for (int slot = 0; slot < values.length; slot++) {
        tuple.set(slot, values[slot]);
    }
}
我们希望加载器可以指定字段的类型信息,这里需要实现LoadMetadata接口,并提供字段名称和相关信息的有序列表。
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    // Field names and their Pig types, kept in the same order as the
    // positions written by setTuple().
    String[] names = {
            CommonLogLoaderConstants.REMOTE_ADDR,
            CommonLogLoaderConstants.REMOTE_LOGNAME,
            CommonLogLoaderConstants.USERID,
            CommonLogLoaderConstants.TIME,
            CommonLogLoaderConstants.REQUEST_LINE,
            CommonLogLoaderConstants.STATUS_CODE,
            CommonLogLoaderConstants.OBJ_SIZE,
            CommonLogLoaderConstants.METHOD,
            CommonLogLoaderConstants.RESOURCE,
            CommonLogLoaderConstants.PROTOCOL,
            CommonLogLoaderConstants.EPOCH
    };
    byte[] types = {
            DataType.CHARARRAY,
            DataType.CHARARRAY,
            DataType.CHARARRAY,
            DataType.CHARARRAY,
            DataType.CHARARRAY,
            DataType.LONG,
            DataType.LONG,
            DataType.CHARARRAY,
            DataType.CHARARRAY,
            DataType.CHARARRAY,
            DataType.LONG
    };
    ArrayList<Schema.FieldSchema> fields =
            new ArrayList<Schema.FieldSchema>(names.length);
    for (int i = 0; i < names.length; i++) {
        fields.add(new Schema.FieldSchema(names[i], types[i]));
    }
    return new ResourceSchema(new Schema(fields));
}
pig脚本:
-- Register the jars containing the custom loader and the GeoIP UDF.
REGISTER pig.jar;
REGISTER geoip-api-1.2.14.jar;
-- Bind the custom Common Log Format loader (defined above).
DEFINE LogLoader com.hadoop2.pig.CommonLogLoader();
logs = LOAD 'access.log' USING LogLoader;
grpd = GROUP logs BY statusCode;
/** Count the number of requests per status code (e.g. failed requests) */
cntd = FOREACH grpd GENERATE group,COUNT(logs);
/** Project down to fewer fields to reduce per-record processing */
projected_logs = FOREACH logs GENERATE remoteAddr,statusCode,resource;
-- Count requests per (client IP, status code) pair.
ip_group = GROUP projected_logs BY (remoteAddr,statusCode);
addrstatus_count = FOREACH ip_group GENERATE FLATTEN(group),COUNT(projected_logs);
-- Append a GeoIP country lookup for each client address.
DEFINE GeoIP com.hadoop2.pig.PigGeolocationUDF();
countries = FOREACH addrstatus_count GENERATE *,GeoIP(remoteAddr);
dump countries;
输出结果:
(10.0.1.75,302,2)
(10.0.1.91,200,12)
(10.0.1.91,301,2)
(10.0.5.17,200,114)
(10.0.5.17,301,2)
(10.0.5.17,302,2)
(10.0.5.17,503,2)
(10.0.5.50,200,8)
(10.0.5.78,200,74)
(10.0.5.91,200,26)
(10.0.5.92,200,58)
(10.0.5.92,503,8)
(10.0.5.173,200,12)
(10.0.5.211,200,62)
(10.0.6.168,200,8)
(10.0.6.193,200,54)
(10.0.6.196,200,16)
(10.0.6.197,200,636)
(10.0.6.247,200,6)
(172.16.1.36,200,2380)
(172.16.1.36,503,64)
由于我的日志都来源于内网 IP,所以最后一步的国家(GeoIP)统计没有得到有效结果。