MapReduce 自定义FileOutputFormat实现日志筛选和输出文件重命名
理论分析
本文采用自定义FileOutputFormat的方式重写getRecordWriter方法,自定义RecordWriter类实现日志的筛选和输出重命名。
(1)自定义一个类MyFileOutPutFormat继承FileOutputFormat
(2)重写getRecordWriter方法,调用自定义的MyRecordWriter类
(3)自定义一个类MyRecordWriter继承RecordWriter来实现文件的写出
(4)自定义MyRecordWriter的构造函数实现输出文件流的创建
(5)重写RecordWriter类的write和close方法实现筛选分类输出
案例
原日志文件log.txt
http://www.filter.com
http://www.sohu.com
http://www.sina.com
http://www.sin2a.com
http://www.sin2desa.com
http://www.sindsafa.com
http://www.filter.com
http://www.baidu.com
http://www.google.com
http://cn.bing.com
http://www.filter.com
http://www.sohu.com
http://www.sina.com
http://www.sin2a.comw.google.com
http://cn.bing.com
http://www.filter.com/29
http://www.sohu.com
http://www.sina.com
http://www.sin2a.com
http://www.sin2desa.comw.google.com
http://cn.bing.com
http://www.filter.com/39
http://www.sohu.com
http://www.sina.com
http://www.sin2a.com
http://www.sin2desa.com
http://www.sin2desa.com
http://www.sindsafa.com
http://www.filter.com
Map
/**
 * Forwards every input record unchanged: the incoming value becomes the
 * output key and is paired with the {@link NullWritable} singleton.
 *
 * @param key     input key supplied by the framework; not used here
 * @param value   the input record, emitted as the output key
 * @param context task context used to emit the output pair
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
    final NullWritable empty = NullWritable.get();
    context.write(value, empty);
}
Reduce
// Reusable output key; its content is overwritten on every reduce call.
Text k = new Text();

/**
 * Emits each distinct key exactly once with a "\r\n" terminator appended,
 * paired with the {@link NullWritable} singleton. The grouped values are
 * never iterated.
 *
 * @param key     the grouped key
 * @param values  values grouped under the key; ignored
 * @param context task context used to emit the output pair
 * @throws IOException          if writing to the context fails
 * @throws InterruptedException if the write is interrupted
 */
@Override
protected void reduce(Text key, Iterable<NullWritable> values, Context context)
        throws IOException, InterruptedException {
    final String line = key.toString().concat("\r\n");
    k.set(line);
    final NullWritable empty = NullWritable.get();
    context.write(k, empty);
}
Driver
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
args = new String[]{ "/Users/alu/workproject/Training/src/main/resources/define_in_out_format_homework/out/log.txt",
"/Users/alu/workproject/Training/src/main/resources/define_in_out_format_homework/out/current"
};
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(OutFormatDirver.class);
job.setMapperClass(OutFormatMap.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class)
job.setReducerClass(OutFormatReduce.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
Path path = new Path(args[1]);
FileSystem file = FileSystem.get(conf);
if (file.exists(path)) {
file.delete(path, true);
}
job.setOutputFormatClass(MyFileOutPutFormat.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// job.setPartitionerClass(MapReduceMyPartition.class);
// job.setNumReduceTasks(3);
//多个小文件时可以设置InputFormatClass用来合并小文件避免产生很多maptask,降低效率。
// job.setInputFormatClass(CombineTextInputFormat.class);
// CombineTextInputFormat.setMaxInputSplitSize(job, 4194304);
// CombineTextInputFormat.setMinInputSplitSize(job, 2097152);
job.waitForCompletion(true);
MyFileOutPutFormat
/**
 * Output format for (Text, NullWritable) records whose record writer is a
 * {@link MyRecordWriter}.
 */
public class MyFileOutPutFormat extends FileOutputFormat<Text, NullWritable> {

    /**
     * Builds the record writer for the given task attempt.
     *
     * @param context the task attempt context, passed on to the writer's constructor
     * @return a newly constructed {@link MyRecordWriter}
     * @throws IOException          declared by the overridden method
     * @throws InterruptedException declared by the overridden method
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        final RecordWriter<Text, NullWritable> writer = new MyRecordWriter(context);
        return writer;
    }
}
MyRecordWriter
public class MyRecordWriter extends RecordWriter<Text, NullWritable> {
FSDataOutputStream filterOut = null;
FSDataOutputStream otherOut = null;
public MyRecordWriter(TaskAttemptContext context) {
Path outputPath = FileOutputFormat.getOutputPath(context);
try {
FileSystem fs = FileSystem.get(context.getConfiguration());
filterOut = fs.create(new Path(outputPath, "filter.log"));
otherOut = fs.create(new Path(outputPath, "other.log"));
} catch (Exception e) {
e.printStackTrace();
}
}
@Override
public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
if (text.toString().contains("filter")) {
filterOut.write(text.toString().getBytes());
} else {
otherOut.write(text.toString().getBytes());
}
}
@Override
public void close(TaskAttemptContext context) throws IOException, InterruptedException {
if (filterOut != null) {
filterOut.close();
}
if (otherOut != null) {
otherOut.close();
}
}
生成筛选后的日志文件
filter.log
http://www.filter.com
http://www.filter.com/29
http://www.filter.com/39
other.log
http://cn.bing.com
http://www.baidu.com
http://www.google.com
http://www.sin2a.com
http://www.sin2a.comw.google.com
http://www.sin2desa.com
http://www.sin2desa.comw.google.com
http://www.sina.com
http://www.sindsafa.com
http://www.sohu.com