目录
OutputFormat负责处理Reduce阶段之后的输出,是MapReduce作业将结果数据写出的最后一步。
1.OutputFormat接口实现类
2. 自定义OutputFormat案例实操
1)需求
过滤输入的log日志,包含atguigu的网站输出到e:/atguigu.log,不包含atguigu的网站输出到e:/other.log。
(1)输入数据:txt文件
http://www.baidu.com
http://www.google.com
http://cn.bing.com
http://www.atguigu.com
http://www.sohu.com
http://www.sina.com
http://www.sin2a.com
http://www.sin2desa.com
http://www.sindsafa.com
(2)期望输出数据
2)需求分析
3)代码
(1)编写LogMapper类
/**
 * Mapper that forwards every input line unchanged.
 * The whole line becomes the output key; NullWritable is a placeholder value.
 */
public class LogMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Example input lines:
        //   http://www.baidu.com
        //   http://www.google.com
        // Emit (line, NullWritable) with no transformation at all.
        context.write(value, NullWritable.get());
    }
}
(2)编写LogReducer类
/**
 * Reducer that re-emits each key once per grouped value.
 * Duplicate input lines arrive grouped under a single key; writing the key
 * once for every value preserves every duplicate instead of collapsing them.
 */
public class LogReducer extends Reducer<Text, NullWritable, Text, NullWritable> {

    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        for (NullWritable ignored : values) {
            context.write(key, NullWritable.get());
        }
    }
}
(3)自定义一个LogOutputFormat类
/**
 * Custom FileOutputFormat; its type parameters match the reducer's output (k, v).
 */
public class LogOutputFormat extends FileOutputFormat<Text, NullWritable> {

    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        // Hand back the custom writer that routes records to the two log files.
        return new LogRecordWriter(job);
    }
}
(4)编写LogRecordWriter类
/**
 * RecordWriter that routes each record to one of two output files:
 * lines containing "atguigu" go to atguigu.log, everything else to other.log.
 */
public class LogRecordWriter extends RecordWriter<Text, NullWritable> {

    private FSDataOutputStream atguiguOut;
    private FSDataOutputStream otherOut;

    public LogRecordWriter(TaskAttemptContext job) {
        // Open both output streams up front.
        try {
            FileSystem fs = FileSystem.get(job.getConfiguration());
            atguiguOut = fs.create(new Path("D:\\code\\Hadoop\\atguigu.log"));
            otherOut = fs.create(new Path("D:\\code\\Hadoop\\other.log"));
        } catch (IOException e) {
            // Fix: the original only called printStackTrace(), leaving the
            // stream fields null — write() would then throw an NPE far from
            // the real cause. Close whatever was already opened (closeStream
            // is null-safe) and fail fast, preserving the cause.
            IOUtils.closeStream(atguiguOut);
            throw new RuntimeException("Failed to create output streams", e);
        }
    }

    @Override
    public void write(Text text, NullWritable nullWritable) throws IOException, InterruptedException {
        String log = text.toString();
        // Fix: writeBytes() keeps only the low byte of each char, corrupting
        // any non-ASCII content. Encode the line as UTF-8 bytes instead.
        byte[] line = (log + "\n").getBytes(java.nio.charset.StandardCharsets.UTF_8);
        // Route by content: lines mentioning "atguigu" vs. everything else.
        if (log.contains("atguigu")) {
            atguiguOut.write(line);
        } else {
            otherOut.write(line);
        }
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Close both streams; IOUtils.closeStream is null-safe.
        IOUtils.closeStream(atguiguOut);
        IOUtils.closeStream(otherOut);
    }
}
(5)编写LogDriver类
/**
 * Driver: wires the mapper, reducer and custom OutputFormat into a job
 * and runs it against the local input/output paths.
 */
public class LogDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Fix: the original called the no-arg Job.getInstance(), so the
        // Configuration created above was silently ignored.
        Job job = Job.getInstance(conf);
        job.setJarByClass(LogDriver.class);
        job.setMapperClass(LogMapper.class);
        job.setReducerClass(LogReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        // Plug in the custom OutputFormat.
        job.setOutputFormatClass(LogOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\code\\Hadoop\\input\\inputoutputformat"));
        // Even with a custom OutputFormat, an output directory is still
        // required: it extends FileOutputFormat, which writes a _SUCCESS
        // marker file there and validates the path in checkOutputSpecs.
        FileOutputFormat.setOutputPath(job, new Path("D:\\code\\Hadoop\\output8"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}