前言
- 为了实现控制最终文件的输出路径和输出格式,可以通过自定义outputFormat来进行实现。
- 但是由于MR机制问题,驱动类里必须声明一个输出目录,而且必须是不存在的!否则在校验阶段无法提交job!
需求
读取一个txt文件,将每行包含zhengkw的合并到一个文件中,剩下的行合并到另一个文件中!
Mapper
package com.zhengkw.outputformat;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Pass-through mapper: emits every input line unchanged, keyed by
 * {@link NullWritable} so that all records funnel into a single reduce group.
 *
 * @author zhengkw
 * @since jdk 1.8
 */
public class OutTxtMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

    /**
     * Forwards the raw line to the reducer; the actual file routing is
     * performed later by the custom RecordWriter.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(NullWritable.get(), value);
    }
}
Reducer
package com.zhengkw.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Reducer that swaps the key/value order: each incoming line becomes the
 * output key (with an explicit CRLF appended) and the value is
 * {@link NullWritable}.
 *
 * @author zhengkw
 * @since jdk 1.8
 */
public class OutTxtReducer extends Reducer<NullWritable, Text, Text, NullWritable> {

    /** Reused output key — avoids allocating a new Text per record. */
    private final Text outLine = new Text();

    @Override
    protected void reduce(NullWritable key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        for (Text line : values) {
            // CRLF is added here because the custom RecordWriter writes raw
            // bytes and does not append any line terminator itself.
            outLine.set(line.toString() + "\r\n");
            context.write(outLine, NullWritable.get());
        }
    }
}
OutTxtOutputFormat
package com.zhengkw.outputformat;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Custom output format that delegates all record writing to
 * {@link OutTxtRecordWriter}, which splits records across two hard-coded
 * files instead of the job's configured output directory.
 *
 * @author zhengkw
 * @since jdk 1.8
 */
public class OutTxtOutputFormat extends FileOutputFormat<Text, NullWritable> {

    /**
     * Creates the record writer for this task attempt.
     *
     * <p>Fix: the original override returned the raw {@code RecordWriter}
     * type and named its parameter {@code Context} (upper-case, against
     * convention). The signature is now properly parameterized.
     *
     * @param context task attempt context supplying the job configuration
     * @return a writer that routes each record to one of two files
     * @throws IOException if the writer cannot open its output streams
     */
    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new OutTxtRecordWriter(context);
    }
}
OutTxtRecordWriter
package com.zhengkw.outputformat;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
import static java.lang.System.out;
/**
 * Record writer that splits incoming lines across two local files:
 * lines containing the keyword "zhengkw" go to {@code zhengkw.log},
 * everything else to {@code other.log}.
 *
 * @author zhengkw
 * @since jdk 1.8
 */
public class OutTxtRecordWriter extends RecordWriter<Text, NullWritable> {

    private FileSystem fs = null;
    // NOTE(review): output locations are hard-coded to a local Windows drive;
    // consider deriving them from the job's output directory instead.
    private final Path zhengkwPath = new Path("f:/outputFormat/zhengkw.log");
    private final Path otherPath = new Path("f:/outputFormat/other.log");
    private FSDataOutputStream zhengkwOut = null;
    private FSDataOutputStream otherOut = null;

    public OutTxtRecordWriter() {
        super();
    }

    /**
     * Opens both output streams using the file system derived from the job
     * configuration.
     *
     * <p>Fix: the original caught the IOException, printed it, and carried on
     * with both streams still {@code null}, which guaranteed a
     * NullPointerException on the first {@link #write} call. Failures now
     * abort construction with the original exception as cause.
     *
     * @param context task attempt context supplying the configuration
     */
    public OutTxtRecordWriter(TaskAttemptContext context) {
        this();
        try {
            fs = FileSystem.get(context.getConfiguration());
            zhengkwOut = fs.create(zhengkwPath);
            otherOut = fs.create(otherPath);
        } catch (IOException e) {
            throw new IllegalStateException(
                    "Unable to open output streams for " + zhengkwPath + " / " + otherPath, e);
        }
    }

    /**
     * Routes one record to the matching file.
     *
     * <p>Fixes: the original matched {@code ".zhengkw."} (with surrounding
     * dots), which misroutes lines that contain the bare keyword — the stated
     * requirement is "lines containing zhengkw". It also used
     * {@code String.getBytes()} with the platform default charset; the Text's
     * own UTF-8 backing bytes are written instead.
     */
    @Override
    public void write(Text text, NullWritable nullWritable)
            throws IOException, InterruptedException {
        // Text.getBytes() returns the UTF-8 backing array, valid up to getLength().
        if (text.toString().contains("zhengkw")) {
            zhengkwOut.write(text.getBytes(), 0, text.getLength());
        } else {
            otherOut.write(text.getBytes(), 0, text.getLength());
        }
    }

    /** Closes both streams; IOUtils.closeStream tolerates null. */
    @Override
    public void close(TaskAttemptContext context)
            throws IOException, InterruptedException {
        IOUtils.closeStream(zhengkwOut);
        IOUtils.closeStream(otherOut);
    }
}
驱动类
package com.zhengkw.outputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * Driver for the custom-OutputFormat job: reads a text file and lets
 * {@link OutTxtOutputFormat} split lines containing "zhengkw" from the rest.
 *
 * @author zhengkw
 * @since jdk 1.8
 */
public class OutTxtFormatDriver {

    /**
     * Configures and submits the job, exiting 0 on success and 1 on failure.
     *
     * <p>Fix: the final output key/value classes were only present as
     * commented-out code; they are now declared to match the reducer's
     * actual output types ({@code Text}/{@code NullWritable}).
     */
    public static void main(String[] args)
            throws InterruptedException, IOException, ClassNotFoundException {
        Path inputPath = new Path("F:\\mrinput\\outputformat");
        // Submission-time validation rejects a pre-existing output directory,
        // so this (otherwise unused) placeholder path is removed up front.
        Path outputPath = new Path("f:/nothing");

        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        Job job = Job.getInstance(conf);
        job.setJarByClass(OutTxtFormatDriver.class);

        job.setMapperClass(OutTxtMapper.class);
        job.setReducerClass(OutTxtReducer.class);

        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setOutputFormatClass(OutTxtOutputFormat.class);

        FileInputFormat.setInputPaths(job, inputPath);
        // Required even though the custom RecordWriter writes to its own
        // hard-coded files: job submission checks that this path is settable.
        FileOutputFormat.setOutputPath(job, outputPath);

        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}