1. Custom InputFormat to Merge Small Files
1.1 Requirement
Both HDFS and MapReduce handle small files inefficiently, yet in practice we inevitably face scenarios that involve processing large numbers of small files.
1.2 Analysis
Ways to optimize for small files:
- Merge the small files into large files at data-collection time, before uploading them to HDFS.
- Before business processing, run a MapReduce program on HDFS to merge the small files.
- During MapReduce processing, use a combining input format such as CombineTextInputFormat to improve efficiency (see the sketch right after this list).
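As an aside, the third option usually amounts to two lines in the driver. A minimal sketch, assuming a standard Job object named job; the 4 MB cap is an illustrative value, not a recommendation:

job.setInputFormatClass(CombineTextInputFormat.class);
// Pack many small files into each split, capped at ~4 MB per split (illustrative value)
CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);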
1.3 Implementation
Here we implement the second merging approach.
We define a custom InputFormat that extends the FileInputFormat class. We also need to implement a MyRecordReader class that defines how the files are read:
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class MyRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private Configuration configuration = null;
    private FileSplit fileSplit = null;
    private boolean processed = false;
    private BytesWritable bytesWritable = new BytesWritable();
    private FileSystem fileSystem = null;
    private FSDataInputStream inputStream = null;

    // Initialization
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {
        // Get the file split
        fileSplit = (FileSplit) inputSplit;
        // Get the Configuration object
        configuration = taskAttemptContext.getConfiguration();
    }

    // Produce k1/v1
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            // 1: Open an input stream on the source file
            fileSystem = FileSystem.get(configuration);
            inputStream = fileSystem.open(fileSplit.getPath());
            // 2: Read the entire source file into a plain byte array
            byte[] bytes = new byte[(int) fileSplit.getLength()];
            IOUtils.readFully(inputStream, bytes, 0, (int) fileSplit.getLength());
            // 3: Wrap the byte array in a BytesWritable to get v1
            bytesWritable.set(bytes, 0, (int) fileSplit.getLength());
            processed = true;
            return true;
        }
        return false;
    }

    // Return k1
    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    // Return v1
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return bytesWritable;
    }

    // Report read progress: the whole file is consumed in a single step
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    // Release resources; the FileSystem instance is cached and shared by Hadoop,
    // so we only close the stream here
    @Override
    public void close() throws IOException {
        IOUtils.closeStream(inputStream);
    }
}
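Because the processed flag flips to true after the first call, nextKeyValue() returns true exactly once per split, so every small file becomes a single k1/v1 pair whose value carries the complete file contents.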
Define the MyInputFormat class and point it at our RecordReader:
import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class MyInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    // Hand out our custom RecordReader
    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(
            InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {
        // 1: Create the custom RecordReader object
        MyRecordReader myRecordReader = new MyRecordReader();
        // 2: Pass the inputSplit and context objects to MyRecordReader
        myRecordReader.initialize(inputSplit, taskAttemptContext);
        return myRecordReader;
    }

    // Control whether a file may be split.
    // Since we are merging small files into one, each file must stay whole, so return false.
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
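Note that the MapReduce framework also calls initialize() on the reader it gets back from createRecordReader(), so the explicit call in step 2 above is redundant, though harmless.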
Define the Map class that merges the files it reads: each file's name becomes k2 and its bytes become v2.
import java.io.IOException;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SequenceFileMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
    @Override
    protected void map(NullWritable key, BytesWritable value, Context context)
            throws IOException, InterruptedException {
        // 1: Get the file name, which becomes K2
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String fileName = fileSplit.getPath().getName();
        // 2: Write K2 and V2 to the context
        context.write(new Text(fileName), value);
    }
}
When creating the job, note that the output class must be set with setOutputFormatClass:
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputPath(job, new Path("file:///D:\\out\\myinputformat_out"));
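For completeness, here is a minimal sketch of the whole driver. The JobMain class name and the input path are illustrative assumptions (only the output path comes from the snippet above); the essential wiring is MyInputFormat, SequenceFileMapper, and SequenceFileOutputFormat:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class JobMain {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "merge_small_files");
        job.setJarByClass(JobMain.class);

        // Read whole files with our custom InputFormat (input path is illustrative)
        job.setInputFormatClass(MyInputFormat.class);
        MyInputFormat.addInputPath(job, new Path("file:///D:\\input\\myinputformat_in"));

        // Map each file to <file name, file bytes>; no reduce stage is needed
        job.setMapperClass(SequenceFileMapper.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setNumReduceTasks(0);

        // Write the merged result as a SequenceFile
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path("file:///D:\\out\\myinputformat_out"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}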