Requirement:
Merge multiple small files into a single SequenceFile. The SequenceFile stores one entry per input file: the key is the file path plus file name, and the value is the file content.
Implementation steps
1. Write a custom class that extends FileInputFormat.
2. Override the RecordReader so that it reads an entire file in one pass and wraps it as a single k-v pair.
3. Write the merged output with SequenceFileOutputFormat.
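Before walking through the code, here is a quick way to produce some small test input. This is a sketch only; the directory is an assumption chosen to match the input path used by the Driver at the end of this post.

package com.aura.hadoop.inputformat;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

// Sketch: create a few small files to feed the job. The directory is an
// assumption; point it at whatever the Driver uses as its input path.
public class MakeTestFiles {
    public static void main(String[] args) throws IOException {
        Path dir = Paths.get("D:\\data\\hadoopdata\\自定义输入流");
        Files.createDirectories(dir);
        for (int i = 1; i <= 3; i++) {
            Files.write(dir.resolve("small" + i + ".txt"),
                    ("contents of small file " + i + "\n").getBytes(StandardCharsets.UTF_8));
        }
    }
}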
Code implementation
1. Write a custom class that extends FileInputFormat.
package com.aura.hadoop.inputformat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
/**
 * @author panghu
 * @description Custom file input format; TextInputFormat is a useful reference.
 * The goal is to merge multiple small files into one SequenceFile, with the file
 * path plus file name as the key and the raw file bytes as the value.
 * @create 2021-02-14-21:42
 */
public class MyInputFormat extends FileInputFormat<Text, BytesWritable> {
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        return new MyRecordReader();
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // The RecordReader always reads a whole file, so never split one across input splits.
        return false;
    }
}
2. Implement the custom RecordReader.
package com.aura.hadoop.inputformat;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
 * @author panghu
 * @description RecordReader for the custom input format: turns an entire file
 * into a single k-v pair.
 * @create 2021-02-14-21:48
 */
public class MyRecordReader extends RecordReader<Text, BytesWritable> {
    private Text key = new Text();
    private BytesWritable value = new BytesWritable();
    // Tracks whether the file has already been read
    private boolean isRead = false;
    private FileSplit fs;
    private FSDataInputStream inputStream;

    /**
     * Initialization: cast the split and open a stream to its file.
     *
     * @param inputSplit
     * @param taskAttemptContext
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // Open the stream and acquire resources
        fs = (FileSplit) inputSplit;
        FileSystem fileSystem = FileSystem.get(taskAttemptContext.getConfiguration());
        inputStream = fileSystem.open(fs.getPath());
    }

    /**
     * Reads the next k-v pair.
     *
     * @return
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!isRead) { // the file has not been read yet
            key.set(fs.getPath().toString()); // the key is the file path plus file name
            // Read the whole file into a buffer. The entire file is held in
            // memory, so this approach only suits genuinely small files.
            byte[] buffer = new byte[(int) fs.getLength()];
            // readFully keeps reading until the buffer is full; a single read()
            // call is not guaranteed to return all the bytes.
            IOUtils.readFully(inputStream, buffer, 0, buffer.length);
            value.set(buffer, 0, buffer.length);
            isRead = true;
            return true;
        } else {
            return false;
        }
    }

    /**
     * Returns the current key.
     *
     * @return
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /**
     * Returns the current value.
     *
     * @return
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Reports the read progress.
     *
     * @return
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        // The whole file becomes one k-v pair, so progress is 1 once the file
        // has been read and 0 before that.
        return isRead ? 1 : 0;
    }

    /**
     * Releases resources.
     *
     * @throws IOException
     */
    @Override
    public void close() throws IOException {
        IOUtils.closeStream(inputStream);
    }
}
3. Because neither the map side nor the reduce side transforms the data, the Mapper and Reducer classes can be omitted and only the Driver needs to be written.
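For reference, omitting them is equivalent to the explicit identity classes sketched below. This sketch is illustrative only: Hadoop's default Mapper and Reducer already forward every record unchanged, so neither class needs to be written or registered with the job.

package com.aura.hadoop.inputformat;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;

// Illustrative sketch: equivalent to Hadoop's built-in default (identity) classes.
public class PassThrough {
    public static class PassThroughMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
        @Override
        protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
            context.write(key, value); // forward the pair unchanged
        }
    }

    public static class PassThroughReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
        @Override
        protected void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException, InterruptedException {
            for (BytesWritable value : values) {
                context.write(key, value); // forward each value unchanged
            }
        }
    }
}

The Driver class: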
package com.aura.hadoop.inputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.IOException;
/**
 * @author panghu
 * @description Driver: wires the custom input format to SequenceFileOutputFormat.
 * @create 2021-02-14-22:30
 */
public class MyInputFormatDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(MyInputFormatDriver.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        // Set the input and output format classes
        job.setInputFormatClass(MyInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\data\\hadoopdata\\自定义输入流\\"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\data\\myInputFormat_out"));
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
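After the job finishes, the merge can be verified by reading the SequenceFile back with SequenceFile.Reader. A minimal sketch, assuming the default single reducer so that the output file is named part-r-00000 under the output directory used above:

package com.aura.hadoop.inputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import java.io.IOException;

// Sketch: dump each key (original file path) and value size from the merged file.
public class SequenceFileDump {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        // Assumed output file name; adjust if the job wrote elsewhere.
        Path path = new Path("D:\\data\\myInputFormat_out\\part-r-00000");
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                System.out.println(key + " -> " + value.getLength() + " bytes");
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}

Each printed key should be a full file path, one line per original small file, confirming that all of the inputs landed in the single SequenceFile.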