package com.cn.demo_xwjhb;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

public class MyInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    /**
     * Goal: merge small files into a single binary file (a SequenceFile).
     * The return value controls whether a file may be split; since each
     * small file must be read whole before merging, splitting is disabled.
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /**
     * Called by the framework when reading to obtain the record reader,
     * so it must return our custom RecordReader.
     *
     * @param inputSplit         the input split
     * @param taskAttemptContext the task context
     */
    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext)
            throws IOException, InterruptedException {
        MyRecordReader reader = new MyRecordReader();
        reader.initialize(inputSplit, taskAttemptContext);
        return reader;
    }
}
package com.cn.demo_xwjhb;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class MyRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private FileSplit fileSplit;
    private Configuration configuration;
    private BytesWritable value = new BytesWritable();
    private boolean processed = false;

    /**
     * Core working logic of a RecordReader:
     * nextKeyValue() reads the data and builds the key/value pair to return
     * (this is the main method to override);
     * getCurrentKey() and getCurrentValue() hand back what nextKeyValue() built.
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) inputSplit;
        this.configuration = taskAttemptContext.getConfiguration();
    }

    /**
     * Logic for building K1/V1:
     * allocate a byte array large enough to hold the entire split,
     * read the data in one call with IOUtils.readFully() over an FSDataInputStream,
     * and close the stream when done. Letting the IOException propagate (instead
     * of swallowing it) ensures a read failure fails the task rather than
     * emitting an empty value.
     */
    @Override
    public boolean nextKeyValue() throws IOException {
        if (!processed) {
            byte[] contents = new byte[(int) fileSplit.getLength()];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(configuration);
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                // There are two IOUtils classes (Hadoop and commons-io); pick one and use it consistently.
                IOUtils.readFully(in, contents, 0, contents.length);
                value.set(contents, 0, contents.length);
            } finally {
                IOUtils.closeStream(in);
            }
            processed = true;
            return true;
        }
        return false;
    }

    /**
     * Returns K1.
     */
    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    /**
     * Returns V1.
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Reports read progress; of little use here since the whole file
     * is consumed in a single call.
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // Nothing to close: the stream is closed inside nextKeyValue().
    }
}
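The original post does not show the map stage, but the driver settings below (Text keys, BytesWritable values) imply a mapper that keys each whole-file record by its file name. A minimal sketch of such a mapper, assuming the class name SequenceFileMapper (everything else is standard MapReduce API):

package com.cn.demo_xwjhb;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

// Hypothetical mapper (not in the original post): turns the
// (NullWritable, whole-file bytes) pairs produced by MyRecordReader into
// (file name, bytes) pairs so each record in the SequenceFile is identifiable.
public class SequenceFileMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

    private Text filenameKey = new Text();

    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // Because isSplitable() returns false, each split covers exactly one
        // small file, so the split's path uniquely names the record.
        FileSplit split = (FileSplit) context.getInputSplit();
        filenameKey.set(split.getPath().toString());
    }

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        context.write(filenameKey, value);
    }
}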
--------------------------------- Main program: binary file (SequenceFile) output -------------------------
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputPath(job, new Path("file:///D:\\dsj\\baishi课件\\hadoop\\5、大数据离线第五天\\5、大数据离线第五天\\自定义inputformat_小文件合并\\output"));
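For context, here is a sketch of a complete driver around those four lines. Only the output settings above appear in the original; the driver class name, the job name, the mapper wiring, and taking the paths from command-line arguments are all assumptions:

package com.cn.demo_xwjhb;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

// Hypothetical driver (only the four output lines appear in the original post).
public class SmallFileDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "small_files_to_sequencefile"); // job name is an assumption
        job.setJarByClass(SmallFileDriver.class);

        // Wire in the custom InputFormat so each small file becomes one record.
        job.setInputFormatClass(MyInputFormat.class);
        MyInputFormat.addInputPath(job, new Path(args[0])); // input dir taken from the command line

        job.setMapperClass(SequenceFileMapper.class); // the hypothetical mapper sketched earlier
        job.setNumReduceTasks(0);                     // map-only job: mapper output is written directly

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(args[1])); // output dir from the command line

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

With zero reducers, each mapper writes its (file name, bytes) pairs straight into the SequenceFile, which is exactly the merged binary output the InputFormat's comment describes.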