自定义 InputFormat--小文件合并_whole合并小文件-CSDN博客

本文链接：https://blog.csdn.net/qq_28844767/article/details/80491464

实现思路：

1、编写自定义的 InputFormat

2、改写 RecordReader，使每个 maptask 一次读取一个小文件的完整内容，并将其封装成一个 KV 对

3、在 Driver 类中一定要设置使用自定义的 InputFormat：job.setInputFormatClass(WholeFileInputFormat.class)

编写 MapReduce 程序:

package mapreduce.format.inputformat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;

/**
 *  通过自定义InputFormat合并小文件 这里通过extends Configured implements Tool
 * 	改写mapreduce的运行方式，是mr程序的另外一种运行方式
 *  
 */


import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * MapReduce driver that packs small files into larger output files.
 *
 * <p>Each input file is delivered to the mapper as a single record by
 * {@code WholeFileInputFormat}; the mapper emits (file path, file content).
 * The job is launched through {@link ToolRunner}, so generic Hadoop options
 * ({@code -D key=value}, {@code -conf}, ...) given on the command line are
 * parsed before {@link #run(String[])} is called.
 *
 * <p>Usage: {@code SmallFilesConvertToBigMR <input path> <output path>}
 */
public class SmallFilesConvertToBigMR extends Configured implements Tool {

	/**
	 * Program entry point: delegates to {@link ToolRunner} and exits with the
	 * status returned by {@link #run(String[])}.
	 *
	 * @param args generic Hadoop options followed by the input and output paths
	 * @throws Exception if the job cannot be configured or submitted
	 */
	public static void main(String[] args) throws Exception {
		int exitCode = ToolRunner.run(new SmallFilesConvertToBigMR(), args);
		System.exit(exitCode);
	}

	/**
	 * Configures and runs the job.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output
	 *             path; an existing output path is deleted recursively first
	 * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are
	 *         missing
	 * @throws Exception if the job cannot be configured or submitted
	 */
	@Override
	public int run(String[] args) throws Exception {

		// Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
		if (args == null || args.length < 2) {
			System.err.println("Usage: SmallFilesConvertToBigMR <input path> <output path>");
			ToolRunner.printGenericCommandUsage(System.err);
			return 2;
		}

		// Use the configuration prepared by ToolRunner so that generic options
		// given on the command line are honoured; fall back to a fresh one when
		// run() is called directly without ToolRunner.
		Configuration conf = getConf();
		if (conf == null) {
			conf = new Configuration();
		}
//		Alternatively, load the cluster settings from the classpath:
//		conf.addResource("hdfs_config/core-site.xml");
//		conf.addResource("hdfs_config/hdfs-site.xml");

		// Hard-coded cluster address and user; this overrides any fs.defaultFS
		// supplied through the generic options.
		conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
		System.setProperty("HADOOP_USER_NAME", "hadoop");

		// Give the job a name when creating it.
		Job job = Job.getInstance(conf, "combine small files to bigfile");

		job.setJarByClass(SmallFilesConvertToBigMR.class);

		job.setMapperClass(SmallFilesConvertToBigMRMapper.class);
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(Text.class);

		job.setReducerClass(SmallFilesConvertToBigMRReducer.class);
		job.setOutputKeyClass(NullWritable.class);
		job.setOutputValueClass(Text.class);

		// TextInputFormat is the default input format; replace it with the
		// custom WholeFileInputFormat.
		job.setInputFormatClass(WholeFileInputFormat.class);

		// NOTE(review): with zero reduce tasks the job is map-only, so the
		// reducer configured above never runs and the mapper's (path, content)
		// pairs are written directly. Kept as in the original - confirm that
		// this is intended.
		job.setNumReduceTasks(0);

		Path input = new Path(args[0]);
		Path output = new Path(args[1]);
		FileInputFormat.setInputPaths(job, input);

		// Resolve the filesystem from the output path itself so that a fully
		// qualified path on a non-default filesystem is handled correctly.
		FileSystem fs = output.getFileSystem(conf);
		if (fs.exists(output)) {
			fs.delete(output, true);
		}
		FileOutputFormat.setOutputPath(job, output);

		return job.waitForCompletion(true) ? 0 : 1;
	}

	/**
	 * Emits the content of each input file keyed by the file's path.
	 */
	private static class SmallFilesConvertToBigMRMapper extends Mapper<NullWritable, Text, Text, Text> {

		/** Path of the file behind the current split, set once in {@link #setup}. */
		private Text filenameKey;

		/**
		 * Captures the path of the split's file before any call to {@code map}.
		 * The split is cast to {@link FileSplit}.
		 */
		@Override
		protected void setup(Context context) {
			InputSplit split = context.getInputSplit();
			Path path = ((FileSplit) split).getPath();
			filenameKey = new Text(path.toString());
		}

		/**
		 * Writes the incoming value under the file path captured in
		 * {@link #setup}. The value is presumably the whole file content as
		 * produced by the custom input format - that class is not shown here.
		 */
		@Override
		protected void map(NullWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			context.write(filenameKey, value);
		}
	}

	/**
	 * Writes the first value of each key and drops the key.
	 */
	static class SmallFilesConvertToBigMRReducer extends Reducer<Text, Text, NullWritable, Text> {

		@Override
		protected void reduce(Text filename, Iterable<Text> bytes, Context context)
				throws IOException, InterruptedException {
			// Only the first value is written; any further values for the same
			// key are ignored.
			context.write(NullWritable.get(), bytes.iterator().next());
		}

	}

}

因为没有使用默认的数据读取 Format，而是使用自定义的 WholeFileInputFormat，所以还需要编写与之配套的 RecordReader（WholeFileRecordReader）：

package mapreduce.format.inputformat;
/**
 * Custom RecordReader: reads the whole content of one file as a single record.
 *
 */

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeFileRecordReader extends RecordReader<NullWritable, Text> {

	private FileSplit fileSplit;
	private Configuration conf;
	private Text value = new Text();
//	进度
	private boolean processed = false;
	
	@Override
	public void initialize(InputSplit split, TaskAttemptContext context) throws 
IOException, InterruptedException {

		this.fileSplit = (FileSplit)split;
		this.conf = context.getConfiguration();

	}
	
//  nextKeyValue()方法是RecordReader最重要的方法，也就是RecordReader读取文件的读取逻辑所在


地
//	所以我们要自定义RcordReader,就需要重写nextKeyValue()的实现
	@Override
	public boolean nextKeyValue() throws IOException, InterruptedException {
		
//		只有当还没有读取完毕的时候，才进行数据的读取
		if(! processed){
			
//			创建一个输入切片的字节数组,用来存储将要读取的数据内容
			byte[] contents = new byte[(int)fileSplit.getLength()];
			
//			当前maptask读取到的一个完整的一个小文件的路径地址
//			通过fileSplit获取该逻辑切片在文件系统的位置
			Path file = fileSplit.getPath();
			
//			通过该file对象获取该切片所在的文件系统
			FileSystem fs = file.getFileSystem(conf);
			FSDataInputStream in = null;
			
//			接下来做的事情就是把整个文件的内容当做value写出去
//			文件系统对象fs打开一个file的输入流
			in = fs.open(file);
//			in是输入流,contents是存这个流读取到数的数据的字节数组
			IOUtils.readFully(in, contents, 0, contents.length);
			
//			最后把读到的数据封装到value里面，value就是最后传入map方法执行的


key-value的value
			value.set(contents, 0, contents.length);
			
//			采用hadoop提供的工具关闭流
			IOUtils.closeStream(in);
			
			processed = true;
//			表示读取数据的时候还有没有下一个。 有 返回 true
			return true;
		}
		
//		下一次再读就没有了。因为一次性的全部读入到value里了。
		return false;
		
	}


	@Override
	public NullWritable getCurrentKey() throws IOException, InterruptedException {
		
		return NullWritable.get();
	}


	@Override
	public Text getCurrentValue() throws IOException, InterruptedException {
		
		return value;
	}


//	获取任务执行的进度
	@Override
	public float getProgress() throws IOException, InterruptedException {
		
		return processed ? 1.0f : 0.0f;
	}


	@Override
	public void close() throws IOException {
//		do nothing
		
	}


}