Small File Merge Operation
This uses Hadoop's built-in CombineTextInputFormat class. Simply set it as the input format in the Driver class and configure the maximum merged split size.
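In essence only two lines change in the Driver (both appear in context in the full listing below): switch the job's InputFormat to CombineTextInputFormat and set the maximum split size. Files smaller than that limit are packed together into shared splits, so many small files yield far fewer MapTasks than the default one-split-per-file behavior of TextInputFormat.

// Pack small files together into combined splits of at most 20 MB
job.setInputFormatClass(CombineTextInputFormat.class);
CombineTextInputFormat.setMaxInputSplitSize(job, 20971520); // 20 * 1024 * 1024 bytes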
Input Data
Prepare four small files in the input directory:
1.txt: test1
2.txt: test2 test2
3.txt: test3 test3 test3
4.txt: test4 test4 test4 test4 test4
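If you want to script the test data instead of creating it by hand, the following minimal sketch writes the four files. The directory D:\inputsmall is an assumption here, chosen to match the input path used by the Driver below.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class MakeSmallFiles {
    public static void main(String[] args) throws IOException {
        // Assumed directory; must match the input path the Driver reads from
        Path dir = Paths.get("D:\\inputsmall");
        Files.createDirectories(dir);
        // File contents exactly as listed above
        String[] contents = {
                "test1",
                "test2 test2",
                "test3 test3 test3",
                "test4 test4 test4 test4 test4"
        };
        for (int i = 0; i < contents.length; i++) {
            Files.write(dir.resolve((i + 1) + ".txt"), contents[i].getBytes());
        }
    }
}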
Custom Driver Class (WordCountDriver)
Change the WordCountDriver from the earlier post, "A Simple MapReduce Word-Count Implementation", to the code below; all other code stays unchanged.
package com.test.mapreduce.combinetTextDemo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Create the Configuration object and get the Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // 2. Set the jar associated with this Driver program
        job.setJarByClass(WordCountDriver.class);
        // 3. Set the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4. Set the key/value types of the Mapper output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the InputFormat used for splitting (instead of the default TextInputFormat)
        job.setInputFormatClass(CombineTextInputFormat.class);
        // 7. Set the maximum virtual-storage split size (20 MB)
        CombineTextInputFormat.setMaxInputSplitSize(job, 20971520);
        // 8. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path("D:\\inputsmall"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\output"));
        // 9. Submit the job and exit with its status
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
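The post only changes the Driver; WordCountMapper and WordCountReducer come from the referenced word-count article. For completeness, here is a minimal sketch of what they are assumed to look like (standard word count: whitespace-split tokens, a count of 1 per token in the Mapper, summation in the Reducer), each in its own source file under the same package and imports:

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private final Text outKey = new Text();
    private final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Emit (word, 1) for each whitespace-separated token in the line
        for (String word : value.toString().split("\\s+")) {
            if (!word.isEmpty()) {
                outKey.set(word);
                context.write(outKey, one);
            }
        }
    }
}

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    private final IntWritable outValue = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        // Sum the counts for each word
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        outValue.set(sum);
        context.write(key, outValue);
    }
}

Because the four files together are far smaller than the 20 MB maximum, CombineTextInputFormat should plan a single input split, so the job runs one MapTask instead of the four that the default TextInputFormat would create. The output in D:\output should then contain one count per word: test1 1, test2 2, test3 3, test4 5.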