Why merge small files?
A large number of small files produces an equally large number of map tasks. Each map task has to be scheduled and loaded, and that loading is relatively expensive, so the MapReduce job runs slowly.
CombineTextInputFormat solves this by packing multiple small files into a single input split, so one map task processes several files.
Its drawback is that the mapper can no longer tell which file a given record came from, which rules out per-file processing. To fix that, we subclass CombineTextInputFormat so it records the source file of each record.
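For reference, this is roughly what the stock CombineTextInputFormat looks like inside a driver's run() method (a minimal sketch; the 128 MB cap and the inputDir variable are illustrative, not from the original code):
// Stock usage: small files are packed into combined splits, but the mapper
// cannot tell which file a record came from.
job.setInputFormatClass(CombineTextInputFormat.class);
CombineTextInputFormat.addInputPath(job, new Path(inputDir)); // inputDir is illustrative
CombineTextInputFormat.setMaxInputSplitSize(job, 134217728L); // at most 128 MB per combined split
The rewrite starts with a small interface that defines the configuration keys used to hand the source-file information from the record reader to the mapper: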
package com.dk.combine;
/**
 * Configuration keys used to pass the source-file information
 * from the record reader to the mapper.
 *
 * @author DK
 * @version V1.0.0
 * @date 2022/6/23 11:04
 */
public interface CombineSmallfileConstant {
    String MAP_INPUT_FILE_PATH = "map.input.file.path";
    String MAP_INPUT_FILE_NAME = "map.input.file.name";
}
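Next comes the input format itself. The record-reader wrapper it hands to CombineFileRecordReader remembers the index of its file within the combined split, and on initialize() publishes that file's parent directory name and file name under the keys above: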
package com.dk.combine;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.*;
import java.io.IOException;
/**
 * Subclass of CombineTextInputFormat that records which file
 * each record comes from.
 *
 * @author DK
 * @version V1.0.0
 * @date 2022/6/23 13:44
 */
public class CombineSmallTextInputFormat extends CombineTextInputFormat {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
        return new CombineFileRecordReader<LongWritable, Text>((CombineFileSplit) split, context, TextRecordReaderWrapper.class);
    }

    private static class TextRecordReaderWrapper extends CombineFileRecordReaderWrapper<LongWritable, Text> {

        private final int currentIndex;

        // this constructor signature is required by CombineFileRecordReader
        public TextRecordReaderWrapper(CombineFileSplit split, TaskAttemptContext context, Integer idx) throws IOException, InterruptedException {
            super(new TextInputFormat(), split, context, idx);
            // remember which file of the combined split this reader handles
            this.currentIndex = idx;
        }

        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            CombineFileSplit combineFileSplit = (CombineFileSplit) split;
            // look up the file for this index and publish its parent directory
            // name and file name so the mapper can tell records apart
            context.getConfiguration().set(CombineSmallfileConstant.MAP_INPUT_FILE_PATH, combineFileSplit.getPath(currentIndex).getParent().getName());
            context.getConfiguration().set(CombineSmallfileConstant.MAP_INPUT_FILE_NAME, combineFileSplit.getPath(currentIndex).getName());
            super.initialize(split, context);
        }
    }
}
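Why go through the configuration at all? With a one-file-per-split format such as TextInputFormat, the mapper can read the source path straight off the split; with combined input that cast fails because the split spans several files. A sketch of the non-combined approach, for contrast (assumes org.apache.hadoop.mapreduce.lib.input.FileSplit):
// Works only when each split covers exactly one file (e.g. TextInputFormat):
FileSplit fileSplit = (FileSplit) context.getInputSplit();
String fileName = fileSplit.getPath().getName();
// Under CombineSmallTextInputFormat, getInputSplit() returns a CombineFileSplit
// spanning several files, so the per-file info travels through the configuration.
The mapper then reads those keys to decide how to treat each record: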
package com.dk.combine;
import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
* @author DK
* @version V1.0.0
* @date 2022/6/17 16:35
*/
@Slf4j
public class CombineMapper extends Mapper<LongWritable, Text, Text, Text> {

    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1: determine which file the record came from (here: its parent directory name)
        String parentName = context.getConfiguration().get(CombineSmallfileConstant.MAP_INPUT_FILE_PATH);
        String str = value.toString();
        // TODO: branch on the source file and apply the corresponding processing
        context.write(new Text(parentName), new Text(str));
    }
}
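As a purely hypothetical illustration of the TODO above (the orders and users directory names are invented), the per-file branching inside map() might look like:
// Hypothetical per-file handling; "orders" and "users" are made-up names.
if ("orders".equals(parentName)) {
    // parse order records one way ...
} else if ("users".equals(parentName)) {
    // parse user records another way ...
}
context.write(new Text(parentName), new Text(str));
The reducer then concatenates all records that share the same source: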
package com.dk.combine;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * @author DK
 * @version V1.0.0
 * @date 2022/6/23 16:12
 */
public class CombineReducer extends Reducer<Text, Text, Text, Text> {

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder v = new StringBuilder();
        for (Text value : values) {
            v.append("\t").append(value);
        }
        // 2: write K3 and V3 (source name -> tab-joined records) to the context
        context.write(key, new Text(v.toString()));
    }
}
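Finally, the driver wires the pieces together and configures the combined-split sizes: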
package com.dk.combine;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
/**
 * @author DK
 * @version V1.0.0
 * @date 2022/6/17 16:35
 */
public class CombineJob extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // create the job
        Job job = Job.getInstance(this.getConf(), this.getClass().getSimpleName());
        // set the class used to locate the job jar
        job.setJarByClass(CombineJob.class);
        // set the job name
        job.setJobName("rentalSubsidyApplication-" + System.currentTimeMillis());
        // set the number of reducers
        job.setNumReduceTasks(1);
        // set the input format and the input path(s)
        job.setInputFormatClass(CombineSmallTextInputFormat.class);
        CombineSmallTextInputFormat.addInputPaths(job, args[0]);
        // cap each combined split at 128 MB
        CombineSmallTextInputFormat.setMaxInputSplitSize(job, 134217728L);
        // minimum combined-split size per node / per rack (100 MB); note that
        // CombineFileInputFormat ignores the generic split.minsize key set by
        // setMinInputSplitSize and reads the per-node/per-rack keys instead
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.minsize.per.node", 104857600L);
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.minsize.per.rack", 104857600L);
        // output path; delete it first if it already exists
        Path out = new Path(args[1]);
        out.getFileSystem(this.getConf()).delete(out, true);
        // set the output format and output directory
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, out);
        // key/value types emitted by the mapper
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // key/value types emitted by the reducer, i.e. the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // mapper and reducer classes
        job.setMapperClass(CombineMapper.class);
        job.setReducerClass(CombineReducer.class);
        // submit the job and wait for it to finish
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
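The job class implements Tool but has no entry point. A minimal launcher using the standard ToolRunner pattern might look like this (the class name CombineJobDriver is hypothetical, not from the original code):
package com.dk.combine;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;
public class CombineJobDriver {
    public static void main(String[] args) throws Exception {
        // args[0] = input path(s), args[1] = output path
        int exitCode = ToolRunner.run(new Configuration(), new CombineJob(), args);
        System.exit(exitCode);
    }
}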