解决 MapReduce 合并小文件时,CombineTextInputFormat 无法区分数据来自哪个文件的问题

为什么进行小文件合并?
大量的小文件会产生大量的map任务,增大map任务装载次数,而任务的装载比较耗时,从而导致 mr 运行较慢.
CombineTextInputFormat 能够把多个小文件合并到同一个输入分片中进行处理。
它的缺点是无法区分数据来自哪个文件,因而不能针对不同文件做对应处理,所以需要对 CombineTextInputFormat 进行重写。

/**
 * Configuration keys under which the combined-small-file input format publishes
 * information about the file currently being read.
 *
 * @author DK
 * @version V1.0.0
 * @date 2022/6/23 11:04
 */
public interface CombineSmallfileConstant {

    /** Key for the name of the directory that contains the current input file. */
    public static final String MAP_INPUT_FILE_PATH = "map.input.file.path";

    /** Key for the name of the current input file. */
    public static final String MAP_INPUT_FILE_NAME = "map.input.file.name";
}
package com.dk.combine;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.*;

import java.io.IOException;

/**
 * A {@link CombineTextInputFormat} whose per-file record reader publishes the
 * current file's name and parent-directory name into the task configuration,
 * so that downstream code can tell which file a record came from.
 *
 * @author DK
 * @version V1.0.0
 * @date 2022/6/23 13:44
 */
public class CombineSmallTextInputFormat extends CombineTextInputFormat {

    /**
     * Creates a combined reader that handles each file of the split with a
     * {@link TextRecordReaderWrapper}.
     *
     * @param split   the input split; cast to {@link CombineFileSplit}
     * @param context the task attempt context
     * @return a record reader over all files of the combined split
     * @throws IOException if the reader cannot be created
     */
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
        CombineFileSplit combinedSplit = (CombineFileSplit) split;
        return new CombineFileRecordReader<>(combinedSplit, context, TextRecordReaderWrapper.class);
    }

    /**
     * Line reader for a single file of a combined split. On initialization it
     * stores that file's identity in the task configuration.
     */
    private static class TextRecordReaderWrapper extends CombineFileRecordReaderWrapper<LongWritable, Text> {

        /** Position of this reader's file inside the combined split. */
        private final int fileIndex;

        // this constructor signature is required by CombineFileRecordReader
        public TextRecordReaderWrapper(CombineFileSplit split, TaskAttemptContext context, Integer idx) throws IOException, InterruptedException {
            super(new TextInputFormat(), split, context, idx);
            // Remember which file of the split this reader is responsible for.
            this.fileIndex = idx;
        }

        /**
         * Publishes the parent-directory name and file name of this reader's
         * file into the configuration, then delegates to the superclass.
         */
        @Override
        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
            final CombineFileSplit combined = (CombineFileSplit) split;
            // Resolve the file by index and derive both names before touching the configuration.
            final String parentDirName = combined.getPath(fileIndex).getParent().getName();
            final String fileName = combined.getPath(fileIndex).getName();

            context.getConfiguration().set(CombineSmallfileConstant.MAP_INPUT_FILE_PATH, parentDirName);
            context.getConfiguration().set(CombineSmallfileConstant.MAP_INPUT_FILE_NAME, fileName);

            super.initialize(split, context);
        }
    }

}
package com.dk.combine;

import lombok.extern.slf4j.Slf4j;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper that emits every input line keyed by the name of the directory that
 * contains the file the line was read from.
 *
 * <p>The directory name is read from the configuration property
 * {@link CombineSmallfileConstant#MAP_INPUT_FILE_PATH}.
 *
 * @author DK
 * @version V1.0.0
 * @date 2022/6/17 16:35
 */
@Slf4j
public class CombineMapper extends Mapper<LongWritable, Text, Text, Text> {

    /** Reused output key, to avoid allocating a new Writable for every record. */
    private final Text outKey = new Text();

    /** Reused output value, to avoid allocating a new Writable for every record. */
    private final Text outValue = new Text();

    /**
     * Writes the input line under the name of its source file's parent directory.
     *
     * @param key     the input key (not used)
     * @param value   one line of input text
     * @param context the task context used to read the configuration and emit output
     * @throws IOException           if writing the output fails
     * @throws InterruptedException  if the task is interrupted while writing
     * @throws IllegalStateException if the source-directory property is not set
     */
    @Override
    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1: Determine which file the record came from. The property is looked up for
        // every record rather than cached, since it is rewritten per file of a combined split.
        String parentName = context.getConfiguration().get(CombineSmallfileConstant.MAP_INPUT_FILE_PATH);
        if (parentName == null) {
            // Fail with an explicit message instead of an opaque NullPointerException from Text.
            throw new IllegalStateException("Configuration property '"
                    + CombineSmallfileConstant.MAP_INPUT_FILE_PATH
                    + "' is not set; this mapper must be used with CombineSmallTextInputFormat");
        }
        String str = value.toString();
        // todo: apply source-specific processing to str based on parentName
        outKey.set(parentName);
        outValue.set(str);
        context.write(outKey, outValue);
    }
}
package com.dk.combine;

import com.dk.mapreduce.common.Constant;
import com.dk.utils.DateUtil;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * Reducer that concatenates all values of a key into a single tab-separated value.
 *
 * @author DK
 * @version V1.0.0
 * @date 2022/6/23 16:12
 */
public class CombineReducer extends Reducer<Text, Text, Text, Text> {

    /**
     * Joins the values of one key and writes a single output record for it.
     * Each value is preceded by a tab, so the joined text starts with a tab.
     *
     * @param key     the grouping key, written through unchanged
     * @param values  all values collected for the key
     * @param context the task context used to emit the result
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        StringBuilder joined = new StringBuilder();
        for (Text item : values) {
            joined.append('\t');
            joined.append(item);
        }
        // 2: Emit the key together with the joined values.
        Text merged = new Text(joined.toString());
        context.write(key, merged);
    }

}

package com.dk.combine;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;

/**
 * Driver that configures and runs the small-file combining MapReduce job.
 *
 * <p>Expected arguments: {@code <input paths> <output path>}. Any existing
 * output path is deleted recursively before the job is submitted.
 *
 * @author DK
 * @version V1.0.0
 * @date 2022/6/17 16:35
 */
public class CombineJob extends Configured implements Tool {

    /** Upper bound for one combined input split: 128 MiB. */
    private static final long MAX_SPLIT_SIZE_BYTES = 128L * 1024 * 1024;

    /** Minimum input split size handed to the input format: 100 MiB. */
    private static final long MIN_SPLIT_SIZE_BYTES = 100L * 1024 * 1024;

    /**
     * Builds the job from the command-line arguments and waits for it to finish.
     *
     * @param args {@code args[0]} is the input path specification passed to
     *             {@code addInputPaths}; {@code args[1]} is the output directory
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
     * @throws Exception if configuring or running the job fails
     */
    @Override
    public int run(String[] args) throws Exception {
        // Report missing arguments with a usage message rather than an ArrayIndexOutOfBoundsException.
        if (args == null || args.length < 2) {
            System.err.println("Usage: " + this.getClass().getSimpleName() + " <input paths> <output path>");
            return 2;
        }

        // Create the job; the timestamp suffix gives every run a distinct name.
        Job job = Job.getInstance(this.getConf(), "rentalSubsidyApplication-" + System.currentTimeMillis());
        // Identify the jar to ship by the driver class.
        job.setJarByClass(CombineJob.class);
        // Use a single reducer.
        job.setNumReduceTasks(1);

        // Input: the combining input format that publishes source-file information.
        job.setInputFormatClass(CombineSmallTextInputFormat.class);
        CombineSmallTextInputFormat.addInputPaths(job, args[0]);
        CombineSmallTextInputFormat.setMaxInputSplitSize(job, MAX_SPLIT_SIZE_BYTES);
        // NOTE(review): CombineFileInputFormat appears to rely on its per-node/per-rack
        // minimum sizes rather than this generic minimum - confirm this call has an effect.
        CombineSmallTextInputFormat.setMinInputSplitSize(job, MIN_SPLIT_SIZE_BYTES);

        // Output: remove any previous output directory, then write plain text there.
        Path out = new Path(args[1]);
        out.getFileSystem(this.getConf()).delete(out, true);
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, out);

        // Key/value types emitted by the map phase.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Text.class);
        // Key/value types emitted by the reduce phase, i.e. the job's final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // Processing classes for the map and reduce phases.
        job.setMapperClass(CombineMapper.class);
        job.setReducerClass(CombineReducer.class);

        // Submit the job and block until it completes.
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值