Custom InputFormat

This post shows how to write a custom InputFormat to deal with Hadoop's small-files problem. Through the DiyInputFormat, DiyRecordReader, DiyMapper, and DiyDriver implementations below, many small files are merged into one large file, subject to the limit that each small file must fit within an int-typed read cursor.

This custom InputFormat is used to concatenate small files into a single large file. Each small file must be smaller than Integer.MAX_VALUE bytes, because in the example I learned from online the file-read cursor is an int, which imposes that limit.

Ways to handle small files:

packing small files into an archive with hadoop har
CombineTextInputFormat (a driver-side sketch follows this list)
the custom InputFormat presented here

Code

DiyInputFormat
package com.xdc.diy;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * @author xdc
 * created on 2019/11/7
 */
public class DiyInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        //each small file must be read as a whole, so never split it
        return false;
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        DiyRecordReader diyRecordReader = new DiyRecordReader();
        diyRecordReader.initialize(inputSplit, taskAttemptContext);
        return diyRecordReader;
    }
}
DiyRecordReader
package com.xdc.diy;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * @author xdc
 * created on 2019/11/7
 */
public class DiyRecordReader extends RecordReader<NullWritable, BytesWritable> {
    private long length = 0;
    private Configuration configuration;
    private FileSplit inputSplit;
    private BytesWritable value = new BytesWritable();
    /**
     * flag so that each map task processes its file only once
     */
    private boolean isProcess = true;

    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        this.inputSplit = (FileSplit)inputSplit;
        //grab the job configuration
        configuration = taskAttemptContext.getConfiguration();
    }

    /**
     * read the next key/value pair: the whole file is delivered as a single value
     * */
    @Override
    public boolean nextKeyValue() {
        if (isProcess) {
            //path and length of the current file
            Path path = inputSplit.getPath();
            length = inputSplit.getLength();
            FSDataInputStream inputStream = null;
            try {
                FileSystem fs = FileSystem.get(configuration);
                //open an input stream for this file
                inputStream = fs.open(path);
                //the file length must not exceed Integer.MAX_VALUE, since the read cursor is an int
                byte[] bytes = new byte[(int) length];
                IOUtils.readFully(inputStream, bytes, 0, bytes.length);
                value.set(bytes, 0, bytes.length);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                //close only the stream; the FileSystem instance from FileSystem.get() is cached and shared, so do not close it here
                IOUtils.closeStream(inputStream);
            }
            isProcess = false;
            return true;
        }
        return false;
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        //the whole file content read in nextKeyValue
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return isProcess ? 0 : 1;
    }

    @Override
    public void close() throws IOException {

    }
}
DiyMapper
package com.xdc.diy;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * @author xdc
 * created on 2019/11/7
 */
public class DiyMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

    private Text text = new Text();

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {

        //get the split that produced this record
        FileSplit inputSplit = (FileSplit)context.getInputSplit();
        //use the file path as the output key
        String path = inputSplit.getPath().toString();
        text.set(path);
        context.write(text, value);
    }
}

DiyDriver
package com.xdc.diy;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

/**
 * @author xdc
 * created on 2019/11/7
 */
public class DiyDriver {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        //local test paths; these override the command-line arguments
        args = new String[]{"D:\\hadoopwork\\input", "D:\\hadoopwork\\output"};

        //create the configuration
        Configuration conf = new Configuration();
        //create the job
        Job job = Job.getInstance(conf);

        //set the driver and mapper classes
        job.setJarByClass(DiyDriver.class);
        job.setMapperClass(DiyMapper.class);

        //set the map and final output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        job.setInputFormatClass(DiyInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }


}
