Hadoop: Packing Small Files into a Sequence File


1. Why

In the Hadoop world, a small number of large files is much easier to handle than a large number of small files.

One reason is that each split produced by FileInputFormat is a whole file or part of a file. If the files are small ("small" meaning much smaller than an HDFS block) and there are many of them, each map task processes very little input (a single file), and there will be a large number of map tasks, each of which adds extra scheduling and startup overhead.

Compare a 1 GB file split into sixteen 64 MB blocks with 10,000 files of about 100 KB each. The 10,000 files need 10,000 map tasks, one per file, and the job can run tens or even hundreds of times slower than the equivalent job with a single input file and sixteen map tasks.

2. Preventing a File from Being Split

Sometimes we want to feed an entire file to a single mapper without splitting it.
Two ways to do this:
- Set the minimum split size to a value larger than the largest file to be processed (not recommended; a sketch of this approach follows the example below);
- Use a concrete subclass of FileInputFormat and override its isSplitable() method to return false;
For example:

public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable>{

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
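For completeness, here is a minimal sketch of the first (not recommended) approach. It simply raises the minimum split size above any conceivable file size; the class name NoSplitBySize is made up for illustration and is not part of the original program:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class NoSplitBySize {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Raising the minimum split size above every input file forces each file into a single split.
        // This is equivalent to setting mapreduce.input.fileinputformat.split.minsize.
        FileInputFormat.setMinInputSplitSize(job, Long.MAX_VALUE);
        // ... the rest of the job setup would go here as usual.
    }
}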

3. Example: Packing Small Files

Define your own InputFormat:

package mapreduce.mr.inputformat;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable>{

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        WholeFileRecordReader reader = new WholeFileRecordReader();
        reader.initialize(split, context);
        return reader;
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}

Define your own RecordReader:

package mapreduce.mr.inputformat;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable>{

    private FileSplit fileSplit;
    private Configuration conf;
    private BytesWritable value = new BytesWritable();
    private boolean processed = false;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split;
        this.conf = context.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        // The split is a whole (unsplit) file: the first call reads the entire file
        // into one record, and later calls return false.
        if(!processed)
        {
            byte[] contents = new byte[(int)fileSplit.getLength()];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(conf);
            FSDataInputStream in = null;

            try{
                in = fs.open(file);
                IOUtils.readFully(in, contents, 0, contents.length);
                value.set(contents, 0, contents.length);
            } finally{
                IOUtils.closeStream(in);
            }
            processed = true;
            return true;
        }
        return false;
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {

    }
}

The MapReduce program that packs the small files into a sequence file:

package mapreduce.mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import mapreduce.mr.inputformat.WholeFileInputFormat;

public class SmallFilesToSequenceFileConverter extends Configured implements Tool{


    private static class SequenceFileMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable>{
        private Text filenameKey;

        @Override
        protected void setup(Mapper<NullWritable, BytesWritable, Text, BytesWritable>.Context context)
                throws IOException, InterruptedException {
            InputSplit split = context.getInputSplit();
            Path path = ((FileSplit)split).getPath();
            filenameKey = new Text(path.toString());
        }

        @Override
        protected void map(NullWritable key, BytesWritable value,Context context)
                        throws IOException, InterruptedException {
            context.write(filenameKey, value);
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf);
        job.setJobName("testSmallFilesToSequence");
        job.setJarByClass(SmallFilesToSequenceFileConverter.class);

        job.setInputFormatClass(WholeFileInputFormat.class);      // one record per input file
        job.setOutputFormatClass(SequenceFileOutputFormat.class); // write the records as a SequenceFile

        job.setMapperClass(SequenceFileMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));


        return job.waitForCompletion(true) ? 0:1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SmallFilesToSequenceFileConverter(), args);
        System.exit(exitCode);
    }
}

Running the job:

[root@hadoop1 tmp]# hadoop jar sortscore.jar mapreduce.mr.SmallFilesToSequenceFileConverter /tradeinfoIn*/* /bigfileOut

Note: /tradeinfoIn*/* holds the small files to be processed, and /bigfileOut is the output path for the result.

Input files (2):
trade_info.txt

zhangsan@163.com    6000    0   2014-02-20
lisi@163.com    2000    0   2014-02-20
lisi@163.com    0   100 2014-02-20
zhangsan@163.com    3000    0   2014-02-20
wangwu@126.com  9000    0   2014-02-20
wangwu@126.com  0   200     2014-02-20

trade_info1.txt

zhangsan@163.com,6000,0,2014-02-20
lisi@163.com,2000,0,2014-02-20
lisi@163.com,0,100,2014-02-20
zhangsan@163.com,3000,0,2014-02-20
wangwu@126.com,9000,0,2014-02-20
wangwu@126.com,0,200,2014-02-20

Output (raw contents of the result file):

SEQorg.apache.hadoop.io.Text"org.apache.hadoop.io.BytesWritable      i 珏.茱踇G晏8謼   ?  +*hdfs://cluster1/tradeinfoIn/trade_info.txt   葄hangsan@163.com,6000,0,2014-02-20
lisi@163.com,2000,0,2014-02-20
lisi@163.com,0,100,2014-02-20
zhangsan@163.com,3000,0,2014-02-20
wangwu@126.com,9000,0,2014-02-20
wangwu@126.com,0,200,2014-02-20   ?  ,+hdfs://cluster1/tradeinfoIn/trade_info1.txt   蓏hangsan@163.com   6000    0   2014-02-20
lisi@163.com    2000    0   2014-02-20
lisi@163.com    0   100 2014-02-20
zhangsan@163.com    3000    0   2014-02-20
wangwu@126.com  9000    0   2014-02-20
wangwu@126.com  0   200     2014-02-20

The unreadable characters are not corruption: a SequenceFile is a binary format, so dumping it as plain text also prints its header and the sync markers between records as garbage bytes. Apart from that, the two input files have indeed been packed into a single sequence file, keyed by their original paths.
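To check the result properly, hadoop fs -text /bigfileOut/part-* decodes the SequenceFile framing and prints one key/value pair per record (the Text keys print as paths; BytesWritable values are shown as hex). Alternatively, a small reader program along these lines should work; this is only a sketch, and the class name SequenceFileDump is made up for illustration:

package mapreduce.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // e.g. one part file under /bigfileOut
        SequenceFile.Reader reader = null;
        try {
            reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path));
            Text key = new Text();                     // key: original file path
            BytesWritable value = new BytesWritable(); // value: raw file contents
            while (reader.next(key, value)) {
                System.out.println(key);
                System.out.println(new String(value.copyBytes(), "UTF-8"));
            }
        } finally {
            IOUtils.closeStream(reader);
        }
    }
}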
