第8章 MapReduce的类型与格式

最新推荐文章于 2023-04-06 23:29:27 发布

我的手在哪

最新推荐文章于 2023-04-06 23:29:27 发布

阅读量290

点赞数

分类专栏： hadoop权威指南文章标签：大数据

本文链接：https://blog.csdn.net/qq_41443987/article/details/105316523

版权

hadoop权威指南专栏收录该内容

4 篇文章 0 订阅

订阅专栏

1 MapReduce的类型

默认的MapReduce作业

示例一个最简单的MapReduce程序

/**
 * 简单一个文件转移
 */
public class example1 extends Configured implements Tool {
    public int run(String[] strings) throws Exception {
        if(strings.length != 2){
            return -1;
        }

        Job job = Job.getInstance(getConf());
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(strings[0]));
        FileOutputFormat.setOutputPath(job, new Path(strings[1]));
        return job.waitForCompletion(true)?0:1;
    }
   
    public static void main(String[] args){
        int exitcode = ToolRunner.run(new example1(),args);
        System.exit(exitcode);
    }
}

像上面这种，没有进行任何参数配置的MapReduce，默认参数如下：

// Explicit form of the settings a job uses when nothing is configured.
job.setInputFormatClass(TextInputFormat.class);

// Map side.
job.setMapperClass(Mapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);

job.setPartitionerClass(HashPartitioner.class);// hashes the key, ANDs it with the largest int, then takes the remainder modulo the number of reducers

// Reduce side.
job.setReducerClass(Reducer.class);
job.setNumReduceTasks(1);// rule of thumb: each reducer should run for about five minutes and produce at least one HDFS block of output

// Job output.
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
job.setOutputFormatClass(TextOutputFormat.class);

2、输入格式

2.1输入分片与记录

一般来说，会对输入数据先进行分片，然后每个map操作处理一个输入分片。

输入分片在Java中由InputSplit抽象类表示：

// Developers do not need to deal with InputSplit directly, because it is created by the InputFormat.

public abstract class InputSplit {
    // The split size is used to order splits so that the largest are processed first,
    // which minimizes the job's running time.
    public abstract long getLength() throws IOException, InterruptedException;

    // The storage locations are used by the MapReduce system to place map tasks
    // as close as possible to the split's data.
    public abstract String[] getLocations() throws IOException, InterruptedException;
}

(1)对于InputFormat接口：

public abstract class InputFormat<K, V> {

    // The client calls this method to compute the input splits.
    public abstract List<InputSplit> getSplits(JobContext var1) throws IOException, InterruptedException;
    // A map task passes its input split to this method to obtain a RecordReader
    // (effectively an iterator over the records). The map task uses the RecordReader
    // to generate key-value pairs, which it then passes to the map function.
    public abstract RecordReader<K, V> createRecordReader(InputSplit var1, TaskAttemptContext var2) throws IOException, InterruptedException;
}



//此方法为Mapper的run方法

public void run(Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context) throws IOException, InterruptedException {
    this.setup(context);
    try {

           //通过重复调用Context上的nextKeyValue()(委托给RecodReader的同名方法)为mapper产生键值对，传给map()函数处理
        while(context.nextKeyValue()) {
            this.map(context.getCurrentKey(), context.getCurrentValue(), context);
        }
    } finally {
        this.cleanup(context);
    }

}

(2) FileInputFormat

(3) CombineFileInputFormat：处理小文件。

(4) 避免切分：

1、将最小分片大小（mapreduce.input.fileinputformat.split.minsize）设置为比最大的输入文件还要大的值。

2、继承FileInputFormat具体子类，并重写isSplitable()方法把返回值设置为false;

(5) mapper中的文件信息

Mapper可通过Context对象上的getInputSplit()方法获取分片有关信息，该方法返回的InputSplit可以被强制转换为一个 FileSplit，用来访问文件信息。

(6) 将整个文件作为一条记录处理（例子）

//演示如何将若干小文件打包成顺序文件
public class SmallFilesToSequenceFileConverter extends Configured implements Tool {
    static class SmallMapper extends Mapper<NullWritable, BytesWritable, Text,BytesWritable> {
        private Text filename;

        //初始化获取文件名
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            InputSplit split = context.getInputSplit();
            Path path = ((FileSplit)split).getPath();
            filename = new Text(path.toString());
        }

        //将key设置为文件名，然后value不变
        @Override
        protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
            context.write(filename, value);
        }
    }

    public int run(String[] strings) throws Exception {
        Job job = Job.getInstance(getConf());
        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        job.setJarByClass(SmallFilesToSequenceFileConverter.class);
        job.setMapperClass(SmallMapper.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        return job.waitForCompletion(true)?0:1;
    }
    public static void main(String[] args) throws Exception {
        int exitcode = ToolRunner.run(new SmallFilesToSequenceFileConverter(),args);
        System.exit(exitcode);
    }
}





//把整个文件作为一条记录的inputFormat
public class WholeFileInputFormat
        extends FileInputFormat<NullWritable, BytesWritable> {
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext Context) throws IOException, InterruptedException {
        //创建特殊的RecodReader
        WholeFileRecodReader reader = new WholeFileRecodReader();
        reader.initialize(inputSplit, Context);
        return reader;
    }

    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        //取消分片
        return false;
    }

    //该RecordReader用来将整个文件读为一条数据
    private class WholeFileRecodReader extends RecordReader<NullWritable, BytesWritable> {
        private FileSplit fileSplit;
        private Configuration conf;
        private BytesWritable value = new BytesWritable();
        //用来记录记录是否被处理过
        private boolean process = false;

        public void initialize(InputSplit inputSplit, TaskAttemptContext context) {
            this.fileSplit = (FileSplit)inputSplit;
            this.conf = context.getConfiguration();
        }
        @Override
        public boolean nextKeyValue() throws IOException, InterruptedException {
            if(!process){
                byte[] contents = new byte[(int) fileSplit.getLength()];
                Path file = fileSplit.getPath();
                FileSystem fs = file.getFileSystem(conf);
                FSDataInputStream in = null;

                try{
                    in = fs.open(file);
                    IOUtils.readFully(in, contents, 0, contents.length);
                    value.set(contents,0,contents.length);
                }finally {
                    IOUtils.closeStream(in);
                }
                process = true;
                return true;
            }
            return false;
        }
        @Override
        public NullWritable getCurrentKey() throws IOException, InterruptedException {
            return NullWritable.get();
        }
        @Override
        public BytesWritable getCurrentValue() throws IOException, InterruptedException {
            return value;
        }
        @Override
        public float getProgress() throws IOException, InterruptedException {
            return this.process?1.0f:0.0f;
        }
        @Override
        public void close() throws IOException {
            //do nothing
        }
    }
}