one.txt
yongpeng weidong weinan
sanfeng luozong xiaoming
two.txt
longlong fanfan
mazong kailun yuhang yixin
longlong fanfan
mazong kailun yuhang yixin
three.txt
shuaige changmo zhenqiang
dongli lingu xuanxuan
编写如下三个类:
WholeFileDriver.java
package com.atguigu.inputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import java.io.IOException;
public class WholeFileDriver {
    /**
     * Configures and submits a job that reads each input file as a single
     * (path, bytes) record and writes those records to a SequenceFile.
     * No Mapper/Reducer is set, so the framework's identity implementations
     * pass every record through unchanged.
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(WholeFileDriver.class);

        // Key = file path (Text), Value = raw file contents (BytesWritable),
        // both for the map output and the final job output.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // Custom input format reads one whole file per record; SequenceFile
        // output keeps the binary values intact.
        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        FileInputFormat.setInputPaths(job, new Path("d:\\input"));
        FileOutputFormat.setOutputPath(job, new Path("d:\\output"));

        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}
WholeFileInputFormat.java
package com.atguigu.inputformat;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
/**
 * InputFormat that treats every input file as one unsplittable record.
 * Its RecordReader emits a single KV pair per file:
 * key = the file's path, value = the complete file contents as bytes.
 */
public class WholeFileInputFormat extends FileInputFormat<Text, BytesWritable> {

    /** Never split a file: each one must be read in full as one record. */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    /** Returns the reader that turns a whole file into one KV pair. */
    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        return new WholeFileRecordReader();
    }
}
WholeFileRecordReader.java
package com.atguigu.inputformat;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.io.IOException;
/**
 * RecordReader that reads one entire file as a single KV pair:
 * key = the file's path, value = the complete file contents as bytes.
 */
public class WholeFileRecordReader extends RecordReader<Text, BytesWritable> {
    // True until the single record of this split has been emitted.
    private boolean notRead = true;
    private Text key = new Text();
    private BytesWritable value = new BytesWritable();
    private FSDataInputStream inputStream;
    private FileSplit fs;

    /**
     * Called once by the framework before any records are read.
     * Opens a stream on the file backing this (unsplittable) split.
     * @param split the split to read; a FileSplit, since the paired
     *              WholeFileInputFormat extends FileInputFormat
     * @param context task context supplying the job configuration
     * @throws IOException if the file system or file cannot be opened
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        // Narrow to FileSplit to get at the backing file's path.
        fs = (FileSplit) split;
        Path path = fs.getPath();
        // Resolve the file system from the path (local, HDFS, ...).
        FileSystem fileSystem = path.getFileSystem(context.getConfiguration());
        // Open the stream; closed in close().
        inputStream = fileSystem.open(path);
    }

    /**
     * Reads the next KV pair. This reader produces exactly one record per
     * file, so only the first call returns true.
     * @return true if a record was produced, false when the file is consumed
     * @throws IOException if the file is too large for one record or reading fails
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (notRead) {
            long length = fs.getLength();
            // BytesWritable is backed by a byte[]; guard the int cast instead
            // of silently truncating files of 2 GB or more.
            if (length > Integer.MAX_VALUE) {
                throw new IOException("File too large for a single record: " + fs.getPath());
            }
            // Key: the file's full path.
            key.set(fs.getPath().toString());
            // Value: the entire file. readFully loops until the buffer is
            // filled — a bare read() may legally return fewer bytes than asked.
            byte[] buf = new byte[(int) length];
            IOUtils.readFully(inputStream, buf, 0, buf.length);
            value.set(buf, 0, buf.length);
            notRead = false;
            return true;
        } else {
            return false;
        }
    }

    /**
     * Returns the key of the record produced by the last nextKeyValue() call.
     * @return the current key (the file path)
     * @throws IOException never thrown here; declared by the contract
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /**
     * Returns the value of the record produced by the last nextKeyValue() call.
     * @return the current value (the file contents)
     * @throws IOException never thrown here; declared by the contract
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * Reports progress through this split: 0 before the single record is
     * read, 1 afterwards.
     * @return the current progress as a fraction in [0, 1]
     * @throws IOException never thrown here; declared by the contract
     * @throws InterruptedException never thrown here; declared by the contract
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return notRead ? 0 : 1;
    }

    /** Releases the input stream opened in initialize(); safe if null. */
    @Override
    public void close() throws IOException {
        IOUtils.closeStream(inputStream);
    }
}
输出结果
SEQorg.apache.hadoop.io.Text"org.apache.hadoop.io.BytesWritable ?凓怵焖?4RJ旲 L file:/d:/input/one.txt 1yongpeng weidong weinan
sanfeng luozong xiaoming N file:/d:/input/three.txt 1shuaige changmo zhenqiang
dongli lingu xuanxuan u file:/d:/input/two.txt Zlonglong fanfan
mazong kailun yuhang yixin
longlong fanfan
mazong kailun yuhang yixin