Source data (in the aa folder):
1. one.txt
yongpeng weidong weinan
sanfeng luozong xiaoming
2. tow.txt
longlong fanfan
mazong kailun yuhang yixin
3. three.txt
shuaige changmo zhenqiang
dongli lingu xuanxuan
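Before the job can run, these three small files must exist under the input path the driver uses (hdfs://bigdata01:9000/input/aa). Uploading with hadoop fs -put aa /input/ works; a minimal programmatic sketch is below (this helper is not in the original post, and the class name UploadInput and the local directory aa are illustrative assumptions):

package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import java.net.URI;

public class UploadInput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Connect to the NameNode that the driver below points at
        FileSystem fs = FileSystem.get(URI.create("hdfs://bigdata01:9000"), conf);
        // Recursively copy the local aa directory to /input/aa
        fs.copyFromLocalFile(new Path("aa"), new Path("/input/aa"));
        fs.close();
    }
}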
1. MyRecordReader.java
package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 17:01
 */
public class MyRecordReader extends RecordReader<Text, BytesWritable> {

    private Configuration configuration;
    private FileSplit split;
    // Each split is one whole file, so this reader emits exactly one record
    private boolean isProgress = true;
    private BytesWritable value = new BytesWritable();
    private Text k = new Text();

    // Override initialize: remember the split and pull the job configuration
    @Override
    public void initialize(InputSplit split, TaskAttemptContext taskAttemptContext) {
        this.split = (FileSplit) split;
        configuration = taskAttemptContext.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (isProgress) {
            // 1. Allocate a buffer large enough for the whole file
            byte[] bytes = new byte[(int) split.getLength()];
            FSDataInputStream fsDataInputStream = null;
            try {
                // 2. Get the file system for this split's path
                Path path = split.getPath();
                FileSystem fileSystem = path.getFileSystem(configuration);
                // 3. Open the file
                fsDataInputStream = fileSystem.open(path);
                // 4. Read the entire file into the buffer
                IOUtils.readFully(fsDataInputStream, bytes, 0, bytes.length);
                // 5. Set the file content as the value
                value.set(bytes, 0, bytes.length);
                // 6. Use the file path as the key
                k.set(path.toString());
            } catch (Exception e) {
                // Don't swallow errors; surface them to the framework
                throw new IOException("Failed to read split " + split.getPath(), e);
            } finally {
                IOUtils.closeStream(fsDataInputStream);
            }
            isProgress = false;
            return true;
        }
        return false;
    }

    @Override
    public Text getCurrentKey() throws IOException, InterruptedException {
        return k;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // 0 until the single record has been consumed, then 1
        return isProgress ? 0f : 1f;
    }

    @Override
    public void close() throws IOException {
    }
}
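Since each split is a whole file, the reader can be sanity-checked outside a running job by driving it the way the framework would: initialize, then nextKeyValue until it returns false. A minimal local sketch (not part of the original code; the class name, the local path aa/one.txt, and the direct use of TaskAttemptContextImpl are assumptions for illustration):

package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class MyRecordReaderLocalTest {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(); // defaults to the local file system
        Path path = new Path("aa/one.txt");       // assumes the sample file exists locally
        long length = path.getFileSystem(conf).getFileStatus(path).getLen();
        // One split spanning the whole file, exactly as MyInputFormat produces
        FileSplit split = new FileSplit(path, 0, length, null);
        MyRecordReader reader = new MyRecordReader();
        reader.initialize(split, new TaskAttemptContextImpl(conf, new TaskAttemptID()));
        while (reader.nextKeyValue()) {
            System.out.println(reader.getCurrentKey() + " -> "
                    + reader.getCurrentValue().getLength() + " bytes");
        }
        reader.close();
    }
}

It should print exactly one line: the file path as the key and the file size as the value length.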
2. MyInputFormat.java
package com.fjh.myinputformat;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 16:33
 */
public class MyInputFormat extends FileInputFormat<Text, BytesWritable> {

    @Override
    public RecordReader<Text, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        MyRecordReader recordReader = new MyRecordReader();
        // The framework also calls initialize(), so this explicit call is redundant but harmless
        recordReader.initialize(inputSplit, taskAttemptContext);
        return recordReader;
    }

    // Not splittable: each small file must be read as a single whole record
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }
}
3. SequenceFileMapper.java
package com.fjh.myinputformat;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 19:22
 */
public class SequenceFileMapper extends Mapper<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void map(Text key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // Pass each (file path, file bytes) record straight through
        context.write(key, value);
    }
}
4. SequenceFileReducer.java
package com.fjh.myinputformat;

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 19:30
 */
public class SequenceFileReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {
    @Override
    protected void reduce(Text key, Iterable<BytesWritable> values, Context context) throws IOException, InterruptedException {
        // Each key is a unique file path, so there is exactly one value per key
        context.write(key, values.iterator().next());
    }
}
5. SequenceFileDriver.java
package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

import java.io.IOException;

/**
 * User: Administrator  Project: hadoop  Date: 2020/10/21 19:46
 */
public class SequenceFileDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Get the configuration and create the job
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Set the jar location and wire up the custom mapper and reducer
        job.setJarByClass(SequenceFileDriver.class);
        job.setMapperClass(SequenceFileMapper.class);
        job.setReducerClass(SequenceFileReducer.class);

        // Use the custom InputFormat
        job.setInputFormatClass(MyInputFormat.class);
        // Write the output as a SequenceFile
        job.setOutputFormatClass(SequenceFileOutputFormat.class);

        // Key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        // Key/value types of the final output
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // Input and output paths; the output directory must not already exist,
        // otherwise the job fails
        FileInputFormat.setInputPaths(job, new Path("hdfs://bigdata01:9000/input/aa"));
        FileOutputFormat.setOutputPath(job, new Path("src/main/resources/output/myformat"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
- Result
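The output can be inspected with hadoop fs -text on the part file, or with a small reader like this sketch (not part of the original post; part-r-00000 is the default name of the single reducer's output file and is assumed here):

package com.fjh.myinputformat;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class SequenceFileDump {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Same output directory as the driver; resolved against the default file system
        Path path = new Path("src/main/resources/output/myformat/part-r-00000");
        try (SequenceFile.Reader reader =
                     new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                // Key: original file path; value: that file's full contents
                System.out.println(key + " =>");
                System.out.println(new String(value.getBytes(), 0, value.getLength()));
            }
        }
    }
}

Each record should pair one source file's path with that file's complete contents, confirming that the three small files were packed into a single SequenceFile.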