Custom InputFormat and Custom OutputFormat

1. Custom InputFormat: merging small files

Requirement

Small files hurt efficiency both on HDFS and in MapReduce, yet in practice we often have to process large numbers of them, so we need a solution: merge the many small files into a single SequenceFile. The SequenceFile stores multiple files, using each file's name as the key and its contents as the value.

Analysis

Optimizing for small files generally comes down to one of the following approaches:

  1. At data-collection time, merge small files or small batches of data into large files before uploading them to HDFS
  2. Before business processing, run a MapReduce program on HDFS to merge the small files
  3. During MapReduce processing, use a CombineFileInputFormat such as CombineTextInputFormat to improve efficiency (see the sketch after this list)
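
For completeness, approach 3 looks roughly like the sketch below. It is not part of this section's implementation; the helper class name and the 4 MB split size are illustrative assumptions only.

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;

public class CombineSmallFilesConfig {
    // hypothetical helper: call this from a driver after the Job has been created
    public static void useCombineInput(Job job) {
        // pack many small files into shared splits instead of one split per file
        job.setInputFormatClass(CombineTextInputFormat.class);
        // upper bound on the size of one combined split; 4 MB here is purely illustrative
        CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);
    }
}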

Implementation

This section implements the second approach above.

The core mechanism of the program:

Define a custom InputFormat.

Override the RecordReader so that one read consumes a complete file and packages it as a key/value pair.

On output, use SequenceFileOutputFormat to write the merged file.

Step 1: Define the custom InputFormat class

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

public class Custom_FileInputFormat extends FileInputFormat<NullWritable,BytesWritable> {
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        // never split a small file: each file is handed to the reader as one whole split
        return false;
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        // use the custom reader that emits the entire file as a single key/value pair
        Custom_RecordReader reader = new Custom_RecordReader();
        reader.initialize(inputSplit,taskAttemptContext);
        return reader;
    }
}

Step 2: Define the custom RecordReader class

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class Custom_RecordReader extends RecordReader<NullWritable,BytesWritable> {

    private FileSplit fileSplit;
    private Configuration conf;
    private BytesWritable bytesWritable = new BytesWritable();
    private boolean processed = false;

    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        fileSplit = (FileSplit) inputSplit;
        conf = taskAttemptContext.getConfiguration();
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            Path path = fileSplit.getPath();
            FileSystem fileSystem = FileSystem.get(conf);
            FSDataInputStream inputStream = null;
            try {
                // open the file backing this split
                inputStream = fileSystem.open(path);
                // allocate a byte array sized to the whole file (the split covers the entire file)
                byte[] bytes = new byte[(int) fileSplit.getLength()];
                // read the complete file contents into the array
                IOUtils.readFully(inputStream, bytes, 0, bytes.length);
                // wrap the bytes in the reusable BytesWritable value
                bytesWritable.set(bytes, 0, bytes.length);
            } finally {
                // close only the stream; FileSystem.get() returns a cached, shared instance
                IOUtils.closeQuietly(inputStream);
            }
            processed = true;
            return true;
        } else {
            return false;
        }
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return bytesWritable;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed?1.0F:0.0F;
    }

    @Override
    public void close() throws IOException {

    }
}
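
Note that nextKeyValue() pulls the whole file into a single byte array sized by fileSplit.getLength(), so this reader is only suitable for files that comfortably fit in memory, which is exactly the small-file scenario this section targets.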

Step 3: Write the Mapper class

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

public class Custom_Mapper extends Mapper<NullWritable,BytesWritable,Text,BytesWritable> {
    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        // use the name of the file behind the current split as the output key,
        // and the whole file contents as the output value
        FileSplit fileSplit = (FileSplit) context.getInputSplit();
        String name = fileSplit.getPath().getName();
        context.write(new Text(name),value);
    }
}

Step 4: Write the Driver class

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Custom_Driver extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf());

        // use the custom InputFormat so each small file is read as a whole
        job.setInputFormatClass(Custom_FileInputFormat.class);
        Custom_FileInputFormat.addInputPath(job, new Path("E:\\自定义InputFormat\\input\\"));

        job.setMapperClass(Custom_Mapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // write the merged result as a SequenceFile: file name -> file bytes
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path("E:\\自定义InputFormat\\output\\"));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Custom_Driver(), args));
    }
}
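
To check the result, the SequenceFile can be read back and each stored file name printed together with its size. A minimal sketch, assuming the checker class name below and the default single-reducer output file part-r-00000 (the actual part file name depends on the job):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class MergedFileChecker {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // part file produced by the merge job (adjust to your output directory)
        Path merged = new Path("E:\\自定义InputFormat\\output\\part-r-00000");
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(merged))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                // print each original file name and its size in bytes
                System.out.println(key + " -> " + value.getLength() + " bytes");
            }
        }
    }
}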

 

2. Custom OutputFormat

Requirement

We have some order-review data. The requirement is to separate positive reviews from negative ones and write the final data into different directories. See the accompanying materials folder for the data; the rating field of each record (read at index 9 in the code below) indicates the review type: 0 = positive, 1 = neutral, 2 = negative.

Analysis

The key point is that a single MapReduce program must, depending on the data, route two kinds of results to different directories. This kind of flexible output requirement can be met with a custom OutputFormat.

Implementation

Key points of the implementation:

  1. Access external resources (here, HDFS output streams) from within the MapReduce job
  2. Define a custom OutputFormat, override its RecordWriter, and override the write() method that actually emits the data

Step 1: Define the custom OutputFormat class

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Custom_OutputFormat extends FileOutputFormat<Text,NullWritable> {

    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {

        Configuration conf = taskAttemptContext.getConfiguration();

        // get the FileSystem and open one output stream per review category
        FileSystem fileSystem = FileSystem.get(conf);
        FSDataOutputStream fsDataOutputStream = fileSystem.create(new Path("M:\\自定义outputformat\\output\\good_cmment\\good_cmment.txt"));
        FSDataOutputStream fsDataOutputStream1 = fileSystem.create(new Path("M:\\自定义outputformat\\output\\bad_cmment\\bad_cmment.txt"));
        return new Custom_RecordWriter(fsDataOutputStream,fsDataOutputStream1); // pass the two output streams to the custom RecordWriter via its two-argument constructor
    }
}
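
As a variation (not part of the original walkthrough), the two destination paths could be taken from the job configuration instead of being hard-coded. A minimal sketch, where the class name Configurable_OutputFormat and the two configuration keys are assumptions; the driver would set the keys with conf.set(...) before submitting the job:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class Configurable_OutputFormat extends FileOutputFormat<Text, NullWritable> {

    @Override
    public RecordWriter<Text, NullWritable> getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        Configuration conf = taskAttemptContext.getConfiguration();
        FileSystem fileSystem = FileSystem.get(conf);
        // "good.comment.path" and "bad.comment.path" are hypothetical keys set by the driver
        Path goodPath = new Path(conf.get("good.comment.path"));
        Path badPath = new Path(conf.get("bad.comment.path"));
        FSDataOutputStream goodOut = fileSystem.create(goodPath);
        FSDataOutputStream badOut = fileSystem.create(badPath);
        // reuse the RecordWriter defined in the next step
        return new Custom_RecordWriter(goodOut, badOut);
    }
}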

Step 2: Define the custom RecordWriter class

import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

import java.io.IOException;

public class Custom_RecordWriter extends RecordWriter<Text,NullWritable> {

    FSDataOutputStream out1;
    FSDataOutputStream out2;

    public Custom_RecordWriter() {
    }

    public Custom_RecordWriter(FSDataOutputStream out1, FSDataOutputStream out2) {

        this.out1 = out1;
        this.out2 = out2;
    }

    public FSDataOutputStream getOut1() {

        return out1;
    }

    public void setOut1(FSDataOutputStream out1) {
        this.out1 = out1;
    }

    public FSDataOutputStream getOut2() {
        return out2;
    }

    public void setOut2(FSDataOutputStream out2) {
        this.out2 = out2;
    }

    @Override
    public void write(Text key, NullWritable value) throws IOException, InterruptedException {
        if (key.toString().split("\t")[9].equals("0")){
            // the field at index 9 holds the rating; "0" means a positive review
            out1.write(key.toString().getBytes());
            out1.write("\r\n".getBytes());
        }else{
            // neutral ("1") and negative ("2") reviews
            out2.write(key.toString().getBytes());
            out2.write("\r\n".getBytes());
        }
    }

    @Override
    public void close(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        if (null != out1){
            out1.close();
        }
        if (null != out2){
            out2.close();
        }
    }
}

Step 3: Write the Mapper class

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class Custom_Mapper extends Mapper<LongWritable,Text,Text,NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // pass the whole line through as the key; the custom RecordWriter decides which file it goes to
        context.write(value,NullWritable.get());
    }
}

Step 4: Write the Driver class

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class Custom_OutputFormatDriver extends Configured implements Tool {

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Custom_OutputFormatDriver(),args));
    }

    @Override
    public int run(String[] args) throws Exception {

        Job job = Job.getInstance(super.getConf());
        // set the Mapper class
        job.setMapperClass(Custom_Mapper.class);
        // set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);
        // set the InputFormat class
        job.setInputFormatClass(TextInputFormat.class);
        // set the custom OutputFormat class
        job.setOutputFormatClass(Custom_OutputFormat.class);
        // set the input and output paths
        TextInputFormat.addInputPath(job,new Path("M:\\自定义outputformat\\input\\ordercomment.csv"));
        Custom_OutputFormat.setOutputPath(job,new Path("M:\\自定义outputformat\\output\\tmp"));
        return job.waitForCompletion(true)?0:1;
    }
}
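
Note that because Custom_OutputFormat extends FileOutputFormat, an output directory must still be set (and must not already exist); it only receives the framework's working files and the _SUCCESS marker, while the actual review data goes to the two paths opened inside Custom_OutputFormat.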

 
