InputFormat
1. InputFormat is the top-level parent class that provides input formatting in MapReduce; every input format inherits from it. It declares two main methods:
getSplits: defines how the input is split
createRecordReader: provides the input stream that reads the data in a split
2. InputFormat runs before Map: before the MapTask starts, the file is split, each split is read through an input IO stream, and the data read is handed to the MapTask. This means that whatever format InputFormat reads the data into is exactly the format the Mapper receives.
3. By default the input format class is TextInputFormat, which reads the input line by line and hands each line to Map. To keep every record intact, TextInputFormat makes each MapTask (other than the one handling the first split) start from the second line of its split and process through the first line of the next split (that first line of the next split may have to be read over the network).
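As a quick illustration (a sketch that is not part of the original notes; the driver and mapper class names and the input path are assumptions), a driver that relies on the default TextInputFormat, and optionally overrides the record delimiter through the textinputformat.record.delimiter property read by the source code below, would look roughly like this:
Configuration conf = new Configuration();
// Optional: set a custom record delimiter; when unset, LineRecordReader falls back to ordinary line endings
conf.set("textinputformat.record.delimiter", "\n");
Job job = Job.getInstance(conf);
job.setJarByClass(WordCountDriver.class);   // hypothetical driver class
job.setMapperClass(WordCountMapper.class);  // hypothetical Mapper<LongWritable, Text, ...>
// TextInputFormat is already the default; setting it explicitly only documents the choice
job.setInputFormatClass(TextInputFormat.class);
FileInputFormat.addInputPath(job, new Path("hdfs://hadoop01:9000/txt/words.txt"));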
TextInputFormat source code analysis
1. Start from FileInputFormat and work down to TextInputFormat
2. Then follow TextInputFormat down to LineRecordReader
The TextInputFormat utility class provides a method that returns a LineRecordReader object.
In LineRecordReader's initialize method, the byte input stream opened by fs.open is wrapped into a line-oriented reader, so a full line of data can be read with readLine.
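To make the call chain concrete, here is a simplified sketch of how a MapTask drives the RecordReader (an approximation of the framework flow written for this note, not code copied from Hadoop; process(...) is a stand-in for the call to Mapper.map):
RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
reader.initialize(split, context);
while (reader.nextKeyValue()) {
    // each key/value pair produced here becomes one call to the Mapper's map(key, value, context)
    process(reader.getCurrentKey(), reader.getCurrentValue());
}
reader.close();
The actual TextInputFormat and LineRecordReader sources follow.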
// The generic parameters are the key/value types that will be handed to map
public class TextInputFormat extends FileInputFormat<LongWritable, Text> {
// Extends the file-oriented input format class, which is dedicated to reading files
// FileInputFormat extends InputFormat and already implements the file splitting logic, so a subclass only needs to implement createRecordReader and decide what kind of input stream to use
@Override
public RecordReader<LongWritable, Text>
createRecordReader(InputSplit split,
TaskAttemptContext context) {
String delimiter = context.getConfiguration().get(
"textinputformat.record.delimiter");//获取行分隔符,没有指定的话,默认是回车
byte[] recordDelimiterBytes = null;
if (null != delimiter) //分隔符非空时,获取分隔符的字节数组传入
recordDelimiterBytes = delimiter.getBytes(Charsets.UTF_8);
return new LineRecordReader(recordDelimiterBytes); //在格式化的类中,返回一个具体的IO流用于读取数据,也就是说TextInputFormat不是具体的IO流,只是相当于一个工具类,LineRecordReader 才是具体的IO流
}
@Override
//Overrides the "is this file splittable" check, deciding based on the compression codec
protected boolean isSplitable(JobContext context, Path file) {
final CompressionCodec codec =
new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
if (null == codec) {
return true;
}
return codec instanceof SplittableCompressionCodec;
}
}
LineRecordReader is the object that actually performs the IO; InputFormat is essentially the format class for the different kinds of input files. The most important part is the initialize method, which defines how the file is read.
@InterfaceAudience.LimitedPrivate({"MapReduce", "Pig"})
@InterfaceStability.Evolving
public class LineRecordReader extends RecordReader<LongWritable, Text> {
private static final Log LOG = LogFactory.getLog(LineRecordReader.class);
public static final String MAX_LINE_LENGTH =
"mapreduce.input.linerecordreader.line.maxlength";
private long start;
private long pos;
private long end;
private SplitLineReader in;
private FSDataInputStream fileIn;
private Seekable filePosition;
private int maxLineLength;
private LongWritable key;
private Text value;
private boolean isCompressedInput;
private Decompressor decompressor;
private byte[] recordDelimiterBytes;
public LineRecordReader() {
}
public LineRecordReader(byte[] recordDelimiter) {
this.recordDelimiterBytes = recordDelimiter;
}
//Parameters: the split to read, plus the task/environment configuration
public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
FileSplit split = (FileSplit) genericSplit;//cast the generic split to the concrete class; many methods exist only on the implementation
Configuration job = context.getConfiguration(); //read the job configuration
this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);//maximum line length (in bytes) from the configuration; if unset it defaults to Integer.MAX_VALUE, about 2.1 billion bytes
start = split.getStart(); //the split's starting byte offset within the whole file
end = start + split.getLength(); //start offset + split length = end position of the split
final Path file = split.getPath(); //path of the file to process (an HDFS path)
// open the file and seek to the start of the split
final FileSystem fs = file.getFileSystem(job);//use the path plus the conf to obtain a FileSystem object, i.e. a connection to HDFS
fileIn = fs.open(file);//open a byte input stream (InputStream)
CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);//look up the file's compression codec (each compression format has its own codec)
if (null != codec) { //the file is compressed
isCompressedInput = true; //mark it as compressed input
decompressor = CodecPool.getDecompressor(codec);//obtain a decompressor for this codec
if (codec instanceof SplittableCompressionCodec) {//check whether the codec is splittable; if so:
final SplitCompressionInputStream cIn =
((SplittableCompressionCodec)codec).createInputStream(
fileIn, decompressor, start, end,
SplittableCompressionCodec.READ_MODE.BYBLOCK);
in = new CompressedSplitLineReader(cIn, job,
this.recordDelimiterBytes); //read records through a CompressedSplitLineReader
start = cIn.getAdjustedStart();
end = cIn.getAdjustedEnd();
filePosition = cIn;
} else { //not a splittable codec: read through a SplitLineReader ("splittable" here refers to whether the stream can be cut into individual lines)
in = new SplitLineReader(codec.createInputStream(fileIn,
decompressor), job, this.recordDelimiterBytes);
filePosition = fileIn;
}
} else { //not a compressed file: read through a SplitLineReader, which wraps the byte stream fileIn into a line reader that provides readLine
fileIn.seek(start);
in = new SplitLineReader(fileIn, job, this.recordDelimiterBytes);
filePosition = fileIn;
}
// If this is not the first split, we always throw away first record
// because we always (except the last split) read one extra line in
// next() method.
if (start != 0) { //not the first split (only the first split has start == 0)
start += in.readLine(new Text(), 0, maxBytesToConsume(start)); // read and discard the first (possibly partial) line and advance start by its byte count; that line is handled by the previous MapTask, which keeps every line intact even when a block boundary cuts through it
}
this.pos = start; //record the new position
}
private int maxBytesToConsume(long pos) {
return isCompressedInput
? Integer.MAX_VALUE
: (int) Math.max(Math.min(Integer.MAX_VALUE, end - pos), maxLineLength);
}
private long getFilePosition() throws IOException {
long retVal;
if (isCompressedInput && null != filePosition) {
retVal = filePosition.getPos();
} else {
retVal = pos;
}
return retVal;
}
private int skipUtfByteOrderMark() throws IOException {
// Strip BOM(Byte Order Mark)
// Text only support UTF-8, we only need to check UTF-8 BOM
// (0xEF,0xBB,0xBF) at the start of the text stream.
int newMaxLineLength = (int) Math.min(3L + (long) maxLineLength,
Integer.MAX_VALUE);
int newSize = in.readLine(value, newMaxLineLength, maxBytesToConsume(pos));
// Even we read 3 extra bytes for the first line,
// we won't alter existing behavior (no backwards incompat issue).
// Because the newSize is less than maxLineLength and
// the number of bytes copied to Text is always no more than newSize.
// If the return size from readLine is not less than maxLineLength,
// we will discard the current line and read the next line.
pos += newSize;
int textLength = value.getLength();
byte[] textBytes = value.getBytes();
if ((textLength >= 3) && (textBytes[0] == (byte)0xEF) &&
(textBytes[1] == (byte)0xBB) && (textBytes[2] == (byte)0xBF)) {
// find UTF-8 BOM, strip it.
LOG.info("Found UTF-8 BOM and skipped it");
textLength -= 3;
newSize -= 3;
if (textLength > 0) {
// It may work to use the same buffer and not do the copyBytes
textBytes = value.copyBytes();
value.set(textBytes, 3, textLength);
} else {
value.clear();
}
}
return newSize;
}
public boolean nextKeyValue() throws IOException {
if (key == null) {
key = new LongWritable();
}
key.set(pos);
if (value == null) {
value = new Text();
}
int newSize = 0;
// We always read one extra line, which lies outside the upper
// split limit i.e. (end - 1)
while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
if (pos == 0) {
newSize = skipUtfByteOrderMark();
} else {
newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
pos += newSize;
}
if ((newSize == 0) || (newSize < maxLineLength)) {
break;
}
// line too long. try again
LOG.info("Skipped line of size " + newSize + " at pos " +
(pos - newSize));
}
if (newSize == 0) {
key = null;
value = null;
return false;
} else {
return true;
}
}
@Override
public LongWritable getCurrentKey() {
return key;
}
@Override
public Text getCurrentValue() {
return value;
}
/**
* Get the progress within the split
*/
public float getProgress() throws IOException {
if (start == end) {
return 0.0f;
} else {
return Math.min(1.0f, (getFilePosition() - start) / (float)(end - start));
}
}
public synchronized void close() throws IOException {
try {
if (in != null) {
in.close();
}
} finally {
if (decompressor != null) {
CodecPool.returnDecompressor(decompressor);
decompressor = null;
}
}
}
}
Custom input format class
Source data format: each record spans three lines, a name line (e.g. tom), a math score line (math 90) and an english score line (english 98), as can be reconstructed from the reader code below.
Write a class that extends InputFormat. Since the splitting logic is fairly tedious (and rarely needs to change anyway), it is easier to extend FileInputFormat, which already implements getSplits; only createRecordReader needs to be overridden to supply the record-reading logic.
Custom input format class
package authinput;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.util.LineReader;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
//the data handed to map: name as the key, scores as the value
public class AuthInputFormat extends FileInputFormat<Text,Text> {
@Override
public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
//factory method: return the IO stream (record reader) used to read the split's data
return new AuthReader();
}
}
class AuthReader extends RecordReader<Text,Text>{
private LineReader in;
private Text key;
private Text value;
private static final byte[] blank = new Text(" ").getBytes();
// Initialization method: this is where the real stream used to read the data is obtained
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
// get the file path (an HDFS path)
FileSplit fsplit = (FileSplit) split;
Path path = fsplit.getPath();
URI uri = URI.create(path.toString());
// connect to HDFS
FileSystem fs = FileSystem.get(uri, context.getConfiguration());
// obtain the input stream used to read the data
InputStream fileIn = fs.open(path);
// Wrap the byte stream in something that can read line by line, e.g. a BufferedReader or Hadoop's own LineReader.
// Each record is made of 3 lines, and a raw byte stream would force us to detect line boundaries ourselves,
// so we wrap it in a line-oriented reader instead.
in = new LineReader(fileIn);
}
// Called by the map stage to check whether there is another key/value pair to process.
// Try to read data: if something is read, there is another pair for map, so return true;
// if nothing is read, all data has been consumed and map has nothing left to do, so return false.
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
//initialize the key/value holders
key = new Text();
value = new Text();
Text tmp = new Text();
// Try to read the first line of the record;
// readLine returns 0 when no data was read.
if (in.readLine(tmp) == 0) //first line missing: no more records
return false;
key.set(tmp.toString());
// read the second line
if (in.readLine(tmp) == 0)
return false;
value.set(tmp.toString());
// join the value parts with a space (appended as a byte array)
value.append(blank, 0, blank.length);
// read the third line
if (in.readLine(tmp) == 0)
return false;
byte[] data = tmp.getBytes();
// use tmp.getLength() rather than data.length: Text.getBytes() returns the backing array, which may be longer than the valid content
value.append(data, 0, tmp.getLength());
// after reading, the record looks like:
// key = tom
// value = math 90 english 98
return true;
}
//return the current key
@Override
public Text getCurrentKey() throws IOException, InterruptedException {
return key;
}
// return the current value
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
return value;
}
// Report the MapTask's progress, i.e. how much of the data has been read.
// Leaving this unimplemented does not affect the final result.
@Override
public float getProgress() throws IOException, InterruptedException {
return 0;
}
// release resources and close the stream
@Override
public void close() throws IOException {
if (in!= null)
in.close();
key = null;
value = null;
}
}
Mapper class
package authinput;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
public class AuthMapper extends Mapper<Text,Text,Text,Score> {
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
String[] s = value.toString().split(" ");
Score sc = new Score();
sc.setMath(Integer.parseInt(s[1]));
sc.setEnglish(Integer.parseInt(s[3]));
context.write(key,sc);
}
}
Reducer class
package authinput;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class AuthReducer extends Reducer<Text,Score,Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<Score> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (Score value : values) {
sum = sum + value.getMath() + value.getEnglish();
}
context.write(key,new IntWritable(sum));
}
}
Driver class
package authinput;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class Driver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(Driver.class);
job.setMapperClass(AuthMapper.class);
job.setReducerClass(AuthReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(Score.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
// specify the custom input format class
job.setInputFormatClass(AuthInputFormat.class);
FileInputFormat.addInputPath(job, new Path("hdfs://hadoop01:9000/txt/score3.txt"));
FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop01:9000/result/authinput"));
job.waitForCompletion(true);
}
}
Model class
package authinput;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
public class Score implements Writable {
private int math;
private int english;
public int getMath() {
return math;
}
public void setMath(int math) {
this.math = math;
}
public int getEnglish() {
return english;
}
public void setEnglish(int english) {
this.english = english;
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(math);
out.writeInt(english);
}
@Override
public void readFields(DataInput in) throws IOException {
this.math = in.readInt();
this.english = in.readInt();
}
}
Multiple inputs (MultipleInputs)
MultipleInputs lets one job take several input paths at once and assign a different InputFormat and Mapper class to each path.
Develop the map and reduce stages as usual:
package cn.tedu.multipleinputs;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
// handles score.txt
public class ScoreMapper1 extends Mapper<LongWritable, Text, Text, IntWritable> {
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
// Alex 64 63 68
String[] arr = value.toString().split(" ");
Text name = new Text(arr[0]);
for (int i = 1; i < arr.length; i++) {
context.write(name, new IntWritable(Integer.parseInt(arr[i])));
}
}
}
package cn.tedu.multipleinputs;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
// handles score3.txt
public class ScoreMapper2 extends Mapper<Text, Text, Text, IntWritable> {
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
// key = tom
// value = math 90 english 98
String[] arr = value.toString().split(" ");
context.write(key, new IntWritable(Integer.parseInt(arr[1])));
context.write(key, new IntWritable(Integer.parseInt(arr[3])));
}
}
package cn.tedu.multipleinputs;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.text.DecimalFormat;
public class ScoreReducer extends Reducer<Text, IntWritable, Text, Text> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
double sum = 0;
int count = 0;
for (IntWritable val : values) {
sum += val.get();
count++;
}
double avg = sum / count;
DecimalFormat df = new DecimalFormat("0.00");
String str = df.format(avg);
context.write(key, new Text(str));
}
}
Only the Driver needs to change:
register each input path together with its input format class and the corresponding Mapper; there is no separate setInputFormatClass call.
package cn.tedu.multipleinputs;
import cn.tedu.authinput.AuthInputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class ScoreDriver {
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(ScoreDriver.class);
job.setReducerClass(ScoreReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
// multiple inputs: register each path with its own input format and mapper
MultipleInputs.addInputPath(job, new Path("hdfs://hadoop01:9000/txt/score.txt"),
TextInputFormat.class, ScoreMapper1.class);
MultipleInputs.addInputPath(job, new Path("hdfs://hadoop01:9000/txt/score3.txt"),
AuthInputFormat.class, ScoreMapper2.class);
FileOutputFormat.setOutputPath(job,
new Path("hdfs://hadoop01:9000/result/multipleinputs"));
job.waitForCompletion(true);
}
}
OutputFormat
1. OutputFormat is the top-level parent class for output formats; to customize the output format you extend this class.
2. In practice a custom output format is rarely needed; the default MapReduce output format, which writes the results line by line, is usually sufficient.
3. If no output format is specified, MapReduce uses TextOutputFormat by default.
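As a small illustration (a driver fragment sketched for this note; the output path is a placeholder and the separator property name is an assumption about the standard Hadoop configuration key), making the default explicit and changing the key/value separator that TextOutputFormat writes (a tab by default) would look roughly like this:
Configuration conf = new Configuration();
// assumed configuration key: the separator TextOutputFormat puts between key and value (tab by default)
conf.set("mapreduce.output.textoutputformat.separator", ",");
Job job = Job.getInstance(conf);
// TextOutputFormat is already the default; setting it explicitly only documents the choice
job.setOutputFormatClass(TextOutputFormat.class);
FileOutputFormat.setOutputPath(job, new Path("hdfs://hadoop01:9000/result/textoutput"));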