MapReduce自定义LineRecordReader完成多行读取文件内容

最新推荐文章于 2022-09-23 15:33:33 发布

景天

最新推荐文章于 2022-09-23 15:33:33 发布

阅读量1k

点赞数

分类专栏：开发框架文章标签： hadoop

原文链接：https://blog.csdn.net/tanggao1314/article/details/51307642

版权

开发框架专栏收录该内容

8 篇文章

订阅专栏

首先声明文章转载于

https://blog.csdn.net/tanggao1314/article/details/51307642

由于原文章源码太多，这里只摘录文章的关键点

TextInputFormat是Hadoop默认的数据输入格式,但是它只能一行一行的读记录，如果要读取多行怎么办？
很简单，自己写一个输入格式，然后写一个对应的RecordReader就可以了，但是要实现却不是这么简单的

首先看看TextInputFormat是怎么实现一行一行读取的

/**
 * Input format for plain text files whose records are produced by a
 * {@link LineRecordReader}.
 */
public class TextInputFormat extends FileInputFormat<LongWritable, Text> {

  /**
   * Builds the record reader for a split.
   *
   * <p>If the job configuration defines
   * {@code textinputformat.record.delimiter}, its UTF-8 bytes are handed to the
   * reader as the record delimiter; otherwise {@code null} is passed.
   */
  @Override
  public RecordReader<LongWritable, Text> createRecordReader(
      InputSplit split, TaskAttemptContext context) {
    final String configured =
        context.getConfiguration().get("textinputformat.record.delimiter");
    final byte[] delimiterBytes =
        (configured == null) ? null : configured.getBytes(Charsets.UTF_8);
    return new LineRecordReader(delimiterBytes);
  }

  /**
   * Decides whether {@code file} may be divided into several splits.
   *
   * @return {@code true} when no compression codec is found for the file, or
   *         when the codec found is a {@link SplittableCompressionCodec}
   */
  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    final CompressionCodecFactory codecs =
        new CompressionCodecFactory(context.getConfiguration());
    final CompressionCodec codec = codecs.getCodec(file);
    return codec == null || codec instanceof SplittableCompressionCodec;
  }
}

我们只要看第一个createRecordReader方法即可，从源码分析可知，它new了一个LineRecordReader

它引入了一个SplitLineReader 类,用这个来读取每一行

发现没有 ===》 newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));

它用了SplitLineReader 里面的一个方法readLine来读取，所以就得继续跟踪去看看SplitLineReader

/**
 * A {@link org.apache.hadoop.util.LineReader} that can additionally be asked
 * whether one more record has to be read after the end of a split.
 */
public class SplitLineReader extends org.apache.hadoop.util.LineReader {

  /** Creates a reader over {@code in} using the given record delimiter bytes. */
  public SplitLineReader(
      InputStream in,
      byte[] recordDelimiterBytes) {
    super(in, recordDelimiterBytes);
  }

  /**
   * Creates a reader over {@code in} using the given configuration and record
   * delimiter bytes.
   */
  public SplitLineReader(
      InputStream in,
      Configuration conf,
      byte[] recordDelimiterBytes) throws IOException {
    super(in, conf, recordDelimiterBytes);
  }

  /** This implementation always answers {@code false}. */
  public boolean needAdditionalRecordAfterSplit() {
    return false;
  }
}

发现这家伙继承自LineReader，发现这里面根本就没有readLine方法，它可是继承了LineReader这个类，说不定他的父类LineReader有了
继续跟踪到LineReader的源码
它里面有很多方法，真的有我们要的readLine方法，说明我的推断没有错，没有忽悠大家，它重载了好几个readLine方法
其他我们不去管，我们只管对我们有用的，如下

 /**
  * Reads one record into {@code str}, delegating to the custom-delimiter
  * reader when {@code recordDelimiterBytes} is set and to the default
  * newline reader otherwise.
  *
  * @return whatever the selected reader returns
  */
 public int readLine(Text str, int maxLineLength,
                      int maxBytesToConsume) throws IOException {
    final boolean useDefaultNewline = (this.recordDelimiterBytes == null);
    return useDefaultNewline
        ? readDefaultLine(str, maxLineLength, maxBytesToConsume)
        : readCustomLine(str, maxLineLength, maxBytesToConsume);
  }

它里面调用了readCustomLine方法和readDefaultLine方法，下面看看这两个方法

 /**
  * Reads one record terminated by the custom delimiter held in
  * {@code recordDelimiterBytes} into {@code str}.
  *
  * <p>{@code str} is cleared first. The internal buffer is then scanned
  * (and refilled through {@code fillBuffer} when exhausted) until the whole
  * delimiter has been matched, {@code fillBuffer} reports no more data, or
  * at least {@code maxBytesToConsume} bytes have been consumed. The
  * delimiter bytes themselves are not appended to {@code str}.
  *
  * @param str destination of the record content
  * @param maxLineLength upper bound on the bytes copied from the buffer into
  *        {@code str}; further bytes of the record are consumed but dropped
  * @param maxBytesToConsume the loop stops once this many bytes have been
  *        consumed, even if no delimiter was found
  * @return the number of bytes consumed, delimiter included
  * @throws IOException if more than {@code Integer.MAX_VALUE} bytes were
  *         consumed
  */
 private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume)
      throws IOException {
    str.clear();
    int txtLength = 0; // tracks str.getLength(), as an optimization
    long bytesConsumed = 0;
    int delPosn = 0; // number of delimiter bytes matched so far
    // Number of delimiter bytes matched at the very end of the previous buffer;
    // whether they are a real delimiter is only known after the next fill.
    int ambiguousByteCount=0; // To capture the ambiguous characters count
    do {
      int startPosn = bufferPosn; // Start from previous end position
      if (bufferPosn >= bufferLength) {
        // Buffer exhausted: refill it and restart from its beginning.
        startPosn = bufferPosn = 0;
        bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
        if (bufferLength <= 0) {
          if (ambiguousByteCount > 0) {
            // No more data: the pending partial match was ordinary content.
            str.append(recordDelimiterBytes, 0, ambiguousByteCount);
            bytesConsumed += ambiguousByteCount;
          }
          break; // EOF
        }
      }
      for (; bufferPosn < bufferLength; ++bufferPosn) {
        if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
          delPosn++;
          if (delPosn >= recordDelimiterBytes.length) {
            // Full delimiter matched; continue after it on the next call.
            bufferPosn++;
            break;
          }
        } else if (delPosn != 0) {
          // Partial match failed: step back one byte so the loop increment
          // re-tests the current byte against the start of the delimiter.
          // NOTE(review): only one byte is rewound, so a delimiter with a
          // repeated prefix (e.g. "aab" inside "aaab") looks like it can be
          // missed - confirm against the Hadoop version in use.
          bufferPosn--;
          delPosn = 0;
        }
      }
      int readLength = bufferPosn - startPosn;
      bytesConsumed += readLength;
      // Exclude the (possibly partial) delimiter match from the content.
      int appendLength = readLength - delPosn;
      if (appendLength > maxLineLength - txtLength) {
        appendLength = maxLineLength - txtLength;
      }
      bytesConsumed += ambiguousByteCount;
      if (appendLength >= 0 && ambiguousByteCount > 0) {
        //appending the ambiguous characters (refer case 2.2)
        str.append(recordDelimiterBytes, 0, ambiguousByteCount);
        ambiguousByteCount = 0;
        // since it is now certain that the split did not split a delimiter we
        // should not read the next record: clear the flag otherwise duplicate
        // records could be generated
        unsetNeedAdditionalRecordAfterSplit();
      }
      if (appendLength > 0) {
        str.append(buffer, startPosn, appendLength);
        txtLength += appendLength;
      }
      if (bufferPosn >= bufferLength) {
        if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
          // Buffer ended in the middle of a delimiter match: remember it.
          ambiguousByteCount = delPosn;
          bytesConsumed -= ambiguousByteCount; //to be consumed in next
        }
      }
    } while (delPosn < recordDelimiterBytes.length 
        && bytesConsumed < maxBytesToConsume);
    if (bytesConsumed > Integer.MAX_VALUE) {
      throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
    }
    return (int) bytesConsumed; 
  }

 /**
  * Reads one line into {@code str}, where a line ends at LF, at CR, or at
  * CR immediately followed by LF (as compared against the {@code LF} and
  * {@code CR} constants).
  *
  * <p>{@code str} is cleared first. The internal buffer is scanned (and
  * refilled through {@code fillBuffer} when exhausted) until a terminator
  * is found, {@code fillBuffer} reports no more data, or at least
  * {@code maxBytesToConsume} bytes have been consumed. The terminator is
  * not appended to {@code str}.
  *
  * @param str destination of the line content
  * @param maxLineLength upper bound on the bytes appended to {@code str};
  *        further bytes of the line are consumed but dropped
  * @param maxBytesToConsume the loop stops once this many bytes have been
  *        consumed, even if no terminator was found
  * @return the number of bytes consumed, terminator included
  * @throws IOException if more than {@code Integer.MAX_VALUE} bytes were
  *         consumed
  */
 private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume)
  throws IOException {
    str.clear();
    int txtLength = 0; //tracks str.getLength(), as an optimization
    int newlineLength = 0; //length of terminating newline
    boolean prevCharCR = false; //true if prev char was CR
    long bytesConsumed = 0;
    do {
      int startPosn = bufferPosn; //starting from where we left off the last time
      if (bufferPosn >= bufferLength) {
        // Buffer exhausted: refill it and restart from its beginning.
        startPosn = bufferPosn = 0;
        if (prevCharCR) {
          ++bytesConsumed; //account for CR from previous read
        }
        bufferLength = fillBuffer(in, buffer, prevCharCR);
        if (bufferLength <= 0) {
          break; // EOF
        }
      }
      for (; bufferPosn < bufferLength; ++bufferPosn) { //search for newline
        if (buffer[bufferPosn] == LF) {
          // LF alone counts 1 byte, CR + LF counts 2.
          newlineLength = (prevCharCR) ? 2 : 1;
          ++bufferPosn; // at next invocation proceed from following byte
          break;
        }
        if (prevCharCR) { //CR + notLF, we are at notLF
          // The lone CR was the terminator; the current byte is left unread.
          newlineLength = 1;
          break;
        }
        prevCharCR = (buffer[bufferPosn] == CR);
      }
      int readLength = bufferPosn - startPosn;
      if (prevCharCR && newlineLength == 0) {
        --readLength; //CR at the end of the buffer
      }
      bytesConsumed += readLength;
      // Exclude the terminator from the content that is appended.
      int appendLength = readLength - newlineLength;
      if (appendLength > maxLineLength - txtLength) {
        appendLength = maxLineLength - txtLength;
      }
      if (appendLength > 0) {
        str.append(buffer, startPosn, appendLength);
        txtLength += appendLength;
      }
    } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);

    if (bytesConsumed > Integer.MAX_VALUE) {
      throw new IOException("Too many bytes before newline: " + bytesConsumed);
    }
    return (int)bytesConsumed;
  }

注意注意readCustomLine和readDefaultLine方法的第一句都用了一句代码（重点，后面我们自定义时要改的地方）
他们都用了str.clear();
这句代码什么意思？意思是每次读取新的一行之前，都会先把str里上一次读到的内容清空！！！
如果我们自定义读取多行，读后面的行时肯定不能清空它，因为后面的行要追加（append）在前面已经读到的内容后面
比如
123，
456
789，
111
如果一次读两行的话，假如读第二行之前把第一行清空了，那么第一行的内容就丢了，读出来的值本应该是
123，456
789，111
但是如果清空了的话就读出来少了一行
变成了
456
111

所以我们只在读每条记录的第一行时才清空值（清掉的是上一条记录的内容），读后面的行时都不能清空

这就要求我们到时候自己重载一个方法了

我帮大家理一理：看都用到了哪些类

最开始TextInputFormat里面用到了LineRecordReader，LineRecordReader里面用到了SplitLineReader，而SplitLineReader又继承自LineReader
自定义的时候思路按这个来
TextInputFormat–》LineRecordReader–》SplitLineReader–》LineReader

下面写第一个自定义的TextInputFormat 照抄即可

接着写一个自定义的LineRecordReader 也是照抄，然后重写如下方法

 /**
  * Advances to the next record, where one record is made of up to two
  * consecutive input lines appended into the same {@code value}.
  *
  * <p>The key is set to {@code pos} as it was before the record was read.
  * The first line of a record is read with {@code clear == true}, which
  * discards the previous record; the following line is read with
  * {@code clear == false} so it is appended after the first one.
  *
  * <p>If the input ends after the first line of a record, that line alone
  * is returned as the record. Previously the end-of-input result of the
  * second read overwrote the first read's size and the line was dropped.
  *
  * @return {@code true} if at least one line was read for this record;
  *         {@code false} (with key and value reset to {@code null}) otherwise
  * @throws IOException if reading from the underlying stream fails
  */
 public boolean nextKeyValue() throws IOException {
   if (this.key == null) {
     this.key = new LongWritable();
   }
   this.key.set(this.pos);
   if (this.value == null) {
     this.value = new Text();
   }
   // Number of input lines merged into a single record.
   final int linesPerRecord = 2;
   boolean readAnything = false;
   for (int line = 0; line < linesPerRecord; line++) {
     // Only the first line of a record may wipe the previous content.
     final boolean clear = (line == 0);
     // Size of the last read for THIS line only, so that hitting the end of
     // the input on a later line cannot hide an earlier successful read.
     int newSize = 0;
     while (getFilePosition() <= end || in.needAdditionalRecordAfterSplit()) {
       if (pos == 0) {
         newSize = skipUtfByteOrderMark();
       } else {
         newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos), clear);
         pos += newSize;
       }
       if ((newSize == 0) || (newSize < maxLineLength)) {
         break;
       }
       // line too long. try again
       // NOTE(review): when clear is false the truncated bytes of the skipped
       // line look like they stay in value - confirm whether that matters.
       LOG.info("Skipped line of size " + newSize + " at pos " + (pos - newSize));
     }
     if (newSize == 0) {
       // Nothing more to read for this record.
       break;
     }
     readAnything = true;
   }
   if (!readAnything) {
     this.key = null;
     this.value = null;
     return false;
   }
   return true;
 }

再写自定义SplitLineReader 嗯。。copy it

最后写自定义LineReader 先抄再改
它重载了一个readLine(Text str, int maxLineLength, int maxBytesToConsume, boolean clear)方法来实现不清空前面读取的行的值

 /**
   * Overload of {@code readLine} that lets the caller decide whether
   * {@code str} is emptied before reading, so that several lines can be
   * accumulated in the same {@code Text}.
   *
   * @param str destination of the line content
   * @param maxLineLength forwarded unchanged to the selected reader
   * @param maxBytesToConsume forwarded unchanged to the selected reader
   * @param clear forwarded unchanged to the selected reader
   * @return whatever the selected reader returns
   * @throws IOException if the selected reader throws it
   */
  public int readLine(Text str, int maxLineLength, int maxBytesToConsume, boolean clear) throws IOException {
      // Without a configured delimiter, fall back to newline handling.
      if (this.recordDelimiterBytes == null) {
          return readDefaultLine(str, maxLineLength, maxBytesToConsume, clear);
      }
      return readCustomLine(str, maxLineLength, maxBytesToConsume, clear);
  }

  /**
   * Reads one record terminated by the custom delimiter held in
   * {@code recordDelimiterBytes} and appends it to {@code str}.
   *
   * <p>{@code str} is emptied first only when {@code clear} is true;
   * otherwise the record is appended after its current content. The internal
   * buffer is scanned (and refilled through {@code fillBuffer} when
   * exhausted) until the whole delimiter has been matched, {@code fillBuffer}
   * reports no more data, or at least {@code maxBytesToConsume} bytes have
   * been consumed. The delimiter bytes themselves are not appended.
   *
   * @param str destination of the record content
   * @param maxLineLength upper bound on the bytes copied from the buffer into
   *        {@code str} by this call; further bytes are consumed but dropped
   * @param maxBytesToConsume the loop stops once this many bytes have been
   *        consumed, even if no delimiter was found
   * @param clear whether {@code str} is cleared before reading
   * @return the number of bytes consumed, delimiter included
   * @throws IOException if more than {@code Integer.MAX_VALUE} bytes were
   *         consumed
   */
  private int readCustomLine(Text str, int maxLineLength, int maxBytesToConsume, boolean clear) throws IOException {
      if (clear) {
          str.clear();
      }
      // Bytes appended from the buffer by this call; this equals
      // str.getLength() only when str was cleared above.
      int txtLength = 0;
      long bytesConsumed = 0;
      int delPosn = 0; // number of delimiter bytes matched so far
      // Number of delimiter bytes matched at the very end of the previous
      // buffer; whether they are a real delimiter is only known after the
      // next fill.
      int ambiguousByteCount = 0; // To capture the ambiguous characters count
      do {
          int startPosn = bufferPosn; // Start from previous end position
          if (bufferPosn >= bufferLength) {
              // Buffer exhausted: refill it and restart from its beginning.
              startPosn = bufferPosn = 0;
              bufferLength = fillBuffer(in, buffer, ambiguousByteCount > 0);
              if (bufferLength <= 0) {
                  if (ambiguousByteCount > 0) {
                      // No more data: the pending partial match was content.
                      str.append(recordDelimiterBytes, 0, ambiguousByteCount);
                      bytesConsumed += ambiguousByteCount;
                  }
                  break; // EOF
              }
          }
          for (; bufferPosn < bufferLength; ++bufferPosn) {
              if (buffer[bufferPosn] == recordDelimiterBytes[delPosn]) {
                  delPosn++;
                  if (delPosn >= recordDelimiterBytes.length) {
                      // Full delimiter matched; continue after it next time.
                      bufferPosn++;
                      break;
                  }
              } else if (delPosn != 0) {
                  // Partial match failed: step back one byte so the loop
                  // increment re-tests the current byte against the start of
                  // the delimiter.
                  // NOTE(review): only one byte is rewound, so a delimiter
                  // with a repeated prefix (e.g. "aab" inside "aaab") looks
                  // like it can be missed - confirm.
                  bufferPosn--;
                  delPosn = 0;
              }
          }
          int readLength = bufferPosn - startPosn;
          bytesConsumed += readLength;
          // Exclude the (possibly partial) delimiter match from the content.
          int appendLength = readLength - delPosn;
          if (appendLength > maxLineLength - txtLength) {
              appendLength = maxLineLength - txtLength;
          }
          bytesConsumed += ambiguousByteCount;
          if (appendLength >= 0 && ambiguousByteCount > 0) {
              // appending the ambiguous characters (refer case 2.2)
              str.append(recordDelimiterBytes, 0, ambiguousByteCount);
              ambiguousByteCount = 0;
              // since it is now certain that the split did not split a
              // delimiter we should not read the next record: clear the flag
              // otherwise duplicate records could be generated
              unsetNeedAdditionalRecordAfterSplit();
          }
          if (appendLength > 0) {
              str.append(buffer, startPosn, appendLength);
              txtLength += appendLength;
          }
          if (bufferPosn >= bufferLength) {
              if (delPosn > 0 && delPosn < recordDelimiterBytes.length) {
                  // Buffer ended in the middle of a delimiter match.
                  ambiguousByteCount = delPosn;
                  bytesConsumed -= ambiguousByteCount; // to be consumed in next
              }
          }
      } while (delPosn < recordDelimiterBytes.length && bytesConsumed < maxBytesToConsume);
      if (bytesConsumed > Integer.MAX_VALUE) {
          throw new IOException("Too many bytes before delimiter: " + bytesConsumed);
      }
      return (int) bytesConsumed;
  }

/**
 * Reads one line and appends it to {@code str}, where a line ends at LF, at
 * CR, or at CR immediately followed by LF (as compared against the
 * {@code LF} and {@code CR} constants).
 *
 * <p>{@code str} is emptied first only when {@code clear} is true; otherwise
 * the line is appended after its current content. The internal buffer is
 * scanned (and refilled through {@code fillBuffer} when exhausted) until a
 * terminator is found, {@code fillBuffer} reports no more data, or at least
 * {@code maxBytesToConsume} bytes have been consumed. The terminator is not
 * appended.
 *
 * @param str destination of the line content
 * @param maxLineLength upper bound on the bytes appended to {@code str} by
 *        this call; further bytes of the line are consumed but dropped
 * @param maxBytesToConsume the loop stops once this many bytes have been
 *        consumed, even if no terminator was found
 * @param clear whether {@code str} is cleared before reading
 * @return the number of bytes consumed, terminator included
 * @throws IOException if more than {@code Integer.MAX_VALUE} bytes were
 *         consumed
 */
private int readDefaultLine(Text str, int maxLineLength, int maxBytesToConsume, boolean clear) throws IOException {
      if (clear) {
          str.clear();
      }
      // Bytes appended by this call; this equals str.getLength() only when
      // str was cleared above.
      int txtLength = 0;
      int newlineLength = 0; // length of terminating newline
      boolean prevCharCR = false; // true if prev char was CR
      long bytesConsumed = 0;
      do {
          int startPosn = bufferPosn; // starting from where we left off the last time
          if (bufferPosn >= bufferLength) {
              // Buffer exhausted: refill it and restart from its beginning.
              startPosn = bufferPosn = 0;
              if (prevCharCR) {
                  ++bytesConsumed; // account for CR from previous read
              }
              bufferLength = fillBuffer(in, buffer, prevCharCR);
              if (bufferLength <= 0) {
                  break; // EOF
              }
          }
          for (; bufferPosn < bufferLength; ++bufferPosn) { // search for newline
              if (buffer[bufferPosn] == LF) {
                  // LF alone counts 1 byte, CR + LF counts 2.
                  newlineLength = (prevCharCR) ? 2 : 1;
                  ++bufferPosn; // at next invocation proceed from following byte
                  break;
              }
              if (prevCharCR) { // CR + notLF, we are at notLF
                  // The lone CR was the terminator; the current byte is left unread.
                  newlineLength = 1;
                  break;
              }
              prevCharCR = (buffer[bufferPosn] == CR);
          }
          int readLength = bufferPosn - startPosn;
          if (prevCharCR && newlineLength == 0) {
              --readLength; // CR at the end of the buffer
          }
          bytesConsumed += readLength;
          // Exclude the terminator from the content that is appended.
          int appendLength = readLength - newlineLength;
          if (appendLength > maxLineLength - txtLength) {
              appendLength = maxLineLength - txtLength;
          }
          if (appendLength > 0) {
              str.append(buffer, startPosn, appendLength);
              txtLength += appendLength;
          }
      } while (newlineLength == 0 && bytesConsumed < maxBytesToConsume);
      if (bytesConsumed > Integer.MAX_VALUE) {
          throw new IOException("Too many bytes before newline: " + bytesConsumed);
      }
      return (int) bytesConsumed;
  }

最后就可以来测试了

先看看测试前的文件内容
这里写图片描述

测试类

package com.my.lingRecordReader;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

import com.my.input.myInputFormat;

/**
 * Map-only driver used to try out the multi-line input format: every record
 * delivered by the input format is written out unchanged together with its
 * input key.
 *
 * <p>NOTE(review): {@code MyTextInputFormat} is referenced below but the
 * visible import list imports {@code com.my.input.myInputFormat} instead -
 * confirm that the class resolves.
 */
public class myTest {
    // Number of map() calls made so far; only used for the debug output below.
    static int count=0;

    /** Mapper that emits each input value as the output key and the input key as the output value. */
    public static class MyTestMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        /**
         * Writes the record through unchanged and prints a running call counter.
         *
         * @param key key supplied by the record reader
         * @param value record content supplied by the record reader
         * @param context task context the output pair is written to
         */
        @Override
        protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            count++;
            context.write(new Text(value), key);
            System.out.println("========>"+count);
        }
    }

    /**
     * Configures and runs the job, then terminates the JVM.
     *
     * <p>The exit status is 0 only when the job completed successfully. Any
     * exception during setup or execution now yields status 1; before, the
     * stack trace was printed and the JVM still exited with status 0.
     *
     * @param args generic Hadoop options followed by the input path and the
     *        output path prefix
     */
    public static void main(String[] args) {

        Configuration conf=new Configuration();
        // Stays at 1 unless the job is known to have succeeded.
        int exitCode = 1;
        try {
            // Strip the generic Hadoop options; what remains are the paths.
            String[] paths=new GenericOptionsParser(conf,args).getRemainingArgs();
            if(paths.length<2){
                throw new RuntimeException("必須輸出 輸入 和输出路径");
            }
            // Create the job and give it a name.
            Job job=Job.getInstance(conf,"myTest");
            // Tell Hadoop which jar contains this program.
            job.setJarByClass(myTest.class);
            // Mapper and input format to use.
            job.setMapperClass(MyTestMapper.class);
            job.setInputFormatClass(MyTextInputFormat.class);
            // Map output key and value types.
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(LongWritable.class);
            // Input directory, and an output directory made unique by appending
            // the current time to the given prefix.
            FileInputFormat.addInputPath(job, new Path(paths[0]));
            FileOutputFormat.setOutputPath(job, new Path(paths[1] + System.currentTimeMillis()));
            // Run the job and wait for it to finish.
            exitCode = job.waitForCompletion(true) ? 0 : 1;
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            // Keep the interruption visible to whoever runs after this handler.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        }
        System.exit(exitCode);
    }
}

测试结果：
这里写图片描述