CombineFileInputFormat

最新推荐文章于 2019-05-15 16:46:41 发布

Mr-zhou

最新推荐文章于 2019-05-15 16:46:41 发布

阅读量1.9k

点赞数

分类专栏：云计算 hadoop hbase mapreduce

本文链接：https://blog.csdn.net/zhouleilei/article/details/9852203

版权

云计算 hadoop hbase mapreduce 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

package cn.mrzhou.test;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.util.LineReader;
/**
 * Input format that packs several file chunks into one split and reads each
 * chunk line by line, producing (byte offset, line text) records.
 *
 * @author zhoulei
 * @version 1.0.0 2013-07-26
 */
public class MyCombineFileInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    /**
     * Builds the reader for a combined split.
     *
     * <p>The returned {@link CombineFileRecordReader} is only a wrapper around
     * {@code CombineLineRecordReader}. In its constructor it obtains the wrapped
     * class's constructor reflectively ({@code getDeclaredConstructor} followed by
     * {@code setAccessible(true)}) and then calls {@code initNextRecordReader()}.
     * That method instantiates the wrapped reader with
     * {@code (split, context, Integer.valueOf(idx))}.
     *
     * <p>{@code idx} is the ordinal of the chunk inside the combined split. A
     * combined split may carry several paths, and each one is handed to its own
     * reader instance in turn. {@code idx} is incremented on every
     * {@code initNextRecordReader()} call, and iteration stops once
     * {@code idx == split.getNumPaths()}.
     *
     * @param split   the split to read; cast to {@link CombineFileSplit}
     * @param context the task attempt context
     * @return a record reader that walks every chunk of the split
     * @throws IOException if the underlying reader cannot be created
     */
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
        final CombineFileSplit combinedSplit = (CombineFileSplit) split;
        final Class<CombineLineRecordReader> perChunkReaderType = CombineLineRecordReader.class;
        return new CombineFileRecordReader<LongWritable, Text>(combinedSplit, context, perChunkReaderType);
    }

}

/**
 * Line-oriented record reader for one chunk of a {@link CombineFileSplit}.
 *
 * <p>Keys are the byte offset at which a read started; values are the line text.
 * All set-up work happens in the constructor, so {@link #initialize} is a no-op.
 */
class CombineLineRecordReader extends RecordReader<LongWritable, Text> {
    /** Offset of the first byte this reader is responsible for (adjusted past a partial first line). */
    private long start;
    /** Offset one past the last byte of the chunk. */
    private long end;
    /** Length of the chunk as reported by the split. */
    private long aLength;
    /** File that contains the chunk. */
    private Path path;
    private LineReader reader;
    /** Current read position in the file. */
    private long pos;
    private LongWritable key;
    private Text value;
    private int maxLineLength;

    /**
     * Creates a reader for chunk {@code i} of {@code split}.
     *
     * <p>This is the constructor {@code CombineFileRecordReader} resolves
     * reflectively. {@code Class.getDeclaredConstructor} matches declared parameter
     * types exactly, so the first parameter must be {@link CombineFileSplit}
     * (not its supertype) and the index must be a boxed {@link Integer}.
     *
     * @param split   the combined split that owns the chunk
     * @param context task attempt context supplying the configuration
     * @param i       index of the chunk inside {@code split}
     * @throws IOException if the file cannot be opened or positioned
     */
    public CombineLineRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer i) throws IOException {
        maxLineLength = context.getConfiguration().getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
        start = split.getOffset(i);
        aLength = split.getLength(i);
        end = start + aLength;
        path = split.getPath(i);

        // Resolve the filesystem from the path itself so that chunks living on a
        // filesystem other than the configured default can still be opened.
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        FSDataInputStream in = fs.open(path);

        boolean ready = false;
        try {
            boolean skipFirstLine = false;
            if (start != 0) {
                // Step back one byte: if the previous byte ends a line, the skip
                // below consumes only that terminator and no record is lost.
                skipFirstLine = true;
                --start;
                in.seek(start);
            }
            reader = new LineReader(in);
            if (skipFirstLine) {
                // Skip the (possibly partial) first line and re-establish "start".
                int readNum = reader.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
                start += readNum;
            }
            this.pos = start;
            ready = true;
        } finally {
            if (!ready) {
                // Construction failed: nobody will call close(), so release the stream here.
                try {
                    in.close();
                } catch (IOException ignored) {
                    // The original failure is the one worth reporting.
                }
            }
        }
    }

    /**
     * Backward-compatible overload taking the split as a plain {@link InputSplit}.
     *
     * @param split   must actually be a {@link CombineFileSplit}
     * @param context task attempt context supplying the configuration
     * @param i       index of the chunk inside {@code split}
     * @throws IOException        if the file cannot be opened or positioned
     * @throws ClassCastException if {@code split} is not a {@link CombineFileSplit}
     */
    public CombineLineRecordReader(InputSplit split, TaskAttemptContext context, Integer i) throws IOException {
        this((CombineFileSplit) split, context, i);
    }

    /** Intentionally empty: the constructor already performs all initialization. */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    }

    /**
     * Reads the next line of the chunk into the current key/value pair.
     *
     * @return {@code true} if a record was read, {@code false} once the chunk is exhausted
     * @throws IOException if reading from the stream fails
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new LongWritable();
        }
        key.set(pos); // the byte offset is used as the key
        if (value == null) {
            value = new Text();
        }
        int newSize = 0;
        while (pos < end) {
            newSize = reader.readLine(value, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
            if (newSize == 0) {
                break; // end of stream
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                break; // got a line within the length limit
            }
            // Otherwise the line was too long; loop to read the next one.
        }
        if (newSize == 0) {
            key = null;
            value = null;
            return false;
        }
        return true;
    }

    /** @return the key of the current record, or {@code null} after the last record */
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /** @return the value of the current record, or {@code null} after the last record */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /** @return fraction of the chunk consumed so far, in the range [0, 1] */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (start == end) {
            return 0.0f;
        }
        return Math.min(1.0f, (pos - start) / (float) (end - start));
    }

    /** Closes the underlying line reader, if one was created. */
    @Override
    public void close() throws IOException {
        if (reader != null) {
            reader.close();
        }
    }

}

Mr-zhou

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
CombineFileInputFormat

package cn.mrzhou.test;import java.io.IOException;import org.apache.hadoop.fs.FSDataInputStream;import org.apache.hadoop.fs.FileSystem;import org.apache.hadoop.fs.Path;import org.apache.hado
复制链接

扫一扫

专栏目录