package cn.mrzhou.test;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;
import org.apache.hadoop.util.LineReader;
/**
 * Input format that packs several files (or file chunks) into one combined split and
 * reads each of them line by line.
 *
 * <p>Keys are {@link LongWritable} byte offsets and values are {@link Text} lines, both
 * produced by {@code CombineLineRecordReader}.
 *
 * @author zhoulei
 * @version 1.0.0 2013-07-26
 */
public class MyCombineFileInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    /**
     * Creates the reader for a combined split.
     *
     * <p>The returned {@link CombineFileRecordReader} is only a wrapper around
     * {@code CombineLineRecordReader}; it drives the real reader through reflection:
     * <ul>
     *   <li>Its constructor obtains the reader's constructor via
     *       {@code rrClass.getDeclaredConstructor(constructorSignature)}, calls
     *       {@code setAccessible(true)} on it and then calls
     *       {@code initNextRecordReader()}.</li>
     *   <li>{@code initNextRecordReader()} instantiates the real reader with
     *       {@code rrConstructor.newInstance(new Object[] {split, context, Integer.valueOf(idx)})}.</li>
     *   <li>{@code idx} is the index of the current path: a combined split may hold several
     *       paths, and they are handed to the real reader one at a time. The index grows by
     *       one on every {@code initNextRecordReader()} call, and iteration stops once
     *       {@code idx == split.getNumPaths()}.</li>
     * </ul>
     *
     * @param split   the split to read; cast to {@link CombineFileSplit}
     * @param context the task attempt context passed on to the wrapper
     * @return a {@link CombineFileRecordReader} delegating to {@code CombineLineRecordReader}
     * @throws IOException if the wrapper cannot be created
     */
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {
        CombineFileSplit combinedSplit = (CombineFileSplit) split;
        return new CombineFileRecordReader<LongWritable, Text>(
                combinedSplit, context, CombineLineRecordReader.class);
    }
}
/**
 * Reads one file chunk of a {@link CombineFileSplit} line by line.
 *
 * <p>The key is the byte position recorded before a line is read and the value is the
 * line's text. All set-up happens in the constructor; {@link #initialize} is a no-op.
 */
class CombineLineRecordReader extends RecordReader<LongWritable, Text> {
    /** Offset of the first byte this reader is responsible for (after skipping a partial line). */
    private long start;
    /** Offset one past the last byte of the chunk. */
    private long end;
    /** Current read position in the file. */
    private long pos;
    private Path path;
    private LineReader reader;
    private LongWritable key;
    private Text value;
    private int maxLineLength;

    /**
     * Constructor looked up reflectively by {@code CombineFileRecordReader}.
     *
     * <p>{@code CombineFileRecordReader} resolves the constructor with
     * {@code getDeclaredConstructor(CombineFileSplit.class, TaskAttemptContext.class, Integer.class)},
     * which requires the declared parameter types to match exactly; declaring the first
     * parameter as {@link InputSplit} would not be found.
     *
     * @param split   the combined split that contains the chunk to read
     * @param context task attempt context supplying the configuration
     * @param i       index of the chunk (path) inside {@code split}
     * @throws IOException if the file cannot be opened, positioned or read
     */
    public CombineLineRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer i) throws IOException {
        maxLineLength = context.getConfiguration().getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
        start = split.getOffset(i);
        end = start + split.getLength(i);
        path = split.getPath(i);

        // Resolve the file system from the path itself so that paths living on a
        // non-default file system are opened correctly.
        FileSystem fs = path.getFileSystem(context.getConfiguration());
        FSDataInputStream in = fs.open(path);
        boolean ready = false;
        try {
            boolean skipFirstLine = false;
            if (start != 0) {
                // Back up one byte so that a line beginning exactly at the chunk offset
                // is not lost when the first (possibly partial) line is discarded.
                skipFirstLine = true;
                --start;
                in.seek(start);
            }
            reader = new LineReader(in);
            if (skipFirstLine) {
                // Skip the first line and re-establish the start offset.
                int skipped = reader.readLine(new Text(), 0,
                        (int) Math.min((long) Integer.MAX_VALUE, end - start));
                start += skipped;
            }
            ready = true;
        } finally {
            if (!ready) {
                // Do not leak the stream when positioning or the initial skip fails.
                try {
                    in.close();
                } catch (IOException ignored) {
                    // The original failure is the one worth reporting.
                }
            }
        }
        pos = start;
    }

    /**
     * Convenience overload kept for callers that only hold an {@link InputSplit}.
     *
     * @param split   must be a {@link CombineFileSplit}
     * @param context task attempt context supplying the configuration
     * @param i       index of the chunk (path) inside {@code split}
     * @throws IOException if the file cannot be opened, positioned or read
     * @throws ClassCastException if {@code split} is not a {@link CombineFileSplit}
     */
    public CombineLineRecordReader(InputSplit split, TaskAttemptContext context, Integer i) throws IOException {
        this((CombineFileSplit) split, context, i);
    }

    /** No-op: the reader is fully set up by its constructor. */
    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    }

    /**
     * Advances to the next line of the chunk.
     *
     * @return {@code true} if a line was read; {@code false} when the chunk is exhausted,
     *         in which case the current key and value are reset to {@code null}
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (key == null) {
            key = new LongWritable();
        }
        key.set(pos); // the byte offset is used as the key
        if (value == null) {
            value = new Text();
        }
        int newSize = 0;
        while (pos < end) {
            newSize = reader.readLine(value, maxLineLength,
                    Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
            if (newSize == 0) {
                break;
            }
            pos += newSize;
            if (newSize < maxLineLength) {
                break;
            }
            // Otherwise the line reached the length limit: loop and read the next one.
        }
        if (newSize == 0) {
            key = null;
            value = null;
            return false;
        }
        return true;
    }

    /** @return the key set by the last {@link #nextKeyValue()} call, or {@code null} at the end */
    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        return key;
    }

    /** @return the line read by the last {@link #nextKeyValue()} call, or {@code null} at the end */
    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        return value;
    }

    /**
     * @return the fraction of the chunk consumed so far, capped at {@code 1.0};
     *         {@code 0.0} when the chunk is empty
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        if (start == end) {
            return 0.0f;
        }
        return Math.min(1.0f, (pos - start) / (float) (end - start));
    }

    /** Closes the underlying line reader, if one was created. */
    @Override
    public void close() throws IOException {
        if (reader != null) {
            reader.close();
        }
    }
}