InputFormat(代码以TextInputFormat为例) :
FileInputFormat中有三个重要方法:
1).isSplitable
2).getSplits
3).createRecordReader
一.isSplitable方法:
该方法返回值为boolean类型,用于判断输入文件是否可以进行分片。
二.getSplits方法:
该方法返回值为List<InputSplit>。如果isSplitable返回值为true,则该方法返回的是对文件切分之后得到的各个分片,否则每个文件整体作为一个分片返回。
InputSplit是一个抽象类,FileSplit继承该类,FileSplit有以下属性:
private Path file; // path of the file this split belongs to
private long start; // offset within the file at which this split begins
private long length; // size of this split
private String[] hosts; // hosts that store this split's data
getSplits方法源码:
/**
* Generate the list of files and make them into FileSplits.
* @param job the job context
* @throws IOException
*/
public List<InputSplit> getSplits(JobContext job) throws IOException {
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
long maxSize = getMaxSplitSize(job);
// generate splits
List<InputSplit> splits = new ArrayList<InputSplit>();
List<FileStatus> files = listStatus(job); //该方法会遍历输入目录和目录的子目录,将文件信息保存到List中。
for (FileStatus file: files) { //遍历该List将文件放入到 splits 中。
Path path = file.getPath();
long length = file.getLen();
if (length != 0) {
BlockLocation[] blkLocations;
if (file instanceof LocatedFileStatus) {
blkLocations = ((LocatedFileStatus) file).getBlockLocations();
} else {
FileSystem fs = path.getFileSystem(job.getConfiguration());
blkLocations = fs.getFileBlockLocations(file, 0, length);
}
if (isSplitable(job, path)) { //如果为分片
long blockSize = file.getBlockSize();
long splitSize = computeSplitSize(blockSize, minSize, maxSize); //获取分片大小
long bytesRemaining = length;
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) { // SPLIT_SLOP值为1.1 如果 文件总大小/分片大小<1.1 即使该文件大小大于块大小,那该文件也不会分割。
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, splitSize,blkLocations[blkIndex].getHosts())); // makeSplit方法: protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
// return new FileSplit(file, start, length, hosts);
if (bytesRemaining != 0) {
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
blkLocations[blkIndex].getHosts()));
}
} else { // not splitable //如果不分片,则直接将文件放入到List中,起始位置为0,大小为文件总大小。
splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts()));
}
} else {
//Create empty hosts array for zero length files
splits.add(makeSplit(path, 0, length, new String[0]));
}
}
// Save the number of input files for metrics/loadgen
job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
LOG.debug("Total # of splits: " + splits.size());
return splits;
}
三.createRecordReader方法:
/**
* Generate the list of files and make them into FileSplits.
* @param job the job context
* @throws IOException
*/
public List<InputSplit> getSplits(JobContext job) throws IOException {
  // Split-size bounds: the lower bound is the larger of the format-level and
  // the job-level minimum; the upper bound is read from the job.
  final long lowerBound = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
  final long upperBound = getMaxSplitSize(job);

  final List<InputSplit> result = new ArrayList<InputSplit>();
  final List<FileStatus> inputs = listStatus(job);
  for (FileStatus status : inputs) {
    final Path path = status.getPath();
    final long fileLen = status.getLen();

    if (fileLen == 0) {
      // Zero-length file: one empty split with an empty hosts array.
      result.add(makeSplit(path, 0, fileLen, new String[0]));
      continue;
    }

    // Use the block locations carried by the status when available,
    // otherwise ask the file system for them.
    final BlockLocation[] locations;
    if (status instanceof LocatedFileStatus) {
      locations = ((LocatedFileStatus) status).getBlockLocations();
    } else {
      locations = path.getFileSystem(job.getConfiguration())
          .getFileBlockLocations(status, 0, fileLen);
    }

    if (!isSplitable(job, path)) {
      // Not splitable: the whole file is a single split starting at 0.
      result.add(makeSplit(path, 0, fileLen, locations[0].getHosts()));
      continue;
    }

    final long chunk = computeSplitSize(status.getBlockSize(), lowerBound, upperBound);

    // Emit full-size splits while the unassigned part of the file is more
    // than SPLIT_SLOP times the split size.
    long offset = 0;
    while (((double) (fileLen - offset)) / chunk > SPLIT_SLOP) {
      final int blk = getBlockIndex(locations, offset);
      result.add(makeSplit(path, offset, chunk, locations[blk].getHosts()));
      offset += chunk;
    }

    // The remaining bytes, if any, become the final split of this file.
    final long tail = fileLen - offset;
    if (tail != 0) {
      final int blk = getBlockIndex(locations, offset);
      result.add(makeSplit(path, offset, tail, locations[blk].getHosts()));
    }
  }

  // Save the number of input files for metrics/loadgen
  job.getConfiguration().setLong(NUM_INPUT_FILES, inputs.size());
  LOG.debug("Total # of splits: " + result.size());
  return result;
}
该方法返回一个RecordReader对象。以支持分片的LineRecordReader为例:
该对象有四个重要方法:
1).initialize
2).nextKeyValue
3).getCurrentKey
4).getCurrentValue
1.initialize方法:
/**
 * Prepares this reader for one split: records the split's byte range, opens
 * the file, wraps the stream in a {@code LineReader} (decompressing when a
 * codec is found for the file) and positions the reader on the first record
 * this split is responsible for.
 *
 * @param genericSplit the split to read; cast to {@code FileSplit}
 * @param context the task attempt context supplying the configuration
 * @throws IOException propagated from opening, seeking or reading the file
 */
public void initialize(InputSplit genericSplit,
                       TaskAttemptContext context) throws IOException {
  final FileSplit fileSplit = (FileSplit) genericSplit;
  final Configuration conf = context.getConfiguration();
  this.maxLineLength = conf.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);

  // Byte range covered by this split: [start, end).
  start = fileSplit.getStart();
  end = start + fileSplit.getLength();
  final Path path = fileSplit.getPath();

  // open the file and seek to the start of the split
  final FileSystem fs = path.getFileSystem(conf);
  fileIn = fs.open(path);

  // A non-null codec means the input is treated as compressed.
  final CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(path);
  if (codec == null) {
    // Plain file: jump straight to the split start.
    fileIn.seek(start);
    // recordDelimiterBytes, when set, is the custom record terminator.
    in = (this.recordDelimiterBytes != null)
        ? new LineReader(fileIn, conf, this.recordDelimiterBytes)
        : new LineReader(fileIn, conf);
    filePosition = fileIn;
  } else {
    isCompressedInput = true;
    decompressor = CodecPool.getDecompressor(codec);
    if (codec instanceof SplittableCompressionCodec) {
      final SplitCompressionInputStream splitStream =
          ((SplittableCompressionCodec) codec).createInputStream(
              fileIn, decompressor, start, end,
              SplittableCompressionCodec.READ_MODE.BYBLOCK);
      in = (this.recordDelimiterBytes != null)
          ? new LineReader(splitStream, conf, this.recordDelimiterBytes)
          : new LineReader(splitStream, conf);
      // The codec may move the boundaries; adopt its adjusted range.
      start = splitStream.getAdjustedStart();
      end = splitStream.getAdjustedEnd();
      filePosition = splitStream;
    } else {
      in = (this.recordDelimiterBytes != null)
          ? new LineReader(codec.createInputStream(fileIn, decompressor),
              conf, this.recordDelimiterBytes)
          : new LineReader(codec.createInputStream(fileIn, decompressor),
              conf);
      filePosition = fileIn;
    }
  }

  // If this is not the first split, we always throw away first record
  // because we always (except the last split) read one extra line in
  // next() method.
  if (start != 0) {
    start += in.readLine(new Text(), 0, maxBytesToConsume(start));
  }
  this.pos = start;
}
2.nextKeyValue方法:
TaskAttemptContext context) throws IOException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
start = split.getStart(); //获取起始下标
end = start + split.getLength(); //获取大小
final Path file = split.getPath();
// open the file and seek to the start of the split
final FileSystem fs = file.getFileSystem(job);
fileIn = fs.open(file);
CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file); //根据文件后缀名获取相应解码器。 如果输入文件为压缩文件则会自动获取。
if (null!=codec) { //如果是压缩文件
isCompressedInput = true;
decompressor = CodecPool.getDecompressor(codec);
if (codec instanceof SplittableCompressionCodec) {
final SplitCompressionInputStream cIn =
((SplittableCompressionCodec)codec).createInputStream(
fileIn, decompressor, start, end,
SplittableCompressionCodec.READ_MODE.BYBLOCK);
if (null == this.recordDelimiterBytes){ //recordDelimiterBytes为终止符,如果读取一行时遇到该符号则结束。
in = new LineReader(cIn, job); //获取到LineReader LineReader封装了一个InputStream
} else {
in = new LineReader(cIn, job, this.recordDelimiterBytes);
}
start = cIn.getAdjustedStart();
end = cIn.getAdjustedEnd();
filePosition = cIn;
} else {
if (null == this.recordDelimiterBytes) {
in = new LineReader(codec.createInputStream(fileIn, decompressor),
job);
} else {
in = new LineReader(codec.createInputStream(fileIn,
decompressor), job, this.recordDelimiterBytes);
}
filePosition = fileIn;
}
} else {
fileIn.seek(start);
if (null == this.recordDelimiterBytes){
in = new LineReader(fileIn, job);
} else {
in = new LineReader(fileIn, job, this.recordDelimiterBytes);
}
filePosition = fileIn;
}
// If this is not the first split, we always throw away first record
// because we always (except the last split) read one extra line in
// next() method.
if (start != 0) {
start += in.readLine(new Text(), 0, maxBytesToConsume(start));
}
this.pos = start;
}
/**
 * Reads the next record into the current key/value pair.
 *
 * <p>The key is set to {@code pos} as it stands before reading; {@code pos}
 * starts at the split start and is advanced by the value {@code readLine}
 * returns, so it is a position in the input rather than a line counter.
 *
 * @return {@code true} if a record was read; {@code false} (with key and
 *         value reset to {@code null}) when nothing more was read
 * @throws IOException propagated from the underlying reads
 */
public boolean nextKeyValue() throws IOException {
  if (key == null) {
    key = new LongWritable();
  }
  key.set(pos);
  if (value == null) {
    value = new Text();
  }

  int bytesRead = 0;
  // We always read one extra line, which lies outside the upper
  // split limit i.e. (end - 1)
  while (getFilePosition() <= end) {
    final int consumeLimit = Math.max(maxBytesToConsume(pos), maxLineLength);
    bytesRead = in.readLine(value, maxLineLength, consumeLimit);
    pos += bytesRead;
    if (bytesRead < maxLineLength) {
      break;
    }
    // line too long. try again
    LOG.info("Skipped line of size " + bytesRead + " at pos " +
        (pos - bytesRead));
  }

  if (bytesRead != 0) {
    return true;
  }
  // Nothing was read: clear the current pair and report exhaustion.
  key = null;
  value = null;
  return false;
}
3.getCurrentKey 和 getCurrentValue 方法分别用于获取当前的 key 和 value。
if (key == null) {
key = new LongWritable();
}
key.set(pos); //pos默认值为FileSplit的start,也就是行号。
if (value == null) {
value = new Text();
}
int newSize = 0;
// We always read one extra line, which lies outside the upper
// split limit i.e. (end - 1)
while (getFilePosition() <= end) {
newSize = in.readLine(value, maxLineLength, //获取到value
Math.max(maxBytesToConsume(pos), maxLineLength));
pos += newSize;
if (newSize < maxLineLength) {
break;
}
// line too long. try again
LOG.info("Skipped line of size " + newSize + " at pos " +
(pos - newSize));
}
if (newSize == 0) {
key = null;
value = null;
return false;
} else {
return true;
}
}
看一下map的run方法:
/**
 * Drives the map phase for one task: calls {@code setup} once, invokes
 * {@code map} for every key/value pair the context yields, and calls
 * {@code cleanup} afterwards even if the loop throws.
 *
 * @param context the task context that supplies the input pairs
 * @throws IOException propagated from the context or the user callbacks
 * @throws InterruptedException propagated from the context or the user callbacks
 */
public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  try {
    while (true) {
      // Stop as soon as the context reports that no further pair is available.
      if (!context.nextKeyValue()) {
        break;
      }
      map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
  } finally {
    cleanup(context);
  }
}
setup(context);
try {
while (context.nextKeyValue()) {
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
} finally {
cleanup(context);
}
}
可能不是很全面,主要目的为个人备忘
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/29754888/viewspace-1249907/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/29754888/viewspace-1249907/