hadoop split 分片
split
Hadoop中block块大小和split切片大小会影响到MapReduce程序在运行过程中的效率、map的个数。
通过例子 debug 查看hadoop中split的计算方法是怎么设计的。
FileInputFormat
FileInputFormat类在org.apache.hadoop.mapreduce.lib.input包下
computeSplitSize
/**
 * Computes the size of a single input split.
 *
 * <p>The block size is first capped at {@code maxSize}; the capped value is
 * then raised to at least {@code minSize}. Because the lower bound is applied
 * last, {@code minSize} wins when it is larger than {@code maxSize}.
 *
 * <p>Example from the debugging session described in this note:
 * {@code minSize = 1} and {@code maxSize = 9223372036854775807}
 * ({@code Long.MAX_VALUE}), so the result is simply {@code blockSize}.
 *
 * @param blockSize block size of the file
 * @param minSize   lower bound for the split size
 * @param maxSize   upper bound for the split size
 * @return {@code max(minSize, min(maxSize, blockSize))}
 */
protected long computeSplitSize(long blockSize, long minSize,
                                long maxSize) {
  // Upper bound: never exceed maxSize.
  long cappedAtMax = Math.min(maxSize, blockSize);
  // Lower bound: never fall below minSize.
  return Math.max(minSize, cappedAtMax);
}
/**
* 获取splits集合
*/
public List<InputSplit> getSplits(JobContext job) throws IOException {
StopWatch sw = new StopWatch().start();
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); //1
long maxSize = getMaxSplitSize(job);//9223372036854775807
// 创建splits集合
List<InputSplit> splits = new ArrayList<InputSplit>();
//得到hdfs文件列表
List<FileStatus> files = listStatus(job);
//对文件列表进行遍历
for (FileStatus file: files) {
Path path = file.getPath();//路径
long length = file.getLen();//文件长度
if (length != 0) {
//文件块的文位置
BlockLocation[] blkLocations;
if (file instanceof LocatedFileStatus) {
blkLocations = ((LocatedFileStatus) file).getBlockLocations(); //得到文件块的位置
} else {
FileSystem fs = path.getFileSystem(job.getConfiguration());
blkLocations = fs.getFileBlockLocations(file, 0, length);
}
//判断文件是否可切分
if (isSplitable(job, path)) {
long blockSize = file.getBlockSize();//33554432
long splitSize = computeSplitSize(blockSize, minSize, maxSize); //blockSize
long bytesRemaining = length;
private static final double SPLIT_SLOP = 1.1; // 10% slop
当文件长度/splitsize > 1.1 时
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, splitSize,
blkLocations[blkIndex].getHosts(),
blkLocations[blkIndex].getCachedHosts()));//splits添加分片
bytesRemaining -= splitSize;
}
当还有剩余的文件
if (bytesRemaining != 0) {
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(makeSplit(path, length-bytesRemaining, bytesRemaining,
blkLocations[blkIndex].getHosts(),
blkLocations[blkIndex].getCachedHosts()));//加入集合
}
} else { // 不可切分
splits.add(makeSplit(path, 0, length, blkLocations[0].getHosts(),
blkLocations[0].getCachedHosts()));
}
} else {
//Create empty hosts array for zero length files
//文件长度为0时创建空文件
splits.add(makeSplit(path, 0, length, new String[0]));
}
}
......
//返回切片集合
return splits;
}
/**
 * Reports whether the given file may be cut into more than one split.
 *
 * <p>This implementation ignores both arguments and always returns
 * {@code true}. NOTE(review): being {@code protected}, it is presumably meant
 * to be overridden for formats that cannot be split — confirm.
 *
 * @param context  the job context (unused here)
 * @param filename the file being considered (unused here)
 * @return always {@code true}
 */
protected boolean isSplitable(JobContext context, Path filename) {
return true;
}
/**
 * Wraps a byte range of a file in a {@link FileSplit}.
 *
 * <p>All arguments are handed to the {@code FileSplit} constructor unchanged.
 *
 * @param file          path of the file the split belongs to
 * @param start         offset of the first byte of the split
 * @param length        number of bytes in the split
 * @param hosts         hosts passed through to the split; {@code getSplits}
 *                      supplies the block's {@code getHosts()} here
 * @param inMemoryHosts hosts passed through to the split; {@code getSplits}
 *                      supplies the block's {@code getCachedHosts()} here
 * @return the newly created split
 */
protected FileSplit makeSplit(Path file, long start, long length,
                              String[] hosts, String[] inMemoryHosts) {
  FileSplit split = new FileSplit(file, start, length, hosts, inMemoryHosts);
  return split;
}
总结
split 分片的计算跟 blockSize, minSize, maxSize 三个参数有关
假如 blockSize 设置128 M
文件大小 200M
那么splitSize 就是128M
200/128≈1.56 > 1.1,先创建 1 个 128M 的 split(通过源码可以看出,最后一个 split 的大小在 0 ~ 128M+12.8M 之间)
剩下 72M,72/128≈0.56 不大于 1.1,退出循环,再为剩余部分创建一个 split
最终就会产生 2 个 split
大文件不好模拟 可以通过下面参数设置 split 改小点 打断点就方便了
configuration.setLong("mapred.max.split.size", 512);
configuration.setLong("mapred.min.split.size", 512);