In order to change the average load for a reducer (in bytes):
set hive.exec.reducers.bytes.per.reducer=<number>
In order to limit the maximum number of reducers:
set hive.exec.reducers.max=<number>
In order to set a constant number of reducers:
set mapreduce.job.reduces=<number>
By default, the number of reduce tasks equals the estimated number of reducers (reduceTasks = number of reducers computed by Hive).
1
2
3
4
5
6
7
8
// Divide it by 2 so that we can have more reducers
// BYTESPERREDUCER: Hive conf setting "hive.exec.reducers.bytes.per.reducer" — how many bytes one reducer should handle
// numberOfBytes: total number of input bytes the reduce stage receives (defined outside this snippet)
// maxReducers: upper bound on the reducer count from the Hive configuration ("hive.exec.reducers.max")
long bytesPerReducer =
context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer,
maxReducers, false);
// Below is the function that determines how the reducer count is obtained.
// boolean powersOfTwo: whether to round the reducer count to a power of two
/**
 * Estimates the number of reducers for a job.
 *
 * @param totalInputFileSize total bytes the reduce stage will receive
 * @param bytesPerReducer    target bytes each reducer should process
 * @param maxReducers        hard upper bound on the reducer count
 * @param powersOfTwo        when true, round the estimate to a power of two
 *                           (never exceeding {@code maxReducers})
 * @return the estimated reducer count, always at least 1
 */
public static int estimateReducers(long totalInputFileSize, long bytesPerReducer,
int maxReducers, boolean powersOfTwo) {
// Inputs smaller than one reducer's share are treated as exactly one share.
double effectiveBytes = Math.max(totalInputFileSize, bytesPerReducer);
// Round the ratio up, then clamp the result into [1, maxReducers].
int estimate = (int) Math.ceil(effectiveBytes / bytesPerReducer);
estimate = Math.min(maxReducers, Math.max(1, estimate));

if (!powersOfTwo) {
return estimate;
}

// Smallest power of two strictly greater than the estimate:
// floor(log2(estimate)) + 1, then 2^that exponent.
int exponent = (int) (Math.log(estimate) / Math.log(2)) + 1;
int nextPowerOfTwo = (int) Math.pow(2, exponent);

if (nextPowerOfTwo / 2 == estimate) {
// The estimate is already a power of two — keep it.
return estimate;
}
if (nextPowerOfTwo > maxReducers) {
// Rounding up would exceed the cap; fall back to the preceding power of
// two, which is strictly below the estimate and hence below the cap.
return nextPowerOfTwo / 2;
}
// Otherwise round up to the next power of two.
return nextPowerOfTwo;
}