Parameter Settings
The Hive parameter hive.tez.auto.reducer.parallelism controls whether the parallelism of reduce tasks may be set automatically; the default is false.
The resulting parallelism is allowed to range from defaultReduceNum * 0.25 to defaultReduceNum * 2. The estimate itself is driven by the parameter hive.exec.reducers.bytes.per.reducer: the total amount of data all reducers of this stage must shuffle, divided by hive.exec.reducers.bytes.per.reducer, gives the estimated reducer count.
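A minimal sketch of that arithmetic with illustrative values (the variable names are mine, not Hive's):
long bytesPerReducer = 256L * 1024 * 1024;         // hive.exec.reducers.bytes.per.reducer
long totalShuffleBytes = 10L * 1024 * 1024 * 1024; // all shuffle data of this stage
int estimated = (int) Math.ceil((double) totalShuffleBytes / bytesPerReducer); // 40
// The final choice is bounded by the range described above:
int defaultReduceNum = 30;
int minReducers = Math.max(1, (int) (defaultReduceNum * 0.25)); // 7
int maxReducers = defaultReduceNum * 2;                         // 60
int reducers = Math.min(maxReducers, Math.max(minReducers, estimated)); // 40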
The stage upstream of the reducers still writes its shuffle output to local disk partitioned by the default reducer count. For example, if the reducer count was originally set to 10, the upstream tasks still split their shuffle data into 10 partitions while running, and each upstream task reports the amount of shuffle data it produced when it finishes.
ShuffleVertexManager then computes the target partition count from the reported shuffle data size together with the maximum and minimum partition counts.
For example, if every source task writes 10 output partitions, the target vertex's default parallelism is 10. If the parallelism is lowered to 4, each target task processes 3 of those partitions, except the last task, which processes only 1.
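The same 10-to-4 example in plain Java (a sketch of the bookkeeping, not Tez code):
int numPartitions = 10;     // each source task's output partitions = old target parallelism
int basePartitionRange = 3; // original partitions folded into each new target task
int fullTasks = numPartitions / basePartitionRange;         // 3 tasks take 3 partitions each
int remainder = numPartitions % basePartitionRange;         // 1 partition left over
int finalParallelism = fullTasks + (remainder > 0 ? 1 : 0); // 4 target tasks in total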
GenTezUtils#createReduceWork
if (reduceWork.isAutoReduceParallelism()) {
  // the edge carries the min/max reducer counts and bytes-per-reducer so the
  // vertex manager can reconfigure the parallelism later
  edgeProp =
      new TezEdgeProperty(context.conf, edgeType, true, reduceWork.isSlowStart(),
          reduceWork.getMinReduceTasks(), reduceWork.getMaxReduceTasks(), bytesPerReducer);
}
DagUtils#setupAutoReducerParallelism
private void setupAutoReducerParallelism(TezEdgeProperty edgeProp, Vertex v)
throws IOException {
if (edgeProp.isAutoReduce()) {
Configuration pluginConf = new Configuration(false);
VertexManagerPluginDescriptor desc =
VertexManagerPluginDescriptor.create(ShuffleVertexManager.class.getName());
pluginConf.setBoolean(
ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL, true);
pluginConf.setInt(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_MIN_TASK_PARALLELISM,
edgeProp.getMinReducer());
pluginConf.setLong(
ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE,
edgeProp.getInputSizePerReducer());
UserPayload payload = TezUtils.createUserPayloadFromConf(pluginConf);
desc.setUserPayload(payload);
v.setVertexManagerPlugin(desc);
}
}
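For a standalone Tez job, the same plugin settings can be produced with ShuffleVertexManager's public config builder. A sketch, assuming a Tez version that ships createConfigBuilder:
Configuration conf = new Configuration(false);
VertexManagerPluginDescriptor desc = ShuffleVertexManager
    .createConfigBuilder(conf)
    .setAutoReduceParallelism(true)              // the same switch DagUtils flips above
    .setMinTaskParallelism(10)                   // counterpart of edgeProp.getMinReducer()
    .setDesiredTaskInputSize(256L * 1024 * 1024) // counterpart of getInputSizePerReducer()
    .build();
v.setVertexManagerPlugin(desc);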
ShuffleVertexManager#initConfiguration
Builds a ShuffleVertexManagerBaseConfig object from the configuration.
ShuffleVertexManagerBaseConfig initConfiguration() {
float slowStartMinFraction = conf.getFloat(
TEZ_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION,
TEZ_SHUFFLE_VERTEX_MANAGER_MIN_SRC_FRACTION_DEFAULT);
mgrConfig = new ShuffleVertexManagerConfig(
conf.getBoolean(
TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL,
TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL_DEFAULT),
conf.getLong(
TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE,
TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE_DEFAULT),
slowStartMinFraction,
conf.getFloat(
TEZ_SHUFFLE_VERTEX_MANAGER_MAX_SRC_FRACTION,
Math.max(slowStartMinFraction,
TEZ_SHUFFLE_VERTEX_MANAGER_MAX_SRC_FRACTION_DEFAULT)),
Math.max(1, conf
.getInt(TEZ_SHUFFLE_VERTEX_MANAGER_MIN_TASK_PARALLELISM,
TEZ_SHUFFLE_VERTEX_MANAGER_MIN_TASK_PARALLELISM_DEFAULT)));
return mgrConfig;
}
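The min/max source fractions configured here drive Tez's slow start. A simplified sketch of the ramp they define (my reading of the behavior, not Tez's exact code):
static int tasksToSchedule(int totalTasks, float srcCompletedFraction,
    float minFraction, float maxFraction) {
  if (srcCompletedFraction < minFraction) return 0;           // too early, keep waiting
  if (srcCompletedFraction >= maxFraction) return totalTasks; // schedule everything
  // in between, ramp up the scheduled task count linearly
  float ramp = (srcCompletedFraction - minFraction) / (maxFraction - minFraction);
  return (int) (ramp * totalTasks);
}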
ShuffleVertexManagerBase#processPendingTasks
This method is called from onVertexStarted, onSourceTaskCompleted, and handleVertexStateUpdate.
private void processPendingTasks(TaskAttemptIdentifier completedSourceAttempt) {
if(config.isAutoParallelismEnabled()) {
if (!determineParallelismAndApply()) {
//try to determine parallelism later when more info is available.
return;
}
}
processPendingTasks();
schedulePendingTasks(completedSourceAttempt);
}
private boolean determineParallelismAndApply() {
return determineParallelismAndApply(
getMinSourceVertexCompletedTaskFraction());
}
@VisibleForTesting
boolean determineParallelismAndApply(
    float minSourceVertexCompletedTaskFraction) {
  // Omit some lines (guards that return false when the decision must wait
  // for more completed source tasks)
  ReconfigVertexParams params = computeRouting();
  if (params != null) {
    reconfigVertex(params.getFinalParallelism());
    updatePendingTasks();
    postReconfigVertex();
  }
  return true;
}
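The decision rests on extrapolating the final shuffle size from the source tasks that already completed. A worked sketch with illustrative numbers:
long observedOutputBytes = 6L * 1024 * 1024 * 1024; // reported by completed source tasks
float completedFraction = 0.5f;                     // minSourceVertexCompletedTaskFraction
long expectedTotalBytes = (long) (observedOutputBytes / completedFraction); // 12 GiB
long desiredTaskInputSize = 1024L * 1024 * 1024;    // desired input per reduce task, 1 GiB
int finalParallelism = (int) Math.ceil((double) expectedTotalBytes / desiredTaskInputSize); // 12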
computeRouting
Determines the final task parallelism and assigns an EdgeManagerPluginDescriptor object to each SourceVertexInfo object.
ReconfigVertexParams computeRouting() {
// Omit some lines
EdgeManagerPluginDescriptor descriptor =
EdgeManagerPluginDescriptor.create(CustomShuffleEdgeManager.class.getName());
descriptor.setUserPayload(edgeManagerConfig.toUserPayload());
Iterable<Map.Entry<String, SourceVertexInfo>> bipartiteItr = getBipartiteInfo();
for(Map.Entry<String, SourceVertexInfo> entry : bipartiteItr) {
entry.getValue().newDescriptor = descriptor;
}
ReconfigVertexParams params =
new ReconfigVertexParams(finalTaskParallelism, null);
return params;
}
reconfigVertex
private void reconfigVertex(final int finalTaskParallelism) {
Map<String, EdgeProperty> edgeProperties =
new HashMap<String, EdgeProperty>(bipartiteSources);
// Omit some lines
getContext().reconfigureVertex(finalTaskParallelism, null, edgeProperties);
}
VertexManager$VertexManagerPluginContextImpl#reconfigureVertex
@Override
public synchronized void reconfigureVertex(int parallelism, VertexLocationHint vertexLocationHint,
Map<String, EdgeProperty> sourceEdgeProperties,
Map<String, InputSpecUpdate> rootInputSpecUpdate) {
checkAndThrowIfDone();
try {
managedVertex.reconfigureVertex(parallelism, vertexLocationHint, sourceEdgeProperties,
rootInputSpecUpdate);
} catch (AMUserCodeException e) {
throw new TezUncheckedException(e);
}
}
VertexImpl#reconfigureVertex
@Override
public void reconfigureVertex(int parallelism,
@Nullable VertexLocationHint locationHint,
@Nullable Map<String, EdgeProperty> sourceEdgeProperties) throws AMUserCodeException {
setParallelismWrapper(parallelism, locationHint, sourceEdgeProperties, null, true);
}
VertexImpl#setParallelismWrapper
Re-adjusts the number of reduce tasks.
private void setParallelismWrapper(int parallelism, VertexLocationHint vertexLocationHint,
    Map<String, EdgeProperty> sourceEdgeProperties,
    Map<String, InputSpecUpdate> rootInputSpecUpdates,
    boolean fromVertexManager) throws AMUserCodeException {
  this.setParallelismCalledFlag = true;
  writeLock.lock(); // matches the unlock in the finally block below
  try {
    // Omit some lines
    // Input initializer/Vertex Manager/1-1 split expected to set parallelism.
    if (numTasks == -1) {
      stateChangeNotifier.stateChanged(vertexId,
          new VertexStateUpdateParallelismUpdated(vertexName, numTasks, oldNumTasks));
      this.createTasks();
      setVertexLocationHint(vertexLocationHint);
      LOG.info("Vertex " + getLogIdentifier() +
          " parallelism set to " + parallelism);
      if (canInitVertex()) {
        getEventHandler().handle(new VertexEvent(getVertexId(), VertexEventType.V_READY_TO_INIT));
      }
    } else {
      if (parallelism > numTasks) {
        addTasks((parallelism));
      } else if (parallelism < numTasks) {
        removeTasks(parallelism);
      }
      // notify listeners
      stateChangeNotifier.stateChanged(vertexId,
          new VertexStateUpdateParallelismUpdated(vertexName, numTasks, oldNumTasks));
    }
  } finally {
    writeLock.unlock();
  }
}
postReconfigVertex
postReconfigVertex redefines the mapping between the source vertex's outputs and the target vertex's tasks.
For example, if the target vertex's original parallelism is 10, each source task splits its output into 10 partitions, one per target-vertex task.
After the target vertex's parallelism is re-adjusted to 4, the first three tasks each process 3 output partitions and the last task processes the remaining 1.
@Override
void postReconfigVertex() {
configureTargetMapping(pendingTasks.size());
}
The partitions handled by each task are recorded in the two-dimensional array targetIndexes.
The length of the array's first dimension equals the number of tasks, and targetIndexes[taskId] is the array of original partition indices that this task will process.
private void configureTargetMapping(int tasks) {
targetIndexes = new int[tasks][];
for (int idx = 0; idx < tasks; ++idx) {
int partitionRange = basePartitionRange;
if (idx == (tasks - 1)) {
partitionRange = ((remainderRangeForLastShuffler > 0)
? remainderRangeForLastShuffler : basePartitionRange);
}
// skip the basePartitionRange per destination task
targetIndexes[idx] = createIndices(partitionRange, idx, basePartitionRange);
if (LOG.isDebugEnabled()) {
LOG.debug("targetIdx[{}] to {}", idx,
Arrays.toString(targetIndexes[idx]));
}
}
}
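A runnable sketch of the mapping this produces for the 10-to-4 example; createIndicesDemo mirrors what the private createIndices appears to do, namely return contiguous partition indices starting at taskIndex * offsetPerTask:
import java.util.Arrays;

class TargetMappingDemo {
  static int[] createIndicesDemo(int partitionRange, int taskIndex, int offsetPerTask) {
    int start = taskIndex * offsetPerTask;
    int[] indices = new int[partitionRange];
    for (int i = 0; i < partitionRange; i++) {
      indices[i] = start + i;
    }
    return indices;
  }

  public static void main(String[] args) {
    int tasks = 4, basePartitionRange = 3, remainderRangeForLastShuffler = 1;
    for (int idx = 0; idx < tasks; idx++) {
      int range = (idx == tasks - 1 && remainderRangeForLastShuffler > 0)
          ? remainderRangeForLastShuffler : basePartitionRange;
      System.out.println("targetIndexes[" + idx + "] = "
          + Arrays.toString(createIndicesDemo(range, idx, basePartitionRange)));
    }
    // targetIndexes[0] = [0, 1, 2]
    // targetIndexes[1] = [3, 4, 5]
    // targetIndexes[2] = [6, 7, 8]
    // targetIndexes[3] = [9]
  }
}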
TaskAttemptImpl
When a task attempt succeeds, it sends a VertexEventRouteEvent carrying the attempt's generated events to the vertex.
List<TezEvent> tezEvents = taFinishedEvent.getTAGeneratedEvents();
if (tezEvents != null && !tezEvents.isEmpty()) {
ta.sendEvent(new VertexEventRouteEvent(ta.getVertexID(), tezEvents));
}
VertexImpl
private static class RouteEventTransition implements
MultipleArcTransition<VertexImpl, VertexEvent, VertexState> {
@Override
public VertexState transition(VertexImpl vertex, VertexEvent event) {
VertexEventRouteEvent rEvent = (VertexEventRouteEvent) event;
List<TezEvent> tezEvents = rEvent.getEvents();
try {
vertex.handleRoutedTezEvents(tezEvents, false);
} catch (AMUserCodeException e) {
// Omit some lines
}
return vertex.getState();
}
}
handleRoutedTezEvents
The source vertex forwards the event to the target vertex.
case DATA_MOVEMENT_EVENT:
case COMPOSITE_DATA_MOVEMENT_EVENT:
case INPUT_FAILED_EVENT: // grouped with the cases above, hence the else branch below
{
if (isEventFromVertex(this, sourceMeta)) {
// event from this vertex. send to destination vertex
TezTaskAttemptID srcTaId = sourceMeta.getTaskAttemptID();
if (tezEvent.getEventType() == EventType.DATA_MOVEMENT_EVENT) {
((DataMovementEvent) tezEvent.getEvent()).setVersion(srcTaId.getId());
} else if (tezEvent.getEventType() == EventType.COMPOSITE_DATA_MOVEMENT_EVENT) {
((CompositeDataMovementEvent) tezEvent.getEvent()).setVersion(srcTaId.getId());
} else {
((InputFailedEvent) tezEvent.getEvent()).setVersion(srcTaId.getId());
}
Vertex destVertex = getDAG().getVertex(sourceMeta.getEdgeVertexName());
Edge destEdge = targetVertices.get(destVertex);
eventHandler.handle(new VertexEventRouteEvent(destVertex
.getVertexId(), Collections.singletonList(tezEvent)));
}
handleRoutedTezEvents
When the target vertex receives this event: if its tasks have not been scheduled yet, the event is parked in the pendingTaskEvents queue and re-routed once the tasks have been scheduled.
If the tasks are already scheduled and the edge uses on-demand routing, processOnDemandEvent is called; otherwise the event is sent to the destination tasks directly.
if (tasksNotYetScheduled) {
// this is only needed to support mixed mode routing. Else for
// on demand routing events can be directly added to taskEvents
// when legacy routing is removed then pending task events can be
// removed.
pendingTaskEvents.add(tezEvent);
} else {
// event not from this vertex. must have come from source vertex.
int srcTaskIndex = sourceMeta.getTaskAttemptID().getTaskID().getId();
Vertex edgeVertex = getDAG().getVertex(sourceMeta.getTaskVertexName());
Edge srcEdge = sourceVertices.get(edgeVertex);
if (srcEdge.hasOnDemandRouting()) {
processOnDemandEvent(tezEvent, srcEdge, srcTaskIndex);
} else {
// send to tasks
srcEdge.sendTezEventToDestinationTasks(tezEvent);
}
}
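The park-then-drain pattern in miniature (illustrative Java, not Tez's classes):
import java.util.ArrayDeque;
import java.util.Queue;
import java.util.function.Consumer;

class PendingEventBuffer<E> {
  private final Queue<E> pendingTaskEvents = new ArrayDeque<>();
  private boolean tasksScheduled = false;

  // Before tasks are scheduled, events are parked; afterwards each event
  // goes straight to the on-demand routing path.
  synchronized void onEvent(E event, Consumer<E> routeOnDemand) {
    if (!tasksScheduled) {
      pendingTaskEvents.add(event);
    } else {
      routeOnDemand.accept(event);
    }
  }

  // Called once tasks are scheduled: drain everything that was parked.
  synchronized void onTasksScheduled(Consumer<E> routeOnDemand) {
    tasksScheduled = true;
    E event;
    while ((event = pendingTaskEvents.poll()) != null) {
      routeOnDemand.accept(event);
    }
  }
}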
VertexImpl#processOnDemandEvent
private void processOnDemandEvent(TezEvent tezEvent, Edge srcEdge, int srcTaskIndex) {
onDemandRouteEvents.add(new EventInfo(tezEvent, srcEdge, srcTaskIndex));
}
VertexImpl#getTaskAttemptTezEvents
A task attempt of the target vertex fetches its events through this method.
@Override
public TaskAttemptEventInfo getTaskAttemptTezEvents(TezTaskAttemptID attemptID,
int fromEventId, int preRoutedFromEventId, int maxEvents) {
Task task = getTask(attemptID.getTaskID());
ArrayList<TezEvent> events = task.getTaskAttemptTezEvents(
attemptID, preRoutedFromEventId, maxEvents);
int nextPreRoutedFromEventId = preRoutedFromEventId + events.size();
int nextFromEventId = fromEventId;
try {
int currEventCount = onDemandRouteEvents.size();
for (nextFromEventId = fromEventId; nextFromEventId < currEventCount; ++nextFromEventId) {
boolean earlyExit = false;
if (events.size() == maxEvents) {
break;
}
EventInfo eventInfo = onDemandRouteEvents.get(nextFromEventId);
if (eventInfo.isObsolete) {
// ignore obsolete events
firstEventObsoleted = true;
continue;
}
TezEvent tezEvent = eventInfo.tezEvent;
switch(tezEvent.getEventType()) {
case INPUT_FAILED_EVENT:
case DATA_MOVEMENT_EVENT:
case COMPOSITE_DATA_MOVEMENT_EVENT:
{
int srcTaskIndex = eventInfo.eventTaskIndex;
Edge srcEdge = eventInfo.eventEdge;
PendingEventRouteMetadata pendingRoute = null;
if (isFirstEvent) {
// the first event is the one that can have pending routes because its expanded
// events had not been completely sent in the last round.
isFirstEvent = false;
pendingRoute = srcEdge.removePendingEvents(attemptID);
if (pendingRoute != null) {
// the first event must match the pending route event
// the only reason it may not match is if in between rounds that event got
// obsoleted
if(tezEvent != pendingRoute.getTezEvent()) {
Preconditions.checkState(firstEventObsoleted);
// pending routes can be ignored for obsoleted events
pendingRoute = null;
}
}
}
if (!srcEdge.maybeAddTezEventForDestinationTask(tezEvent, attemptID, srcTaskIndex,
events, maxEvents, pendingRoute)) {
// not enough space left for this iteration events.
// Exit and start from here next time
earlyExit = true;
}
}
break;
// Omit some lines (remaining cases, earlyExit handling, and the close of
// the loop and try block)
return new TaskAttemptEventInfo(nextFromEventId, events, nextPreRoutedFromEventId);
}
public boolean maybeAddTezEventForDestinationTask(TezEvent tezEvent, TezTaskAttemptID attemptID,
int srcTaskIndex, List<TezEvent> listToAdd, int listMaxSize,
PendingEventRouteMetadata pendingRoutes)
throws AMUserCodeException {
try {
EdgeManagerPluginOnDemand edgeManagerOnDemand = (EdgeManagerPluginOnDemand) edgeManager;
int taskIndex = attemptID.getTaskID().getId();
switch (tezEvent.getEventType()) {
case COMPOSITE_DATA_MOVEMENT_EVENT:
{
CompositeDataMovementEvent compEvent = (CompositeDataMovementEvent) tezEvent.getEvent();
CompositeEventRouteMetadata routeMeta = edgeManagerOnDemand
.routeCompositeDataMovementEventToDestination(srcTaskIndex, taskIndex);
if (routeMeta != null) {
CompositeRoutedDataMovementEvent edme = compEvent.expandRouted(routeMeta);
TezEvent tezEventToSend = new TezEvent(edme, tezEvent.getSourceInfo(), tezEvent.getEventReceivedTime());
tezEventToSend.setDestinationInfo(destinationMetaInfo);
listToAdd.add(tezEventToSend);
}
}
break;
// Omit some lines (other event types and the method's closing braces)
ShuffleVertexManager$CustomShuffleEdgeManager
This edge manager maintains the mapping between the source vertex's output partitions and the target vertex's reconfigured parallelism.
@Override
public @Nullable CompositeEventRouteMetadata routeCompositeDataMovementEventToDestination(
int sourceTaskIndex, int destinationTaskIndex)
throws Exception {
int[] targetIndicesToSend;
int partitionRange;
if(destinationTaskIndex == (numDestinationTasks-1)) {
if (remainderRangeForLastShuffler != basePartitionRange) {
targetIndicesToSend = createTargetIndicesForRemainder(sourceTaskIndex);
} else {
targetIndicesToSend = targetIndices[sourceTaskIndex];
}
partitionRange = remainderRangeForLastShuffler;
} else {
targetIndicesToSend = targetIndices[sourceTaskIndex];
partitionRange = basePartitionRange;
}
return CompositeEventRouteMetadata.create(partitionRange, targetIndicesToSend[0],
sourceIndices[destinationTaskIndex][0]);
}