流程
1)balancer由master启动
2)默认balancer 是StochasticLoadBalancer
/**
 * Returns the load balancer class that is used by default.
 *
 * @return {@code StochasticLoadBalancer.class}
 */
public static Class<? extends LoadBalancer> getDefaultLoadBalancerClass() {
  return StochasticLoadBalancer.class;
}
3)balancer由AssignmentManager管理
this.assignmentManager = new AssignmentManager(this, serverManager, this.balancer, this.service, this.metricsMaster, this.tableLockManager);
4)集群状态守护进程ClusterStatusChore不断更新balancer里的集群信息。每1分钟一次
this.clusterStatusChore = new ClusterStatusChore(this, balancer);
/**
 * Hands the master's current cluster status to the balancer.
 * An {@link InterruptedIOException} raised while doing so is logged as a warning
 * and not rethrown.
 */
protected void chore() {
  try {
    balancer.setClusterStatus(master.getClusterStatus());
  } catch (InterruptedIOException interrupted) {
    LOG.warn("Ignoring interruption", interrupted);
  }
}
5)balancer自己的守护进程BalancerChore
默认每5分钟(hbase.balancer.period,默认值300000毫秒)调用一次均衡
/**
 * Creates the balancer chore for the given master.
 * Passes the master's server name suffixed with "-BalancerChore", the master itself, and the
 * integer read from the "hbase.balancer.period" configuration key (300000 supplied as the
 * default argument) to the superclass constructor.
 *
 * @param master the master whose {@code balance()} method this chore invokes
 */
public BalancerChore(HMaster master) {
  super(
      master.getServerName() + "-BalancerChore",
      master,
      master.getConfiguration().getInt("hbase.balancer.period", 300000));
  this.master = master;
}

/**
 * Triggers one balancing run on the master.
 * An {@link IOException} from the run is logged as an error and not rethrown.
 */
@Override
protected void chore() {
  try {
    master.balance();
  } catch (IOException balanceFailure) {
    LOG.error("Failed to balance.", balanceFailure);
  }
}
6)均衡如下:
public boolean balance(boolean force) throws IOException {
//此处有省略,主要是判断是否能进行balance,比如有Server dead时不能,有新启动的也不能
int maximumBalanceTime = getBalancerCutoffTime();
//获取每个表、每台主机对应的HRegionInfo:一个表放在了哪些机器上,每台机器上又有多少个这个表的region。
Map<TableName, Map<ServerName, List<HRegionInfo>>> assignmentsByTable =
this.assignmentManager.getRegionStates().getAssignmentsByTable();
List<RegionPlan> plans = new ArrayList<RegionPlan>();
//Give the balancer the current cluster state.
this.balancer.setClusterStatus(getClusterStatus());
//按照每个表生成计划。
for (Entry<TableName, Map<ServerName, List<HRegionInfo>>> e : assignmentsByTable.entrySet()) {
List<RegionPlan> partialPlans = this.balancer.balanceCluster(e.getKey(), e.getValue());
if (partialPlans != null) plans.addAll(partialPlans);
}
long cutoffTime = System.currentTimeMillis() + maximumBalanceTime;
int rpCount = 0; // number of RegionPlans balanced so far
long totalRegPlanExecTime = 0;
if (plans != null && !plans.isEmpty()) {
for (RegionPlan plan: plans) {
LOG.info("balance " + plan);
long balStartTime = System.currentTimeMillis();
//TODO: bulk assign
//执行计划
this.assignmentManager.balance(plan);
}
//.....(此处有省略)
}
7)生成计划过程
@Override
public synchronized List<RegionPlan> balanceCluster(Map<ServerName,
List<HRegionInfo>> clusterState) {
//首先移动一些本应该在Master主机的RegionServer上的表,或者本不应该在上面、需要移走的表
//判断依据:master包含这个表,且副本为0(默认副本)。对应的判断语句(摘自别处,仅作说明):
//  return tablesOnMaster.contains(region.getTable().getNameAsString())
//      && region.getReplicaId() == HRegionInfo.DEFAULT_REPLICA_ID;
List<RegionPlan> plans = balanceMasterRegions(clusterState);
if (plans != null || clusterState == null || clusterState.size() <= 1) {
return plans;
}
if (masterServerName != null && clusterState.containsKey(masterServerName)) {
if (clusterState.size() <= 2) {
return null;
}
clusterState = new HashMap<ServerName, List<HRegionInfo>>(clusterState);
clusterState.remove(masterServerName);
}
RegionLocationFinder finder = null;
if (this.localityCost != null && this.localityCost.getMultiplier() > 0) {
finder = this.regionFinder;
}
//检查整个集群。
Cluster cluster = new Cluster(clusterState, loads, finder, rackManager);
//判断是否需要均衡。
//判断依据:当前表在每个RegionServer上的region数目是否均衡。
//float average = cs.getLoadAverage(); // for logging
//int floor = (int) Math.floor(average * (1 - slop));
//int ceiling = (int) Math.ceil(average * (1 + slop));
if (!needsBalance(cluster)) {
return null;
}
//既然需要移动,肯定需要计算价值,下面就是算价值
long startTime = EnvironmentEdgeManager.currentTime();
initCosts(cluster);
double currentCost = computeCost(cluster, Double.MAX_VALUE);
curOverallCost = currentCost;
for (int i = 0; i < this.curFunctionCosts.length; i++) {
curFunctionCosts[i] = tempFunctionCosts[i];
}
double initCost = currentCost;
double newCost = currentCost;
//如果要移动,最大不超过 800步×numRegions×numServers(同时不超过maxSteps)
long computedMaxSteps = Math.min(this.maxSteps,
((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers));
// Perform a stochastic walk to see if we can get a good fit.
long step;
for (step = 0; step < computedMaxSteps; step++) {
int generatorIdx = RANDOM.nextInt(candidateGenerators.length);
CandidateGenerator p = candidateGenerators[generatorIdx];
Cluster.Action action = p.generate(cluster);
if (action.type == Type.NULL) {
continue;
}
cluster.doAction(action);
updateCostsWithAction(cluster, action);
newCost = computeCost(cluster, currentCost);
// Should this be kept?
if (newCost < currentCost) {
currentCost = newCost;
// save for JMX
curOverallCost = currentCost;
for (int i = 0; i < this.curFunctionCosts.length; i++) {
curFunctionCosts[i] = tempFunctionCosts[i];
}
} else {
// Put things back the way they were before.
// TODO: undo by remembering old values
Action undoAction = action.undoAction();
cluster.doAction(undoAction);
updateCostsWithAction(cluster, undoAction);
}
if (EnvironmentEdgeManager.currentTime() - startTime >
maxRunningTime) {
break;
}
}
long endTime = EnvironmentEdgeManager.currentTime();
metricsBalancer.balanceCluster(endTime - startTime);
// update costs metrics
updateStochasticCosts(tableName, curOverallCost, curFunctionCosts);
//算完代价后会将移动计划写清楚
if (initCost > currentCost) {
plans = createRegionPlans(cluster);
if (LOG.isDebugEnabled()) {
LOG.debug("Finished computing new load balance plan. Computation took "
+ (endTime - startTime) + "ms to try " + step
+ " different iterations. Found a solution that moves "
+ plans.size() + " regions; Going from a computed cost of "
+ initCost + " to a new cost of " + currentCost);
}
return plans;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Could not find a better load balance plan. Tried "
+ step + " different configurations in " + (endTime - startTime)
+ "ms, and did not find anything with a computed cost less than " + initCost);
}
return null;
}
8)执行计划
public void assign(RegionState state,
boolean setOfflineInZK, final boolean forceNewPlan) {
long startTime = EnvironmentEdgeManager.currentTime();
try {
HRegionInfo region = state.getRegion();
//更新状态
currentState = regionStates.updateRegionState(region,
State.PENDING_OPEN, plan.getDestination());
boolean needNewPlan;
final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() +
" to " + plan.getDestination();
try {
List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
if (this.shouldAssignRegionsWithFavoredNodes) {
favoredNodes = ((FavoredNodeLoadBalancer)this.balancer).getFavoredNodes(region);
}
//真正执行
regionOpenState = serverManager.sendRegionOpen(
plan.getDestination(), region, versionOfOfflineNode, favoredNodes);
//失败需要再试一试
if (regionOpenState == RegionOpeningState.FAILED_OPENING) {
// Failed opening this region, looping again on a new server.
needNewPlan = true;
LOG.warn(assignMsg + ", regionserver says 'FAILED_OPENING', " +
" trying to assign elsewhere instead; " +
"try=" + i + " of " + this.maximumAttempts);
} else {
//完成了
// we're done
if (regionOpenState == RegionOpeningState.ALREADY_OPENED) {
processAlreadyOpenedRegion(region, plan.getDestination());
}
return;
}
}
//MetaRegion 需要一直重试直到成功,不能失败。
if (i == this.maximumAttempts) {
// For meta region, we have to keep retrying until succeeding
if (region.isMetaRegion()) {
waitForRetryingMetaAssignment();
}
else {
// Don't reset the region state or get a new plan any more.
// This is the last try.
continue;
}
}
if (needNewPlan) {
RegionPlan newPlan = null;
try {
newPlan = getRegionPlan(region, true);
} catch (HBaseIOException e) {
LOG.warn("Failed to get region plan", e);
}
if (newPlan == null) {
regionStates.updateRegionState(region, State.FAILED_OPEN);
LOG.warn("Unable to find a viable location to assign region " +
region.getRegionNameAsString());
return;
}
if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) {
// Clean out plan we failed execute and one that doesn't look like it'll
// succeed anyways; we need a new plan!
// Transition back to OFFLINE
LOG.info("Region assignment plan changed from " + plan.getDestination() + " to "
+ newPlan.getDestination() + " server.");
currentState = regionStates.updateRegionState(region, State.OFFLINE);
versionOfOfflineNode = -1;
if (useZKForAssignment) {
setOfflineInZK = true;
}
plan = newPlan;
} else if(plan.getDestination().equals(newPlan.getDestination()) &&
previousException instanceof FailedServerException) {
}
}
}
// Run out of attempts
regionStates.updateRegionState(region, State.FAILED_OPEN);
} finally {
metricsAssignmentManager.updateAssignmentTime(EnvironmentEdgeManager.currentTime() - startTime);
}
}
到此结束