0. 前言
- SpringBoot版本:2.1.9.RELEASE
- SpringCloud版本:Greenwich.SR4
1. 核心简述
- 实现负载均衡核心:
- RestTemplate:依赖 LoadBalancerInterceptor 拦截请求做相应处理
- Feign:注解了 @FeignClient 的接口创建了代理实现类 ReflectiveFeign.FeignInvocationHandler ,实现类中包含每个方法的代理方法 SynchronousMethodHandler,请求接口中的方法时调用代理实现类中相应的代理方法做相应处理
- 实现负载均衡的核心逻辑是通过负载均衡算法,取出实际需要调用的服务实例,并替换原有请求的 uri
2. 负载均衡策略
- RoundRobinRule:轮询
- RandomRule:随机
- RetryRule:一定时间内不断重试轮询获取服务实例
- BestAvailableRule:获取未熔断且有效请求并发量最小的服务实例
- AvailabilityFilteringRule:轮询遍历服务实例,取出未熔断且有效请求并发量小于阈值的服务实例
- WeightedResponseTimeRule:
- 当服务刚启动时,统计信息不足,使用 RoundRobinRule 策略轮询获取服务实例
- 当服务启动一段时间后,统计信息充足,根据平均响应时间计算出每个服务实例的权重;平均响应时间越小的服务实例权重越大,被选中的概率越高
- ZoneAvoidanceRule:默认策略,根据服务实例所在分区(zone)的性能和服务实例的可用性,复合判断
2.1 RoundRobinRule
/**
 * Round-robin rule: cycles through the load balancer's full server list and returns
 * the first server that is alive and ready to serve.
 */
public class RoundRobinRule extends AbstractLoadBalancerRule {
// ......
/**
 * Picks the next server in round-robin order.
 *
 * @param lb  load balancer that supplies the server lists; null yields null
 * @param key not read by this method
 * @return an alive, ready-to-serve server, or null if {@code lb} is null, either
 *         server list is empty, or all attempts are used up
 */
public Server choose(ILoadBalancer lb, Object key) {
if (lb == null) {
log.warn("no load balancer");
return null;
}
Server server = null;
int count = 0;
// At most 10 attempts.
while (server == null && count++ < 10) {
// Servers the load balancer reports as reachable.
List<Server> reachableServers = lb.getReachableServers();
// Every server the load balancer knows about.
List<Server> allServers = lb.getAllServers();
int upCount = reachableServers.size();
int serverCount = allServers.size();
// Give up immediately when either list is empty.
if ((upCount == 0) || (serverCount == 0)) {
log.warn("No up servers available from load balancer: " + lb);
return null;
}
// Advance the cyclic counter and index into the FULL list (not the reachable
// list), which is why liveness is re-checked below.
int nextServerIndex = incrementAndGetModulo(serverCount);
server = allServers.get(nextServerIndex);
if (server == null) {
/* Transient. */
Thread.yield();
continue;
}
// Return the server if it is alive and ready to accept requests;
// otherwise try the next one.
if (server.isAlive() && (server.isReadyToServe())) {
return (server);
}
// Next.
server = null;
}
if (count >= 10) {
log.warn("No available alive servers after 10 tries from load balancer: "
+ lb);
}
return server;
}
/**
 * Moves the shared counter to {@code (current + 1) % modulo} with a
 * compare-and-set retry loop and returns the new value.
 *
 * @param modulo number of servers to cycle over
 * @return the next index, in {@code [0, modulo)}
 */
private int incrementAndGetModulo(int modulo) {
for (;;) {
// nextServerCyclicCounter is declared outside this excerpt; per the original
// author's note it is an AtomicInteger.
int current = nextServerCyclicCounter.get();
// The modulo wraps the counter around, giving the round-robin order.
int next = (current + 1) % modulo;
// Retry if another thread changed the counter in the meantime.
if (nextServerCyclicCounter.compareAndSet(current, next))
return next;
}
}
// ......
}
2.2 RandomRule
/**
 * Random rule: repeatedly draws a random index until it yields an alive server.
 */
public class RandomRule extends AbstractLoadBalancerRule {
/**
 * Randomly choose from all living servers
 *
 * @param lb  load balancer that supplies the server lists; null yields null
 * @param key not read by this method
 * @return an alive server, or null if {@code lb} is null, the thread was
 *         interrupted, or the load balancer knows no servers
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "RCN_REDUNDANT_NULLCHECK_OF_NULL_VALUE")
public Server choose(ILoadBalancer lb, Object key) {
if (lb == null) {
return null;
}
Server server = null;
// Keep retrying; there is no attempt limit, only the interrupt check below.
while (server == null) {
if (Thread.interrupted()) {
return null;
}
// Servers the load balancer reports as reachable.
List<Server> upList = lb.getReachableServers();
// Every server the load balancer knows about.
List<Server> allList = lb.getAllServers();
int serverCount = allList.size();
// No servers at all: return null.
if (serverCount == 0) {
/*
* No servers. End regardless of pass, because subsequent passes
* only get more restrictive.
*/
return null;
}
// Draw a random index in [0, serverCount).
int index = chooseRandomInt(serverCount);
// NOTE(review): the index is bounded by allList.size() but applied to upList.
// If upList is shorter than allList, upList.get(index) can throw
// IndexOutOfBoundsException — confirm against the library version in use.
server = upList.get(index);
if (server == null) {
/*
* The only time this should happen is if the server list were
* somehow trimmed. This is a transient condition. Retry after
* yielding.
*/
Thread.yield();
continue;
}
if (server.isAlive()) {
// The server is alive: return it.
return (server);
}
// Shouldn't actually happen.. but must be transient or a bug.
server = null;
Thread.yield();
}
return server;
}
/**
 * Returns a random int in {@code [0, serverCount)} from the calling thread's
 * ThreadLocalRandom.
 */
protected int chooseRandomInt(int serverCount) {
return ThreadLocalRandom.current().nextInt(serverCount);
}
// ......
}
2.3 RetryRule
/**
 * Retry rule: asks a sub-rule for a server and, if the result is unusable, keeps
 * asking until a deadline passes or the thread is interrupted.
 */
public class RetryRule extends AbstractLoadBalancerRule {
// ......
/**
 * Chooses a server through {@code subRule}, retrying within a time budget.
 *
 * @param lb  not read by this method; selection is delegated to {@code subRule}
 * @param key passed through to {@code subRule.choose(key)}
 * @return an alive server, or null if none was obtained
 */
public Server choose(ILoadBalancer lb, Object key) {
long requestTime = System.currentTimeMillis();
// Deadline for the whole selection: now + maxRetryMillis (the original author's
// note gives 500 ms; the field is declared outside this excerpt).
long deadline = requestTime + maxRetryMillis;
Server answer = null;
answer = subRule.choose(key);
if (((answer == null) || (!answer.isAlive()))
&& (System.currentTimeMillis() < deadline)) {
// The first pick is missing or not alive, and the deadline has not passed yet.
// InterruptTask is defined elsewhere; presumably it interrupts this thread
// once the remaining time elapses — confirm.
InterruptTask task = new InterruptTask(deadline
- System.currentTimeMillis());
// Keep asking the sub-rule while the thread is not interrupted; stop as soon
// as a usable server is found or the deadline has passed.
while (!Thread.interrupted()) {
answer = subRule.choose(key);
if (((answer == null) || (!answer.isAlive()))
&& (System.currentTimeMillis() < deadline)) {
/* pause and retry hoping it's transient */
Thread.yield();
} else {
break;
}
}
// The retry loop is over, so the pending interrupt task is cancelled.
task.cancel();
}
// A missing or dead server is reported as null.
if ((answer == null) || (!answer.isAlive())) {
return null;
} else {
return answer;
}
}
// ......
}
2.4 BestAvailableRule
/**
 * Picks the server with the fewest active requests among those whose circuit
 * breaker is not tripped.
 */
public class BestAvailableRule extends ClientConfigEnabledRoundRobinRule {
// ......
/**
 * Chooses the least-loaded server according to the load balancer statistics.
 *
 * @param key only forwarded to {@code super.choose(key)} on the fallback paths
 * @return the chosen server, or whatever the parent rule returns when no
 *         statistics exist or no server qualifies
 */
@Override
public Server choose(Object key) {
// Without statistics there is nothing to compare; defer to the parent rule.
if (loadBalancerStats == null) {
return super.choose(key);
}
// All servers known to the load balancer.
List<Server> serverList = getLoadBalancer().getAllServers();
// Start with the largest possible value so the first candidate always wins.
int minimalConcurrentConnections = Integer.MAX_VALUE;
long currentTime = System.currentTimeMillis();
Server chosen = null;
// Scan every server and keep the one whose circuit breaker is not tripped and
// whose active request count is the smallest (first one wins on ties).
for (Server server: serverList) {
ServerStats serverStats = loadBalancerStats.getSingleServerStat(server);
if (!serverStats.isCircuitBreakerTripped(currentTime)) {
int concurrentConnections = serverStats.getActiveRequestsCount(currentTime);
if (concurrentConnections < minimalConcurrentConnections) {
minimalConcurrentConnections = concurrentConnections;
chosen = server;
}
}
}
if (chosen == null) {
// No server qualified above; defer to the parent rule
// (round-robin, judging by the parent class name).
return super.choose(key);
} else {
return chosen;
}
}
// ......
}
2.5 AvailabilityFilteringRule
/**
 * Round-robins over servers and returns the first one accepted by an
 * AvailabilityPredicate-based composite predicate.
 */
public class AvailabilityFilteringRule extends PredicateBasedRule {
private AbstractServerPredicate predicate;
/**
 * Rebuilds the predicate from the client configuration.
 *
 * @param clientConfig configuration handed to the AvailabilityPredicate
 */
@Override
public void initWithNiwsConfig(IClientConfig clientConfig) {
// Filter with AvailabilityPredicate; alwaysTrue() is registered as the fallback.
predicate = CompositePredicate.withPredicate(new AvailabilityPredicate(this, clientConfig))
.addFallbackPredicate(AbstractServerPredicate.alwaysTrue())
.build();
}
// ......
/**
* This method is overridden to provide a more efficient implementation which does not iterate through
* all servers. This is under the assumption that in most cases, there are more available instances
* than not.
*/
@Override
public Server choose(Object key) {
int count = 0;
Server server = roundRobinRule.choose(key);
// Tests up to 11 round-robin candidates: count runs from 0 through 10 because
// the condition is "<= 10".
while (count++ <= 10) {
// NOTE(review): server may be null here if roundRobinRule.choose returned
// null; it is wrapped in a PredicateKey without a check — confirm the
// predicate tolerates that.
if (predicate.apply(new PredicateKey(server))) {
// The predicate accepted this server: return it.
return server;
}
server = roundRobinRule.choose(key);
}
// No candidate was accepted; fall back to the parent PredicateBasedRule.choose.
return super.choose(key);
}
// ......
}
/**
 * Predicate that rejects servers whose circuit breaker is tripped or whose active
 * request count has reached the configured limit.
 */
public class AvailabilityPredicate extends AbstractServerPredicate {
// ......
/**
 * @param input key carrying the candidate server
 * @return true if the server should be kept; always true when no statistics exist
 */
@Override
public boolean apply(@Nullable PredicateKey input) {
// Load balancer statistics.
LoadBalancerStats stats = getLBStats();
if (stats == null) {
return true;
}
// Decide from the server's statistics whether it must be filtered out.
// NOTE(review): input is declared @Nullable but is dereferenced without a check.
return !shouldSkipServer(stats.getSingleServerStat(input.getServer()));
}
/**
 * @param stats statistics of the candidate server
 * @return true if the server must be filtered out
 */
private boolean shouldSkipServer(ServerStats stats) {
if ((CIRCUIT_BREAKER_FILTERING.get() && stats.isCircuitBreakerTripped())
|| stats.getActiveRequestsCount() >= activeConnectionsLimit.get()) {
// Skip when circuit-breaker filtering is enabled and the breaker is tripped,
// or when the active request count is at or above the limit.
return true;
}
return false;
}
}
2.6 WeightedResponseTimeRule
/**
 * Weighted rule: servers with a lower average response time get a larger weight
 * and are therefore picked more often. Falls back to round-robin while the
 * weights are missing or out of date.
 */
public class WeightedResponseTimeRule extends RoundRobinRule {
// ......
// holds the accumulated weight from index 0 to current index
// for example, element at index 2 holds the sum of weight of servers from 0 to 2
// Replaced as a whole by the weight-maintenance task (see maintainWeights below).
private volatile List<Double> accumulatedWeights = new ArrayList<Double>();
/**
 * Picks a server at random, proportionally to its weight.
 *
 * @param lb  load balancer that supplies the server list; null yields null
 * @param key forwarded to the round-robin fallback only
 * @return an alive server, or null if {@code lb} is null, the thread was
 *         interrupted, no servers exist, or the round-robin fallback returned null
 */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "RCN_REDUNDANT_NULLCHECK_OF_NULL_VALUE")
@Override
public Server choose(ILoadBalancer lb, Object key) {
if (lb == null) {
return null;
}
Server server = null;
while (server == null) {
// get hold of the current reference in case it is changed from the other thread
List<Double> currentWeights = accumulatedWeights;
if (Thread.interrupted()) {
return null;
}
List<Server> allList = lb.getAllServers();
int serverCount = allList.size();
if (serverCount == 0) {
return null;
}
int serverIndex = 0;
// last one in the list is the sum of all weights
double maxTotalWeight = currentWeights.size() == 0 ? 0 : currentWeights.get(currentWeights.size() - 1);
// No server has been hit yet and total weight is not initialized
// fallback to use round robin
if (maxTotalWeight < 0.001d || serverCount != currentWeights.size()) {
// Round-robin fallback, taken in two situations:
//  - total weight below 0.001: per the original author's note, the service
//    has just started and there are not enough statistics yet;
//  - server count differs from the weight list size: currentWeights may be
//    stale because the server list changed and the weight task has not
//    (re)run or not finished yet.
server = super.choose(getLoadBalancer(), key);
if(server == null) {
return server;
}
} else {
// generate a random weight between 0 (inclusive) to maxTotalWeight (exclusive)
double randomWeight = random.nextDouble() * maxTotalWeight;
// pick the server index based on the randomIndex
int n = 0;
// Walk the accumulated weights; the first entry that is >= randomWeight
// identifies the target server index.
for (Double d : currentWeights) {
if (d >= randomWeight) {
serverIndex = n;
break;
} else {
n++;
}
}
// Look the server up by that index.
server = allList.get(serverIndex);
}
if (server == null) {
/* Transient. */
Thread.yield();
continue;
}
if (server.isAlive()) {
// The server is alive: return it.
return (server);
}
// Next.
server = null;
}
return server;
}
// Timer task that recomputes the server weights. It is scheduled outside this
// excerpt; the original author's note says it runs every 30 s.
class DynamicServerWeightTask extends TimerTask {
// Runs one weight recalculation; any exception is logged, not propagated.
public void run() {
ServerWeight serverWeight = new ServerWeight();
try {
serverWeight.maintainWeights();
} catch (Exception e) {
logger.error("Error running DynamicServerWeightTask for {}", name, e);
}
}
}
class ServerWeight {
/**
 * Recomputes the accumulated weight list from the current per-server average
 * response times and publishes it through {@code setWeights}. Does nothing if
 * there is no load balancer, no statistics, or a recalculation is already running.
 */
public void maintainWeights() {
ILoadBalancer lb = getLoadBalancer();
if (lb == null) {
return;
}
// Skip this run if another weight calculation is already in progress.
if (!serverWeightAssignmentInProgress.compareAndSet(false, true)) {
return;
}
try {
logger.info("Weight adjusting job started");
AbstractLoadBalancer nlb = (AbstractLoadBalancer) lb;
// Load balancer statistics.
LoadBalancerStats stats = nlb.getLoadBalancerStats();
if (stats == null) {
// no statistics, nothing to do
return;
}
double totalResponseTime = 0;
// Sum the average response time of every server.
for (Server server : nlb.getAllServers()) {
// this will automatically load the stats if not in cache
ServerStats ss = stats.getSingleServerStat(server);
totalResponseTime += ss.getResponseTimeAvg();
}
// weight for each server is (sum of responseTime of all servers - responseTime)
// so that the longer the response time, the less the weight and the less likely to be chosen
Double weightSoFar = 0.0;
// create new list and hot swap the reference
List<Double> finalWeights = new ArrayList<Double>();
for (Server server : nlb.getAllServers()) {
ServerStats ss = stats.getSingleServerStat(server);
// weight = total of all averages - this server's average
double weight = totalResponseTime - ss.getResponseTimeAvg();
// accumulated weight = sum of the weights of all servers up to and
// including this one
weightSoFar += weight;
finalWeights.add(weightSoFar);
}
setWeights(finalWeights);
} catch (Exception e) {
logger.error("Error calculating server weights", e);
} finally {
// Always release the in-progress flag, including on the early return above.
serverWeightAssignmentInProgress.set(false);
}
}
}
// ......
}
假如有3个服务实例,平均响应时间分别为 A=30ms , B=100ms , C=50ms
| | A | B | C |
|---|---|---|---|
| 平均响应时间(ms) | 30 | 100 | 50 |
| weight 权重 | 150 | 80 | 130 |
| weightSoFar 权重积累 | 150 | 230 | 360 |
| 区间范围 | [0,150] | (150,230] | (230,360) |
如果在 [0,360) 区间获取到随机数 randomWeight=180 ,权重积累列表中第一个大于等于 randomWeight 的值是 230 ,则获取到的服务实例为 B
综上所述,服务实例平均响应时间越小,权重越大,被选中的概率越高
2.7 ZoneAvoidanceRule
/**
 * Rule that combines a zone-avoidance predicate with an availability predicate.
 */
public class ZoneAvoidanceRule extends PredicateBasedRule {
// ......
/** Builds the composite predicate from a zone predicate and an availability predicate. */
public ZoneAvoidanceRule() {
super();
ZoneAvoidancePredicate zonePredicate = new ZoneAvoidancePredicate(this);
AvailabilityPredicate availabilityPredicate = new AvailabilityPredicate(this);
// Register both predicates as the primary filter, plus the fallbacks.
compositePredicate = createCompositePredicate(zonePredicate, availabilityPredicate);
}
/**
 * Combines p1 and p2 as the primary filter, with two fallbacks registered in
 * order: p2 alone, then alwaysTrue().
 */
private CompositePredicate createCompositePredicate(ZoneAvoidancePredicate p1, AvailabilityPredicate p2) {
return CompositePredicate.withPredicates(p1, p2)
.addFallbackPredicate(p2)
.addFallbackPredicate(AbstractServerPredicate.alwaysTrue())
.build();
}
// ......
}
ZoneAvoidanceRule 未实现 choose() 方法,实现该方法的是它的父类 PredicateBasedRule
/**
 * Base rule that filters servers with a predicate supplied by the subclass and
 * then picks one of the remaining servers.
 */
public abstract class PredicateBasedRule extends ClientConfigEnabledRoundRobinRule {
/** Supplies the predicate used to filter servers. */
public abstract AbstractServerPredicate getPredicate();
/**
 * @param key load balancer key forwarded to the predicate
 * @return the selected server, or null if the predicate left no server
 */
@Override
public Server choose(Object key) {
ILoadBalancer lb = getLoadBalancer();
// Section 2.7.1: filter all servers with the predicate, then pick one.
// NOTE(review): lb is dereferenced without a null check.
Optional<Server> server = getPredicate().chooseRoundRobinAfterFiltering(lb.getAllServers(), key);
if (server.isPresent()) {
return server.get();
} else {
return null;
}
}
}
2.7.1 chooseRoundRobinAfterFiltering()
public abstract class AbstractServerPredicate implements Predicate<PredicateKey> {
// ......
/**
 * Filters the given servers and picks one of the survivors in round-robin order.
 *
 * @param servers         candidate servers
 * @param loadBalancerKey key forwarded to the filtering step
 * @return the selected server, or an absent Optional if none survived the filter
 */
public Optional<Server> chooseRoundRobinAfterFiltering(List<Server> servers, Object loadBalancerKey) {
// Section 2.7.2: obtain the servers that pass the filter.
List<Server> eligible = getEligibleServers(servers, loadBalancerKey);
if (eligible.size() == 0) {
return Optional.absent();
}
// Pick one of them using this predicate's own cyclic index.
return Optional.of(eligible.get(incrementAndGetModulo(eligible.size())));
}
// ......
}
2.7.2 getEligibleServers()
public class CompositePredicate extends AbstractServerPredicate {
// ......
/**
 * Filters with the primary predicate first. While the result is too small — fewer
 * than {@code minimalFilteredServers} entries, or not more than
 * {@code servers.size() * minimalFilteredPercentage} (truncated to int) — the
 * fallback predicates are tried in order, each against the ORIGINAL server list,
 * and the latest result replaces the previous one.
 *
 * @param servers         candidate servers
 * @param loadBalancerKey key forwarded to each predicate
 * @return the first result that is large enough, or the last fallback's result
 */
@Override
public List<Server> getEligibleServers(List<Server> servers, Object loadBalancerKey) {
// Section 2.7.3: filtering by the primary predicate.
List<Server> result = super.getEligibleServers(servers, loadBalancerKey);
Iterator<AbstractServerPredicate> i = fallbacks.iterator();
while (!(result.size() >= minimalFilteredServers && result.size() > (int) (servers.size() * minimalFilteredPercentage))
&& i.hasNext()) {
AbstractServerPredicate predicate = i.next();
result = predicate.getEligibleServers(servers, loadBalancerKey);
}
return result;
}
}
2.7.3 super.getEligibleServers()
// AbstractServerPredicate.class
/**
 * Returns the servers accepted by this predicate.
 *
 * @param servers         candidate servers
 * @param loadBalancerKey key to pair with each server; when null, the
 *                        server-only predicate is used instead
 * @return the accepted servers (an immutable copy on the null-key path, a new
 *         mutable list otherwise)
 */
public List<Server> getEligibleServers(List<Server> servers, Object loadBalancerKey) {
if (loadBalancerKey == null) {
return ImmutableList.copyOf(Iterables.filter(servers, this.getServerOnlyPredicate()));
} else {
List<Server> results = Lists.newArrayList();
// Iterate over the servers and keep those this predicate accepts.
for (Server server: servers) {
if (this.apply(new PredicateKey(loadBalancerKey, server))) {
results.add(server);
}
}
return results;
}
}
// CompositePredicate.class
// Forwards the decision to the wrapped delegate predicate.
public boolean apply(@Nullable PredicateKey input) {
return delegate.apply(input);
}
// AbstractServerPredicate.class
/**
 * Wraps a plain {@code Predicate<PredicateKey>} in an AbstractServerPredicate whose
 * {@code apply} forwards to it.
 *
 * @param p predicate to wrap
 * @return an AbstractServerPredicate delegating to {@code p}
 */
public static AbstractServerPredicate ofKeyPredicate(final Predicate<PredicateKey> p) {
return new AbstractServerPredicate() {
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP")
public boolean apply(PredicateKey input) {
return p.apply(input);
}
};
}
// Predicates.AndPredicate.class
// Logical AND over all component predicates: returns false at the first component
// that rejects t, true if every component accepts it.
public boolean apply(@Nullable T t) {
// Section 2.7.4: for ZoneAvoidanceRule the components are ZoneAvoidancePredicate
// and AvailabilityPredicate.
for(int i = 0; i < this.components.size(); ++i) {
if(!((Predicate)this.components.get(i)).apply(t)) {
return false;
}
}
return true;
}
2.7.4 ZoneAvoidancePredicate 和 AvailabilityPredicate
2.7.4.1 ZoneAvoidancePredicate
/**
 * Predicate that rejects servers located in a zone currently judged unavailable.
 */
public class ZoneAvoidancePredicate extends AbstractServerPredicate {
// ......
/**
 * @param input key carrying the candidate server
 * @return true if the server should be kept; false only if its zone is not among
 *         the zones returned by ZoneAvoidanceRule.getAvailableZones, or that
 *         call returned null
 */
@Override
public boolean apply(@Nullable PredicateKey input) {
// Zone filtering switched off: keep every server.
if (!ENABLED.get()) {
return true;
}
// Zone of the candidate server.
// NOTE(review): input is declared @Nullable but is dereferenced without a check.
String serverZone = input.getServer().getZone();
if (serverZone == null) {
// there is no zone information from the server, we do not want to filter
// out this server
return true;
}
// Load balancer statistics.
LoadBalancerStats lbStats = getLBStats();
if (lbStats == null) {
// no stats available, do not filter
return true;
}
if (lbStats.getAvailableZones().size() <= 1) {
// only one zone is available, do not filter
return true;
}
// Build a per-zone snapshot from the statistics.
Map<String, ZoneSnapshot> zoneSnapshot = ZoneAvoidanceRule.createSnapshot(lbStats);
if (!zoneSnapshot.keySet().contains(serverZone)) {
// The snapshot has no entry for this server's zone (the statistics may not
// have recorded that zone yet), so the server is kept.
// The server zone is unknown to the load balancer, do not filter it out
return true;
}
logger.debug("Zone snapshots: {}", zoneSnapshot);
// Section 2.7.4.1.1: drop bad zones according to the two thresholds.
Set<String> availableZones = ZoneAvoidanceRule.getAvailableZones(zoneSnapshot, triggeringLoad.get(), triggeringBlackoutPercentage.get());
logger.debug("Available zones: {}", availableZones);
if (availableZones != null) {
// Keep the server only if its zone survived the filtering.
return availableZones.contains(input.getServer().getZone());
} else {
return false;
}
}
}
2.7.4.1.1 ZoneAvoidanceRule.getAvailableZones()
// ZoneAvoidanceRule.class
/**
 * Computes the set of zones still considered usable.
 *
 * @param snapshot                     per-zone statistics keyed by zone name
 * @param triggeringLoad               if the highest per-server load is below this
 *                                     and no zone was removed, nothing more is dropped
 * @param triggeringBlackoutPercentage a zone is removed when its ratio of
 *                                     circuit-tripped instances is at or above this
 * @return the remaining zone names, or null if {@code snapshot} is empty
 */
public static Set<String> getAvailableZones(
Map<String, ZoneSnapshot> snapshot, double triggeringLoad,
double triggeringBlackoutPercentage) {
if (snapshot.isEmpty()) {
return null;
}
Set<String> availableZones = new HashSet<String>(snapshot.keySet());
if (availableZones.size() == 1) {
// Only one zone: return it without filtering.
return availableZones;
}
Set<String> worstZones = new HashSet<String>();
double maxLoadPerServer = 0;
boolean limitedZoneAvailability = false;
for (Map.Entry<String, ZoneSnapshot> zoneEntry : snapshot.entrySet()) {
String zone = zoneEntry.getKey();
ZoneSnapshot zoneSnapshot = zoneEntry.getValue();
int instanceCount = zoneSnapshot.getInstanceCount();
if (instanceCount == 0) {
// A zone with no instances is treated as bad and removed.
availableZones.remove(zone);
limitedZoneAvailability = true;
} else {
double loadPerServer = zoneSnapshot.getLoadPerServer();
if (((double) zoneSnapshot.getCircuitTrippedCount())
/ instanceCount >= triggeringBlackoutPercentage
|| loadPerServer < 0) {
// The zone is treated as bad and removed if either holds:
// 1. its circuit-tripped ratio is at or above triggeringBlackoutPercentage
//    (0.99999 according to the original author's note);
// 2. its per-server load is negative (per the original author's note this
//    means every instance is circuit-tripped).
availableZones.remove(zone);
limitedZoneAvailability = true;
} else {
if (Math.abs(loadPerServer - maxLoadPerServer) < 0.000001d) {
// they are the same considering double calculation
// round error
// The absolute DIFFERENCE from the current maximum is below 0.000001,
// so this zone ties with the worst one and joins the worst-zone set.
worstZones.add(zone);
} else if (loadPerServer > maxLoadPerServer) {
// New maximum per-server load: record it and restart the worst-zone
// set with only this zone.
maxLoadPerServer = loadPerServer;
worstZones.clear();
worstZones.add(zone);
}
}
}
}
if (maxLoadPerServer < triggeringLoad && !limitedZoneAvailability) {
// zone override is not needed here
// The highest per-server load is below triggeringLoad (0.2 according to the
// original author's note) and no zone was removed: keep all zones.
return availableZones;
}
// randomChooseZone is defined elsewhere; presumably it picks one of worstZones
// at random — confirm.
String zoneToAvoid = randomChooseZone(snapshot, worstZones);
if (zoneToAvoid != null) {
// Remove the zone it selected.
availableZones.remove(zoneToAvoid);
}
return availableZones;
}
2.7.4.2 AvailabilityPredicate
上面已分析(见 2.5 AvailabilityFilteringRule)