上篇讲到的是服务正常下线的情况。如果遇到网络不稳定或机器宕机,导致 server 一直接收不到某个服务实例的心跳,这时就可以认为该服务实例已经发生故障,需要将其从注册表中剔除。
在EurekaServer启动流程一节初始化eureka server上下文中,有这样一行代码
this.registry.openForTraffic(this.applicationInfoManager, registryCount);
这行代码最终会创建并启动一个定时剔除过期服务的定时任务。调用链为:子类的 openForTraffic → 父类的 openForTraffic → postInit(),定时任务就是在 postInit() 中创建的,下面依次来看:
/**
 * Opens the registry for traffic, substituting a fallback instance count when the
 * caller reports none.
 *
 * @param applicationInfoManager forwarded unchanged to the parent implementation
 * @param count                  instance count; {@code 0} is replaced by
 *                               {@code defaultOpenForTrafficCount}
 */
@Override
public void openForTraffic(ApplicationInfoManager applicationInfoManager, int count) {
    // A count of 0 is swapped for the configured fallback before delegating.
    // NOTE(review): the accompanying article says the fallback is 1 by default;
    // the field's value is not visible here - confirm.
    int effectiveCount = (count == 0) ? this.defaultOpenForTrafficCount : count;
    super.openForTraffic(applicationInfoManager, effectiveCount);
}
/**
 * Prepares the registry to serve traffic: derives the renewal expectations from the
 * given instance count, records the startup time, optionally primes AWS replica
 * connections, marks this instance as UP and finally runs the parent {@code postInit()}.
 *
 * @param applicationInfoManager supplies this node's data center info and receives the UP status
 * @param count                  instance count used as the basis for the renewal thresholds
 *                               (logged as the number of instances obtained from the
 *                               neighboring node)
 */
@Override
public void openForTraffic(ApplicationInfoManager applicationInfoManager, int count) {
    // Expected renewals per minute: twice the instance count.
    // NOTE(review): presumably one heartbeat every 30s per instance - confirm.
    this.expectedNumberOfRenewsPerMin = count * 2;

    // Renewal threshold per minute: the expected number scaled by the configured percentage.
    this.numberOfRenewsPerMinThreshold =
            (int) (this.expectedNumberOfRenewsPerMin * serverConfig.getRenewalPercentThreshold());

    logger.info("Got {} instances from neighboring DS node", count);
    logger.info("Renew threshold is: {}", numberOfRenewsPerMinThreshold);

    this.startupTime = System.currentTimeMillis();

    // At least one instance was handed over, so the startup transfer was not empty.
    if (count > 0) {
        this.peerInstancesTransferEmptyOnStartup = false;
    }

    DataCenterInfo.Name dataCenterName =
            applicationInfoManager.getInfo().getDataCenterInfo().getName();
    // Replica connections are primed only on Amazon data centers, and only when configured.
    if (dataCenterName == Name.Amazon && serverConfig.shouldPrimeAwsReplicaConnections()) {
        logger.info("Priming AWS connections for all replicas..");
        primeAwsReplicas(applicationInfoManager);
    }

    logger.info("Changing status to UP");
    // Mark this instance as UP.
    applicationInfoManager.setInstanceStatus(InstanceStatus.UP);

    // Parent initialization; the periodic eviction task is created in there.
    super.postInit();
}
/**
 * Starts the renewal-rate sampler and (re)schedules the periodic eviction task.
 * Any eviction task registered earlier is cancelled before the new one is scheduled.
 */
protected void postInit() {
    // Start sampling renewals; the sampler periodically moves the running count
    // into its "last interval" slot.
    renewsLastMin.start();

    // Cancel a previously registered eviction task, if there is one.
    TimerTask previousTask = evictionTaskRef.get();
    if (previousTask != null) {
        previousTask.cancel();
    }

    // Register a fresh eviction task and schedule it with identical initial delay and period.
    // NOTE(review): the accompanying article says the interval defaults to 60s;
    // the default is not visible here - confirm.
    EvictionTask freshTask = new EvictionTask();
    evictionTaskRef.set(freshTask);
    long initialDelayMs = serverConfig.getEvictionIntervalTimerInMs();
    long periodMs = serverConfig.getEvictionIntervalTimerInMs();
    evictionTimer.schedule(freshTask, initialDelayMs, periodMs);
}
先看下最近一分钟的续约数是如何计算的,也就是上面 renewsLastMin.start() 所调用的 MeasuredRate#start():
/**
 * Counts events in fixed-length sample windows.
 *
 * <p>{@link #increment()} adds to the window in progress. Once started, a timer fires every
 * {@code sampleInterval} milliseconds, publishes the running total as the value returned by
 * {@link #getCount()} and resets the running total to zero.
 */
public class MeasuredRate {
    private static final Logger logger = LoggerFactory.getLogger(MeasuredRate.class);

    /** Total of the most recently completed sample window (the "last minute" renewals in the article). */
    private final AtomicLong lastBucket = new AtomicLong(0);

    /** Running total of the sample window currently in progress. */
    private final AtomicLong currentBucket = new AtomicLong(0);

    /** Window length in milliseconds; used as both the initial delay and the period of the timer. */
    private final long sampleInterval;

    /** Daemon timer that drives the window rollover. */
    private final Timer timer;

    /** Whether the rollover task has been scheduled and not stopped since. */
    private volatile boolean isActive;

    /**
     * @param sampleInterval in milliseconds
     */
    public MeasuredRate(long sampleInterval) {
        this.sampleInterval = sampleInterval;
        this.timer = new Timer("Eureka-MeasureRateTimer", true);
        this.isActive = false;
    }

    /**
     * Schedules the periodic rollover task; does nothing when already active.
     *
     * <p>NOTE(review): {@link #stop()} cancels the underlying {@link Timer}, and a cancelled
     * {@code java.util.Timer} rejects further scheduling, so calling this after
     * {@code stop()} presumably throws {@link IllegalStateException} - confirm.
     */
    public synchronized void start() {
        if (isActive) {
            return;
        }
        timer.schedule(newRolloverTask(), sampleInterval, sampleInterval);
        isActive = true;
    }

    /** Cancels the timer; does nothing when not active. */
    public synchronized void stop() {
        if (!isActive) {
            return;
        }
        timer.cancel();
        isActive = false;
    }

    /**
     * Returns the count in the last sample interval.
     */
    public long getCount() {
        return lastBucket.get();
    }

    /**
     * Adds one to the running total of the current window. According to the article this
     * is invoked on every lease renewal.
     */
    public void increment() {
        currentBucket.incrementAndGet();
    }

    /** Builds the task that publishes the running total and resets it to zero. */
    private TimerTask newRolloverTask() {
        return new TimerTask() {
            @Override
            public void run() {
                try {
                    // Atomically take the running total and clear it, then publish it.
                    long finishedWindowTotal = currentBucket.getAndSet(0);
                    lastBucket.set(finishedWindowTotal);
                } catch (Throwable e) {
                    logger.error("Cannot reset the Measured Rate", e);
                }
            }
        };
    }
}
再来看服务过期剔除任务 EvictionTask,该任务默认每隔 60s 执行一次:
/**
 * Timer task that runs {@code evict(long)}, passing along how far the current run is
 * behind the configured eviction interval.
 */
class EvictionTask extends TimerTask {

    /** {@code getCurrentTimeNano()} reading of the previous run; stays 0 until the first run. */
    private final AtomicLong lastExecutionNanosRef = new AtomicLong(0L);

    /**
     * Computes the compensation time and triggers an eviction pass. Any {@link Throwable}
     * is logged and not rethrown.
     */
    @Override
    public void run() {
        try {
            long extraLeaseMs = getCompensationTimeMs();
            logger.info("Running the evict task with compensationTime {}ms", extraLeaseMs);
            evict(extraLeaseMs);
        } catch (Throwable e) {
            logger.error("Could not run the evict task", e);
        }
    }

    /**
     * Returns by how many milliseconds the time since the previous run exceeds the configured
     * eviction interval. The result is 0 on the first run and never negative.
     */
    long getCompensationTimeMs() {
        long nowNanos = getCurrentTimeNano();
        long previousNanos = lastExecutionNanosRef.getAndSet(nowNanos);
        if (previousNanos == 0L) {
            // First run: nothing to compare against.
            return 0L;
        }
        long sinceLastRunMs = TimeUnit.NANOSECONDS.toMillis(nowNanos - previousNanos);
        long overrunMs = sinceLastRunMs - serverConfig.getEvictionIntervalTimerInMs();
        return Math.max(0L, overrunMs);
    }

    long getCurrentTimeNano() { // for testing
        return System.nanoTime();
    }
}
/**
 * Evicts expired leases from the registry, capped per run so that a single pass cannot
 * remove more than a bounded share of the registered instances.
 *
 * @param additionalLeaseMs extra grace period in milliseconds handed to
 *                          {@code Lease.isExpired(long)} for every lease
 */
public void evict(long additionalLeaseMs) {
    logger.debug("Running the evict task");

    // Nothing is evicted while lease expiration is disabled.
    // NOTE(review): the accompanying article describes this check as effectively the
    // self-preservation switch; its implementation is not visible here - confirm.
    if (!isLeaseExpirationEnabled()) {
        logger.debug("DS: lease expiration is currently disabled.");
        return;
    }

    // Collect all expired leases first so they can be evicted in random order. For large
    // eviction sets, evicting in registry order could wipe out whole apps before self
    // preservation kicks in; randomizing spreads the impact across applications.
    // NOTE(review): the article points out that Lease.isExpired effectively waits about
    // twice the lease duration (180s instead of 90s) because renewal already adds the
    // duration to the last-update timestamp, and that upstream left this unfixed as
    // low-impact. That logic lives in Lease and is not visible here - confirm.
    List<Lease<InstanceInfo>> expired = new ArrayList<>();
    for (Map<String, Lease<InstanceInfo>> leasesOfApp : registry.values()) {
        if (leasesOfApp == null) {
            continue;
        }
        for (Lease<InstanceInfo> candidate : leasesOfApp.values()) {
            if (candidate.isExpired(additionalLeaseMs) && candidate.getHolder() != null) {
                expired.add(candidate);
            }
        }
    }
    int expiredCount = expired.size();

    // To compensate for GC pauses or drifting local time, the current registry size is the
    // base for the cap; without it the full registry could be wiped out.
    int registrySize = (int) getLocalRegistrySize();
    // Number of instances that must remain after this pass: registry size scaled by the
    // renewal percentage (the article cites 0.85 as the default).
    int registrySizeThreshold = (int) (registrySize * serverConfig.getRenewalPercentThreshold());
    // Upper bound on evictions in this pass (roughly 15% of the registry with 0.85).
    int evictionLimit = registrySize - registrySizeThreshold;

    int toEvict = Math.min(expiredCount, evictionLimit);
    if (toEvict <= 0) {
        return;
    }

    logger.info("Evicting {} items (expired={}, evictionLimit={})", toEvict, expiredCount, evictionLimit);

    // Partial shuffle: each step swaps a randomly chosen not-yet-picked lease into slot
    // `slot` and evicts it, so exactly `toEvict` leases are chosen at random.
    Random random = new Random(System.currentTimeMillis());
    for (int slot = 0; slot < toEvict; slot++) {
        // Pick a random item (Knuth shuffle algorithm)
        int pick = slot + random.nextInt(expiredCount - slot);
        Collections.swap(expired, slot, pick);

        InstanceInfo holder = expired.get(slot).getHolder();
        String appName = holder.getAppName();
        String id = holder.getId();

        EXPIRED.increment();
        logger.warn("DS: Registry: expired lease for {}/{}", appName, id);
        // Take the instance out of service; the article notes this is the same path as a
        // regular cancel.
        internalCancel(appName, id, false);
    }
}