目录
前言
本章节与服务注册息息相关,其中部分方法的源码详解放在了服务注册章节中,建议先阅读服务注册章节,再阅读本章节。
客户端心跳发送
com.alibaba.nacos.client.naming.beat.BeatReactor#addBeatInfo
/**
 * Stores the heartbeat descriptor for an instance and schedules its first beat.
 *
 * <p>If a beat is already stored under the same key, it is removed and marked
 * as stopped before the new one is stored (fix #1733).
 *
 * @param serviceName name of the service the instance belongs to
 * @param beatInfo    heartbeat descriptor (IP, port, period, ...) of the instance
 */
public void addBeatInfo(String serviceName, BeatInfo beatInfo) {
    NAMING_LOGGER.info("[BEAT] adding beat: {} to beat map.", beatInfo);

    // Key derived from the service name plus the instance's IP and port.
    final String beatKey = buildKey(serviceName, beatInfo.getIp(), beatInfo.getPort());

    // fix #1733: flag the beat being replaced as stopped.
    final BeatInfo previous = dom2Beat.remove(beatKey);
    if (previous != null) {
        previous.setStopped(true);
    }
    dom2Beat.put(beatKey, beatInfo);

    // One-shot delayed task; the delay is the beat period in milliseconds.
    executorService.schedule(new BeatTask(beatInfo), beatInfo.getPeriod(), TimeUnit.MILLISECONDS);

    MetricsMonitor.getDom2BeatSizeMonitor().set(dom2Beat.size());
}
定时发送心跳包的核心类为BeatTask类。该类实现了Runnable接口,具体实现的源码路径为
com.alibaba.nacos.client.naming.beat.BeatReactor.BeatTask#run
/**
 * Sends one heartbeat for {@code beatInfo} and schedules the next one.
 *
 * <p>Flow:
 * <ol>
 *   <li>If the beat has been stopped, return without sending or rescheduling.</li>
 *   <li>Send the beat, then read {@code clientBeatInterval}, the light-beat flag
 *       and the response code from the server's reply.</li>
 *   <li>If the server answers {@code RESOURCE_NOT_FOUND}, re-register the
 *       instance from the data held in {@code beatInfo} (best effort).</li>
 *   <li>Always schedule the next {@code BeatTask}, even when sending failed,
 *       so that one bad round-trip cannot permanently stop the heartbeat.</li>
 * </ol>
 */
@Override
public void run() {
    // A stopped beat ends the chain: nothing is sent and nothing is rescheduled.
    if (beatInfo.isStopped()) {
        return;
    }
    // Fall back to the locally configured period unless the server supplies one.
    long nextTime = beatInfo.getPeriod();
    try {
        // The current light-beat flag is passed along with the beat; the reply
        // decides the flag used for the next beat.
        JSONObject result = serverProxy.sendBeat(beatInfo, BeatReactor.this.lightBeatEnabled);
        long interval = result.getIntValue("clientBeatInterval");
        boolean lightBeatEnabled = false;
        if (result.containsKey(CommonParams.LIGHT_BEAT_ENABLED)) {
            lightBeatEnabled = result.getBooleanValue(CommonParams.LIGHT_BEAT_ENABLED);
        }
        BeatReactor.this.lightBeatEnabled = lightBeatEnabled;
        if (interval > 0) {
            nextTime = interval;
        }
        // A reply without a code is treated as OK.
        int code = NamingResponseCode.OK;
        if (result.containsKey(CommonParams.CODE)) {
            code = result.getIntValue(CommonParams.CODE);
        }
        // The server does not know this instance: register it again.
        if (code == NamingResponseCode.RESOURCE_NOT_FOUND) {
            Instance instance = new Instance();
            instance.setPort(beatInfo.getPort());
            instance.setIp(beatInfo.getIp());
            instance.setWeight(beatInfo.getWeight());
            instance.setMetadata(beatInfo.getMetadata());
            instance.setClusterName(beatInfo.getCluster());
            instance.setServiceName(beatInfo.getServiceName());
            // NOTE(review): writes the instance's own id back to itself; kept as in the original.
            instance.setInstanceId(instance.getInstanceId());
            instance.setEphemeral(true);
            try {
                serverProxy.registerService(beatInfo.getServiceName(),
                        NamingUtils.getGroupName(beatInfo.getServiceName()), instance);
            } catch (Exception e) {
                // Best effort: the next beat will try again. Log the failure
                // instead of discarding it silently.
                NAMING_LOGGER.error("[CLIENT-BEAT] failed to re-register instance: {}, msg: {}",
                        JSON.toJSONString(beatInfo), e.getMessage());
            }
        }
    } catch (NacosException ne) {
        NAMING_LOGGER.error("[CLIENT-BEAT] failed to send beat: {}, code: {}, msg: {}",
                JSON.toJSONString(beatInfo), ne.getErrCode(), ne.getErrMsg());
    } catch (Exception unknownEx) {
        // Anything unexpected (e.g. a null or malformed response) must not
        // escape, otherwise the next beat would never be scheduled.
        NAMING_LOGGER.error("[CLIENT-BEAT] failed to send beat: {}, unknown exception msg: {}",
                JSON.toJSONString(beatInfo), unknownEx.getMessage(), unknownEx);
    } finally {
        // Re-arm the heartbeat whatever happened above.
        executorService.schedule(new BeatTask(beatInfo), nextTime, TimeUnit.MILLISECONDS);
    }
}
服务端心跳处理
com.alibaba.nacos.naming.controllers.InstanceController#beat
/**
 * Handles a client heartbeat request.
 *
 * <p>The optional {@code beat} parameter carries a JSON-encoded {@code RsInfo}.
 * When it is blank, only {@code ip}, {@code port} and the cluster name from the
 * request are used to identify the instance. When the instance is unknown:
 * <ul>
 *   <li>without a beat payload, the reply carries {@code RESOURCE_NOT_FOUND};</li>
 *   <li>with a beat payload, the instance is registered from that payload.</li>
 * </ul>
 *
 * @param request incoming HTTP request
 * @return JSON reply with the response code, the beat interval and the light-beat flag
 * @throws Exception if a required parameter is missing, the service cannot be
 *                   found, or a downstream call fails
 */
public JSONObject beat(HttpServletRequest request) throws Exception {
    final JSONObject response = new JSONObject();
    // Global default interval; replaced by the instance's own value on success.
    response.put("clientBeatInterval", switchDomain.getClientBeatInterval());

    final String serviceName = WebUtils.required(request, CommonParams.SERVICE_NAME);
    final String namespaceId = WebUtils.optional(request, CommonParams.NAMESPACE_ID,
            Constants.DEFAULT_NAMESPACE_ID);
    String clusterName = WebUtils.optional(request, CommonParams.CLUSTER_NAME,
            UtilsAndCommons.DEFAULT_CLUSTER_NAME);
    String ip = WebUtils.optional(request, "ip", StringUtils.EMPTY);
    int port = Integer.parseInt(WebUtils.optional(request, "port", "0"));
    final String beatJson = WebUtils.optional(request, "beat", StringUtils.EMPTY);

    // A blank "beat" parameter means no payload was sent; otherwise parse it.
    RsInfo clientBeat = StringUtils.isNotBlank(beatJson)
            ? JSON.parseObject(beatJson, RsInfo.class)
            : null;

    if (clientBeat != null) {
        // The payload overrides the identifying values read from the request.
        if (StringUtils.isNotBlank(clientBeat.getCluster())) {
            clusterName = clientBeat.getCluster();
        } else {
            // fix #2533: fill in the cluster when the payload leaves it blank.
            clientBeat.setCluster(clusterName);
        }
        ip = clientBeat.getIp();
        port = clientBeat.getPort();
    }

    if (Loggers.SRV_LOG.isDebugEnabled()) {
        Loggers.SRV_LOG.debug("[CLIENT-BEAT] full arguments: beat: {}, serviceName: {}", clientBeat, serviceName);
    }

    Instance instance = serviceManager.getInstance(namespaceId, serviceName, clusterName, ip, port);
    if (instance == null) {
        if (clientBeat == null) {
            // No payload to register from: tell the client the instance is unknown.
            response.put(CommonParams.CODE, NamingResponseCode.RESOURCE_NOT_FOUND);
            return response;
        }
        // Unknown instance but a payload is available: register it from the payload.
        instance = new Instance();
        instance.setPort(clientBeat.getPort());
        instance.setIp(clientBeat.getIp());
        instance.setWeight(clientBeat.getWeight());
        instance.setMetadata(clientBeat.getMetadata());
        instance.setClusterName(clusterName);
        instance.setServiceName(serviceName);
        instance.setInstanceId(instance.getInstanceId());
        instance.setEphemeral(clientBeat.isEphemeral());
        serviceManager.registerInstance(namespaceId, serviceName, instance);
    }

    final Service service = serviceManager.getService(namespaceId, serviceName);
    if (service == null) {
        throw new NacosException(NacosException.SERVER_ERROR,
                "service not found: " + serviceName + "@" + namespaceId);
    }

    // Without a payload, build a minimal beat from the request parameters.
    final RsInfo effectiveBeat;
    if (clientBeat != null) {
        effectiveBeat = clientBeat;
    } else {
        effectiveBeat = new RsInfo();
        effectiveBeat.setIp(ip);
        effectiveBeat.setPort(port);
        effectiveBeat.setCluster(clusterName);
    }

    // Hand the beat to the service for processing.
    service.processClientBeat(effectiveBeat);

    response.put(CommonParams.CODE, NamingResponseCode.OK);
    response.put("clientBeatInterval", instance.getInstanceHeartBeatInterval());
    response.put(SwitchEntry.LIGHT_BEAT_ENABLED, switchDomain.isLightBeatEnabled());
    return response;
}
com.alibaba.nacos.naming.core.Service#processClientBeat
/**
 * Wraps the beat in a {@code ClientBeatProcessor} bound to this service and
 * submits it through {@code HealthCheckReactor.scheduleNow}.
 *
 * @param rsInfo heartbeat data received from the client
 */
public void processClientBeat(final RsInfo rsInfo) {
    final ClientBeatProcessor beatProcessor = new ClientBeatProcessor();
    beatProcessor.setService(this);
    beatProcessor.setRsInfo(rsInfo);

    HealthCheckReactor.scheduleNow(beatProcessor);
}
这里通过ScheduledExecutorService的schedule方法来处理心跳,delay设置为0,也就是立即执行,具体逻辑源码如下
com.alibaba.nacos.naming.healthcheck.ClientBeatProcessor#run
/**
 * Applies the received heartbeat to the matching instance of the service.
 *
 * <p>Looks up the cluster named in the beat, walks the instances returned by
 * {@code cluster.allIPs(true)} and, for every instance whose IP and port match
 * the beat, refreshes its last-beat timestamp. An unmarked instance that was
 * unhealthy is switched back to healthy and a service-changed push is issued.
 *
 * <p>If the service has no cluster with the beat's cluster name, the beat is
 * ignored rather than failing with a {@code NullPointerException}.
 */
public void run() {
    Service service = this.service;
    if (Loggers.EVT_LOG.isDebugEnabled()) {
        Loggers.EVT_LOG.debug("[CLIENT-BEAT] processing beat: {}", rsInfo.toString());
    }
    String ip = rsInfo.getIp();
    String clusterName = rsInfo.getCluster();
    int port = rsInfo.getPort();
    // Cluster the beating instance claims to belong to.
    Cluster cluster = service.getClusterMap().get(clusterName);
    if (cluster == null) {
        // Unknown cluster: there is nothing to refresh.
        if (Loggers.EVT_LOG.isDebugEnabled()) {
            Loggers.EVT_LOG.debug("[CLIENT-BEAT] cluster not found, beat ignored: {}", rsInfo.toString());
        }
        return;
    }
    // Candidate instances of that cluster.
    List<Instance> instances = cluster.allIPs(true);
    for (Instance instance : instances) {
        // Only the instance(s) matching the beat's IP and port are touched.
        if (instance.getIp().equals(ip) && instance.getPort() == port) {
            if (Loggers.EVT_LOG.isDebugEnabled()) {
                Loggers.EVT_LOG.debug("[CLIENT-BEAT] refresh beat: {}", rsInfo.toString());
            }
            // Record "now" as the time of the most recent beat.
            instance.setLastBeat(System.currentTimeMillis());
            if (!instance.isMarked()) {
                // An instance that was unhealthy becomes healthy again.
                if (!instance.isHealthy()) {
                    instance.setHealthy(true);
                    Loggers.EVT_LOG.info("service: {} {POS} {IP-ENABLED} valid: {}:{}@{}, region: {}, msg: client beat ok",
                            cluster.getService().getName(), ip, port, cluster.getName(), UtilsAndCommons.LOCALHOST_SITE);
                    // Notify the push service that the service changed.
                    getPushService().serviceChanged(service);
                }
            }
        }
    }
}
服务端健康检查
服务端的健康检查源码在服务注册的逻辑里。服务端收到客户端的注册请求后,只有在获取不到相应的service、并创建完空的service之后,才会开启健康检查的任务,具体的源码路径为
com.alibaba.nacos.naming.core.ServiceManager#putServiceAndInit
/**
 * Stores the service, initialises it and subscribes it to both of its
 * instance-list keys (ephemeral flag {@code true} and {@code false}).
 *
 * @param service the service to store and initialise
 * @throws NacosException if a downstream call fails
 */
private void putServiceAndInit(Service service) throws NacosException {
    putService(service);

    // Service#init schedules the client-beat check task for this service.
    service.init();

    // Listen on the key built with flag true first, then on the one built with false.
    for (boolean ephemeral : new boolean[] {true, false}) {
        consistencyService.listen(
                KeyBuilder.buildInstanceListKey(service.getNamespaceId(), service.getName(), ephemeral),
                service);
    }

    Loggers.SRV_LOG.info("[NEW-SERVICE] {}", service.toJSON());
}
com.alibaba.nacos.naming.core.Service#init
/**
 * Schedules this service's client-beat check task, then binds every cluster
 * in {@code clusterMap} to this service and initialises it.
 */
public void init() {
    HealthCheckReactor.scheduleCheck(clientBeatCheckTask);

    for (Cluster cluster : clusterMap.values()) {
        cluster.setService(this);
        cluster.init();
    }
}
com.alibaba.nacos.naming.healthcheck.HealthCheckReactor#scheduleCheck(com.alibaba.nacos.naming.healthcheck.ClientBeatCheckTask)
/**
 * Schedules the given client-beat check task to run with a fixed delay
 * (first run after 5000 ms, then 5000 ms after each run completes), unless a
 * task is already registered under the same task key.
 *
 * <p>The task is only submitted to the executor when no entry exists for its
 * key. With {@code putIfAbsent} the argument was evaluated eagerly, so a
 * duplicate periodic task was started on every repeated call, and its future
 * was dropped and could never be cancelled.
 *
 * @param task the check task to schedule
 */
public static void scheduleCheck(ClientBeatCheckTask task) {
    futureMap.computeIfAbsent(task.taskKey(),
            key -> EXECUTOR.scheduleWithFixedDelay(task, 5000, 5000, TimeUnit.MILLISECONDS));
}
从上面的源码可以看出,健康检查也是利用定时任务进行处理,其核心的类为ClientBeatCheckTask。该类也实现了Runnable接口,具体实现的源码路径为
com.alibaba.nacos.naming.healthcheck.ClientBeatCheckTask#run
/**
 * Periodic check of the last-beat timestamps of the service's instances.
 *
 * <p>The check is skipped when this node is not responsible for the service or
 * when health checking is switched off. Otherwise it runs in two passes over
 * {@code service.allIPs(true)}:
 * <ol>
 *   <li>an unmarked, healthy instance whose last beat is older than its
 *       heartbeat timeout is set unhealthy; a service-changed push and an
 *       {@code InstanceHeartbeatTimeoutEvent} are issued;</li>
 *   <li>if instance expiry is enabled, an unmarked instance whose last beat is
 *       older than its delete timeout is removed via {@code deleteIP}.</li>
 * </ol>
 * Any exception is logged and swallowed.
 */
public void run() {
    try {
        if (!getDistroMapper().responsible(service.getName())) {
            return;
        }
        if (!getSwitchDomain().isHealthCheckEnabled()) {
            return;
        }

        final List<Instance> candidates = service.allIPs(true);

        // Pass 1: flag instances whose heartbeat has timed out as unhealthy.
        for (Instance instance : candidates) {
            final boolean timedOut =
                    System.currentTimeMillis() - instance.getLastBeat() > instance.getInstanceHeartBeatTimeOut();
            if (!timedOut || instance.isMarked() || !instance.isHealthy()) {
                continue;
            }
            instance.setHealthy(false);
            Loggers.EVT_LOG.info("{POS} {IP-DISABLED} valid: {}:{}@{}@{}, region: {}, msg: client timeout after {}, last beat: {}",
                    instance.getIp(), instance.getPort(), instance.getClusterName(), service.getName(),
                    UtilsAndCommons.LOCALHOST_SITE, instance.getInstanceHeartBeatTimeOut(), instance.getLastBeat());
            getPushService().serviceChanged(service);
            SpringContext.getAppContext().publishEvent(new InstanceHeartbeatTimeoutEvent(this, instance));
        }

        if (!getGlobalConfig().isExpireInstance()) {
            return;
        }

        // Pass 2: delete instances that have been silent longer than the delete timeout.
        for (Instance instance : candidates) {
            if (!instance.isMarked()
                    && System.currentTimeMillis() - instance.getLastBeat() > instance.getIpDeleteTimeout()) {
                Loggers.SRV_LOG.info("[AUTO-DELETE-IP] service: {}, ip: {}", service.getName(), JSON.toJSONString(instance));
                // Per the accompanying article, this ends up in InstanceController#deregister.
                deleteIP(instance);
            }
        }
    } catch (Exception e) {
        Loggers.SRV_LOG.warn("Exception while processing client beat time out.", e);
    }
}
删除实例会调用InstanceController的deregister方法
com.alibaba.nacos.naming.controllers.InstanceController#deregister
/**
 * Removes the instance described by the request from its service.
 *
 * <p>If the service does not exist, a warning is logged and the call still
 * succeeds.
 *
 * @param request incoming HTTP request describing the instance
 * @return the literal string {@code "ok"}
 * @throws Exception if a required parameter is missing or removal fails
 */
public String deregister(HttpServletRequest request) throws Exception {
    final Instance target = getIPAddress(request);
    final String namespaceId = WebUtils.optional(request, CommonParams.NAMESPACE_ID,
            Constants.DEFAULT_NAMESPACE_ID);
    final String serviceName = WebUtils.required(request, CommonParams.SERVICE_NAME);

    final Service owner = serviceManager.getService(namespaceId, serviceName);
    if (owner != null) {
        // The service exists: remove the instance from it.
        serviceManager.removeInstance(namespaceId, serviceName, target.isEphemeral(), target);
    } else {
        Loggers.SRV_LOG.warn("remove instance from non-exist service: {}", serviceName);
    }
    return "ok";
}
com.alibaba.nacos.naming.core.ServiceManager#removeInstance(java.lang.String, java.lang.String, boolean, com.alibaba.nacos.naming.core.Instance...)
/**
 * Removes the given instances from a service while holding the service's monitor.
 *
 * @param namespaceId namespace of the service
 * @param serviceName name of the service
 * @param ephemeral   ephemeral flag used to select the instance list
 * @param ips         instances to remove
 * @throws NacosException if the removal fails
 */
public void removeInstance(String namespaceId, String serviceName, boolean ephemeral, Instance... ips) throws NacosException {
    final Service target = getService(namespaceId, serviceName);

    // Synchronise on the service object for the duration of the removal.
    synchronized (target) {
        removeInstance(namespaceId, serviceName, ephemeral, target, ips);
    }
}
com.alibaba.nacos.naming.core.ServiceManager#removeInstance(java.lang.String, java.lang.String, boolean, com.alibaba.nacos.naming.core.Service, com.alibaba.nacos.naming.core.Instance...)
/**
 * Computes the service's instance list without the given instances and
 * publishes it through the consistency service.
 *
 * @param namespaceId namespace of the service
 * @param serviceName name of the service
 * @param ephemeral   ephemeral flag used to build the instance-list key
 * @param service     service the instances belong to
 * @param ips         instances to remove
 * @throws NacosException if publishing the new list fails
 */
public void removeInstance(String namespaceId, String serviceName, boolean ephemeral, Service service, Instance... ips) throws NacosException {
    // Key of the instance list being rewritten.
    final String listKey = KeyBuilder.buildInstanceListKey(namespaceId, serviceName, ephemeral);

    // Instance list with the given instances subtracted.
    final List<Instance> remaining = substractIpAddresses(service, ephemeral, ips);

    final Instances payload = new Instances();
    payload.setInstanceList(remaining);

    // Per the accompanying article, this updates the registry and syncs it across the cluster.
    consistencyService.put(listKey, payload);
}
这个方法和服务注册的addInstance几乎一样,只是事件由add事件变成了remove事件。该方法里的substractIpAddresses和consistencyService.put方法都在服务注册的章节中有对应源码的详解,具体可以到服务注册章节查阅。
总结
Nacos注册中心为了感知客户端的服务实例是否存活,设计了心跳机制和健康检查。这样做的目的就是为了能够尽量感知客户端是否存活,减少客户端服务之间相互调用的失败重连次数,最大限度保证服务的高可用性。