参考链接:https://cloud.tencent.com/developer/article/2120188
关键属性
BeatReactor
- Map<String, BeatInfo> dom2Beat:保存客户端心跳信息,key:serviceName + "#" + ip + "#" + port
ServiceManager
- Map<String, Map<String, Service>> serviceMap:保存实例注册信息Map(namespace, Map(group::serviceName, Service))
Service
- ClientBeatCheckTask clientBeatCheckTask:心跳检测任务
- Map<String, Cluster> clusterMap:Cluster Map,Cluster中保存这个cluster下的所有Instance
Cluster
- HealthCheckTask checkTask:健康检查任务,异步向当前Cluster下的持久化Instance发送探测请求,并根据响应标记其健康状态
- Set persistentInstances:保存持久化Instance
- Set ephemeralInstances:保存临时Instance,Instance注册实际是修改这个Set(将新添加的和需要保留的Instance放到新的Set中,然后用新Set替代旧的ephemeralInstances)
心跳流程
临时实例
- 客户端判断为临时实例时,会添加心跳任务,任务执行间隔固定时间,向服务端的/nacos/v1/ns/instance/beat接口发送心跳信息
- 服务端的心跳接口收到信息后,修改Instance为健康状态,并更新收到心跳的时间(lastBeat)为当前时间
- Service的ClientBeatCheckTask线程轮询检查临时实例的lastBeat的值,如果超过健康时间,修改实例为不健康,如果超过删除时间,调用内部删除的http接口删除实例
持久化实例
Cluster初始化时添加HealthCheckTask任务到线程池,HealthCheckTask会轮询Cluster中的持久化实例并向它们发送请求,通过请求的响应来标记健康状态
客户端
NacosNamingService.registerInstance
客户端建立心跳任务并向服务端发起注册请求
/**
 * Registers {@code instance} under the given service and group.
 *
 * <p>For an ephemeral instance, a heartbeat entry is built and handed to the beat reactor before
 * the registration request is sent through {@code serverProxy}.
 *
 * @param serviceName name of the service
 * @param groupName   name of the group
 * @param instance    instance to register
 * @throws NacosException declared by the signature; propagated from the calls made here
 */
public void registerInstance(String serviceName, String groupName, Instance instance) throws NacosException {
    NamingUtils.checkInstanceIsLegal(instance);

    final String groupedName = NamingUtils.getGroupedName(serviceName, groupName);

    // Only ephemeral instances get a client-side heartbeat task.
    if (instance.isEphemeral()) {
        beatReactor.addBeatInfo(groupedName, beatReactor.buildBeatInfo(groupedName, instance));
    }

    serverProxy.registerService(groupedName, groupName, instance);
}
BeatReactor.buildBeatInfo
建立新的心跳信息
/**
 * Creates a new heartbeat descriptor for {@code instance}.
 *
 * <p>The descriptor copies ip, port, cluster name, weight, metadata and heartbeat interval from
 * the instance, uses {@code groupedServiceName} as its service name, and starts with its
 * "scheduled" flag set to {@code false}.
 *
 * @param groupedServiceName service name already combined with its group
 * @param instance           instance the heartbeat is sent for
 * @return the populated {@code BeatInfo}
 */
public BeatInfo buildBeatInfo(String groupedServiceName, Instance instance) {
    final BeatInfo beat = new BeatInfo();

    // Where the instance lives.
    beat.setServiceName(groupedServiceName);
    beat.setCluster(instance.getClusterName());
    beat.setIp(instance.getIp());
    beat.setPort(instance.getPort());

    // Attributes carried along with each beat.
    beat.setWeight(instance.getWeight());
    beat.setMetadata(instance.getMetadata());

    // Scheduling state.
    beat.setPeriod(instance.getInstanceHeartBeatInterval());
    beat.setScheduled(false);

    return beat;
}
BeatReactor.addBeatInfo
添加心跳任务
/**
 * Stores {@code beatInfo} in the beat map and schedules its first heartbeat.
 *
 * <p>If an entry already existed under the same key, that older entry is flagged as stopped so
 * its task exits the next time it runs.
 *
 * @param serviceName service name used to build the map key
 * @param beatInfo    heartbeat descriptor to register and schedule
 */
public void addBeatInfo(String serviceName, BeatInfo beatInfo) {
    NAMING_LOGGER.info("[BEAT] adding beat: {} to beat map.", beatInfo);

    // Per the original note, the key has the form serviceName + "#" + ip + "#" + port.
    final String beatKey = buildKey(serviceName, beatInfo.getIp(), beatInfo.getPort());

    // put() returns the entry it replaced, if any; stop that one.
    final BeatInfo replaced = dom2Beat.put(beatKey, beatInfo);
    if (replaced != null) {
        replaced.setStopped(true);
    }

    // First beat fires after one period.
    executorService.schedule(new BeatTask(beatInfo), beatInfo.getPeriod(), TimeUnit.MILLISECONDS);
    MetricsMonitor.getDom2BeatSizeMonitor().set(dom2Beat.size());
}
BeatTask.run
心跳任务的执行,当前执行完成后,通过finally再次将任务加入到线程池中,来实现向服务端定时发送心跳信息
向服务端发送心跳信息后,如果服务端不存在此Instance,则再次调用注册方法
/**
 * Sends one heartbeat to the server and schedules the next one.
 *
 * <p>Steps:
 * <ol>
 *   <li>Exit immediately (without rescheduling) when the beat has been stopped.</li>
 *   <li>Send the beat and read the server's reply: the next interval, the light-beat flag and
 *       the response code.</li>
 *   <li>If the server answers {@code RESOURCE_NOT_FOUND}, rebuild an ephemeral instance from the
 *       beat data and register it again.</li>
 *   <li>In {@code finally}, schedule a new {@code BeatTask}; the delay is the interval returned
 *       by the server when it is positive, otherwise the beat's own period.</li>
 * </ol>
 *
 * <p>Failures are logged and never propagate, so the heartbeat chain is not broken by a single
 * failed attempt.
 */
public void run() {
    // A stopped beat must neither send nor reschedule.
    if (beatInfo.isStopped()) {
        return;
    }
    long nextTime = beatInfo.getPeriod();
    try {
        // Per the original note, this goes to /nacos/v1/ns/instance/beat.
        JsonNode result = serverProxy.sendBeat(beatInfo, BeatReactor.this.lightBeatEnabled);

        // Guarded like the other optional fields below: a reply without the interval is treated
        // as "no override" instead of raising a NullPointerException.
        long interval = 0L;
        if (result.has("clientBeatInterval")) {
            interval = result.get("clientBeatInterval").asLong();
        }
        boolean lightBeatEnabled = false;
        if (result.has(CommonParams.LIGHT_BEAT_ENABLED)) {
            lightBeatEnabled = result.get(CommonParams.LIGHT_BEAT_ENABLED).asBoolean();
        }
        BeatReactor.this.lightBeatEnabled = lightBeatEnabled;
        if (interval > 0) {
            nextTime = interval;
        }
        int code = NamingResponseCode.OK;
        if (result.has(CommonParams.CODE)) {
            code = result.get(CommonParams.CODE).asInt();
        }
        // The server does not know this instance: register it again from the beat data.
        if (code == NamingResponseCode.RESOURCE_NOT_FOUND) {
            Instance instance = new Instance();
            instance.setPort(beatInfo.getPort());
            instance.setIp(beatInfo.getIp());
            instance.setWeight(beatInfo.getWeight());
            instance.setMetadata(beatInfo.getMetadata());
            instance.setClusterName(beatInfo.getCluster());
            instance.setServiceName(beatInfo.getServiceName());
            // NOTE(review): writes the instance's own id back onto itself; looks like a no-op
            // unless getInstanceId() derives a value -- confirm before removing.
            instance.setInstanceId(instance.getInstanceId());
            instance.setEphemeral(true);
            try {
                serverProxy.registerService(beatInfo.getServiceName(),
                        NamingUtils.getGroupName(beatInfo.getServiceName()), instance);
            } catch (Exception registerEx) {
                // Still best-effort (the next beat retries), but no longer silent.
                NAMING_LOGGER.warn("[CLIENT-BEAT] failed to re-register instance for beat: {}, msg: {}",
                        beatInfo, registerEx.getMessage(), registerEx);
            }
        }
    } catch (NacosException ex) {
        NAMING_LOGGER.warn("[CLIENT-BEAT] failed to send beat: {}, code: {}, msg: {}",
                JacksonUtils.toJson(beatInfo), ex.getErrCode(), ex.getErrMsg());
    } catch (Exception unknownEx) {
        NAMING_LOGGER.error("[CLIENT-BEAT] failed to send beat: {}, unknown exception msg: {}",
                JacksonUtils.toJson(beatInfo), unknownEx.getMessage(), unknownEx);
    } finally {
        // Queue the next beat: server-provided interval if one was returned, else the beat's period.
        executorService.schedule(new BeatTask(beatInfo), nextTime, TimeUnit.MILLISECONDS);
    }
}
服务端
InstanceController.beat
提供给客户端的心跳接口,处理来自客户端的心跳请求
1.从请求中获取Instance的相关信息namespaceId,serviceName,clusterName,ip,port
2.如果Instance不存在且心跳信息也不存在,返回RESOURCE_NOT_FOUND响应码,客户端收到这个响应码后会重新向服务端发起注册请求
3.Instance不存在但心跳信息存在,调用注册方法注册Instance
4.判断Service是否存在,不存在抛出异常,存在则调用处理心跳信息的方法
/**
 * Handles a heartbeat sent by a client for one instance.
 *
 * <ol>
 *   <li>Reads namespaceId, serviceName, clusterName, ip and port from the request. When the
 *       optional "beat" payload is present, its ip and port win, and so does its cluster when
 *       that is non-blank.</li>
 *   <li>If the instance is unknown and there is no beat payload, replies with the
 *       {@code RESOURCE_NOT_FOUND} code.</li>
 *   <li>If the instance is unknown but a payload exists, registers an instance built from it.</li>
 *   <li>Fails when the service cannot be found; otherwise passes the beat to the service.</li>
 * </ol>
 *
 * @param request incoming HTTP request
 * @return JSON node holding the response code, the client beat interval and the light-beat flag
 * @throws Exception a {@code NacosException} when the service is not found, plus whatever the
 *                   called helpers throw
 */
@CanDistro
@PutMapping("/beat")
@Secured(parser = NamingResourceParser.class, action = ActionTypes.WRITE)
public ObjectNode beat(HttpServletRequest request) throws Exception {
    final ObjectNode response = JacksonUtils.createEmptyJsonNode();
    response.put(SwitchEntry.CLIENT_BEAT_INTERVAL, switchDomain.getClientBeatInterval());

    // The "beat" parameter is optional; parse it only when it carries text.
    final String rawBeat = WebUtils.optional(request, "beat", StringUtils.EMPTY);
    RsInfo beatPayload = StringUtils.isNotBlank(rawBeat) ? JacksonUtils.toObj(rawBeat, RsInfo.class) : null;

    // Start from the plain request parameters ...
    String clusterName = WebUtils
            .optional(request, CommonParams.CLUSTER_NAME, UtilsAndCommons.DEFAULT_CLUSTER_NAME);
    String ip = WebUtils.optional(request, "ip", StringUtils.EMPTY);
    int port = Integer.parseInt(WebUtils.optional(request, "port", "0"));

    // ... and let the payload override them when there is one.
    if (beatPayload != null) {
        final String payloadCluster = beatPayload.getCluster();
        if (StringUtils.isNotBlank(payloadCluster)) {
            clusterName = payloadCluster;
        } else {
            // fix #2533
            beatPayload.setCluster(clusterName);
        }
        ip = beatPayload.getIp();
        port = beatPayload.getPort();
    }

    final String namespaceId = WebUtils
            .optional(request, CommonParams.NAMESPACE_ID, Constants.DEFAULT_NAMESPACE_ID);
    final String serviceName = WebUtils.required(request, CommonParams.SERVICE_NAME);
    NamingUtils.checkServiceNameFormat(serviceName);

    Instance instance = serviceManager.getInstance(namespaceId, serviceName, clusterName, ip, port);
    if (instance == null) {
        if (beatPayload == null) {
            // Nothing to rebuild the instance from: the client re-registers on this code.
            response.put(CommonParams.CODE, NamingResponseCode.RESOURCE_NOT_FOUND);
            return response;
        }

        // Unknown instance but a full payload: register it on the client's behalf.
        instance = new Instance();
        instance.setPort(beatPayload.getPort());
        instance.setIp(beatPayload.getIp());
        instance.setWeight(beatPayload.getWeight());
        instance.setMetadata(beatPayload.getMetadata());
        instance.setClusterName(clusterName);
        instance.setServiceName(serviceName);
        instance.setInstanceId(instance.getInstanceId());
        instance.setEphemeral(beatPayload.isEphemeral());

        serviceManager.registerInstance(namespaceId, serviceName, instance);
    }

    final Service service = serviceManager.getService(namespaceId, serviceName);
    if (service == null) {
        throw new NacosException(NacosException.SERVER_ERROR,
                "service not found: " + serviceName + "@" + namespaceId);
    }

    // A request without a payload still needs beat data for the processor.
    if (beatPayload == null) {
        beatPayload = new RsInfo();
        beatPayload.setIp(ip);
        beatPayload.setPort(port);
        beatPayload.setCluster(clusterName);
    }
    service.processClientBeat(beatPayload);

    response.put(CommonParams.CODE, NamingResponseCode.OK);
    // An instance-level heartbeat interval replaces the global one written at the top.
    if (instance.containsMetadata(PreservedMetadataKeys.HEART_BEAT_INTERVAL)) {
        response.put(SwitchEntry.CLIENT_BEAT_INTERVAL, instance.getInstanceHeartBeatInterval());
    }
    response.put(SwitchEntry.LIGHT_BEAT_ENABLED, switchDomain.isLightBeatEnabled());
    return response;
}
Service.processClientBeat
/**
 * Wraps the received beat in a {@code ClientBeatProcessor} bound to this service and submits
 * it through {@code HealthCheckReactor.scheduleNow}.
 *
 * @param rsInfo heartbeat data received from the client
 */
public void processClientBeat(final RsInfo rsInfo) {
    final ClientBeatProcessor processor = new ClientBeatProcessor();
    processor.setRsInfo(rsInfo);
    processor.setService(this);

    HealthCheckReactor.scheduleNow(processor);
}
ClientBeatProcessor.run
心跳信息处理,修改Cluster中对应的Instance.lastBeat的值为当前系统时间,如果之前Instance.healthy为false且Instance.marked为false,修改Instance.healthy为true且发布Instance改变的事件
/**
 * Applies a received heartbeat to the matching instance(s) of the service.
 *
 * <p>Every instance returned by {@code cluster.allIPs(true)} whose ip and port equal those of
 * the beat gets its {@code lastBeat} set to the current time. If such an instance is neither
 * marked nor healthy, it is switched back to healthy and a service-changed notification is
 * sent through the push service.
 *
 * <p>If the beat's cluster is not present in the service's cluster map, the beat is ignored
 * instead of failing with a {@code NullPointerException}.
 */
public void run() {
    Service service = this.service;
    String ip = rsInfo.getIp();
    String clusterName = rsInfo.getCluster();
    int port = rsInfo.getPort();

    Cluster cluster = service.getClusterMap().get(clusterName);
    if (cluster == null) {
        // Map lookup found no cluster for this beat: nothing to update.
        return;
    }

    List<Instance> instances = cluster.allIPs(true);
    for (Instance instance : instances) {
        if (instance.getIp().equals(ip) && instance.getPort() == port) {
            instance.setLastBeat(System.currentTimeMillis());
            // A beat revives an unhealthy instance unless it has been marked.
            if (!instance.isMarked() && !instance.isHealthy()) {
                instance.setHealthy(true);
                getPushService().serviceChanged(service);
            }
        }
    }
}
ServiceManager.putServiceAndInit
将Service保存到serviceMap中,并将Service初始化,初始化时会新建心跳的定时任务
/**
 * Stores {@code service}, initialises the stored instance, and subscribes it to both
 * instance-list keys built by {@code KeyBuilder} (flag {@code true} and flag {@code false}).
 *
 * @param service service to store and initialise
 * @throws NacosException declared by the signature; propagated from the calls made here
 */
private void putServiceAndInit(Service service) throws NacosException {
    putService(service);

    // Continue with whatever is now held under this namespace and name.
    final Service stored = getService(service.getNamespaceId(), service.getName());
    stored.init();

    final String ephemeralKey =
            KeyBuilder.buildInstanceListKey(stored.getNamespaceId(), stored.getName(), true);
    consistencyService.listen(ephemeralKey, stored);

    final String persistentKey =
            KeyBuilder.buildInstanceListKey(stored.getNamespaceId(), stored.getName(), false);
    consistencyService.listen(persistentKey, stored);

    Loggers.SRV_LOG.info("[NEW-SERVICE] {}", stored.toJson());
}
/**
 * Starts heartbeat checking for this service.
 *
 * <p>Schedules {@code clientBeatCheckTask}, then binds every cluster in {@code clusterMap} to
 * this service and initialises it.
 */
public void init() {
    HealthCheckReactor.scheduleCheck(clientBeatCheckTask);

    for (Cluster cluster : clusterMap.values()) {
        cluster.setService(this);
        cluster.init();
    }
}
Service.init()
添加当前Service对应的心跳检测任务到线程池,5S一次,同时调用Cluster的init方法
/**
 * Initialises this service: schedules its {@code clientBeatCheckTask} and then, for each
 * cluster held in {@code clusterMap}, sets this service as the owner and calls the cluster's
 * own {@code init()}.
 */
public void init() {
    HealthCheckReactor.scheduleCheck(clientBeatCheckTask);

    for (Cluster cluster : clusterMap.values()) {
        cluster.setService(this);
        cluster.init();
    }
}
ClientBeatCheckTask.run
/**
 * Checks the heartbeat age of every instance returned by {@code service.allIPs(true)}.
 *
 * <p>Pass 1: an unmarked, healthy instance whose last beat is older than its heartbeat timeout
 * is set unhealthy; a service-changed notification and an
 * {@code InstanceHeartbeatTimeoutEvent} are emitted.
 *
 * <p>Pass 2 (only when instance expiry is enabled): an unmarked instance whose last beat is
 * older than its delete timeout is removed via {@code deleteIp}.
 *
 * <p>Nothing happens when this server is not responsible for the service or when health
 * checking is switched off. Any exception is logged and swallowed.
 */
public void run() {
    try {
        // Only the responsible server checks the service, and only while checks are enabled.
        if (!getDistroMapper().responsible(service.getName()) || !getSwitchDomain().isHealthCheckEnabled()) {
            return;
        }

        final List<Instance> ephemeralInstances = service.allIPs(true);

        // first set health status of instances:
        for (Instance candidate : ephemeralInstances) {
            final boolean beatTimedOut =
                    System.currentTimeMillis() - candidate.getLastBeat() > candidate.getInstanceHeartBeatTimeOut();
            if (beatTimedOut && !candidate.isMarked() && candidate.isHealthy()) {
                candidate.setHealthy(false);
                getPushService().serviceChanged(service);
                ApplicationUtils.publishEvent(new InstanceHeartbeatTimeoutEvent(this, candidate));
            }
        }

        // Per the original note, isExpireInstance() is true by default.
        if (!getGlobalConfig().isExpireInstance()) {
            return;
        }

        // then remove obsolete instances:
        for (Instance candidate : ephemeralInstances) {
            // Per the original note, deleteIp issues an asynchronous HTTP delete request.
            if (!candidate.isMarked()
                    && System.currentTimeMillis() - candidate.getLastBeat() > candidate.getIpDeleteTimeout()) {
                deleteIp(candidate);
            }
        }
    } catch (Exception e) {
        Loggers.SRV_LOG.warn("Exception while processing client beat time out.", e);
    }
}
Cluster.init
初始化Cluster,通过synchronized方法配合inited标志位(原文称inited由volatile修饰)保证只初始化一次,添加健康检测任务到线程池中
/**
 * Initialises this cluster once.
 *
 * <p>On the first call a {@code HealthCheckTask} for this cluster is created and scheduled,
 * and {@code inited} is set. Later calls do nothing. The method is {@code synchronized}.
 */
public synchronized void init() {
    if (!inited) {
        checkTask = new HealthCheckTask(this);
        HealthCheckReactor.scheduleCheck(checkTask);
        inited = true;
    }
}
HealthCheckTask.run
/**
 * Runs one health-check round for the cluster and re-queues itself.
 *
 * <p>The check is delegated to {@code healthCheckProcessor} only when this server is
 * responsible for the cluster's service and health checking is enabled for that service.
 * Unless the task has been cancelled, it schedules itself again in {@code finally}, which is
 * what makes the check repeat.
 */
public void run() {
    try {
        final boolean responsible = distroMapper.responsible(cluster.getService().getName());
        if (responsible && switchDomain.isHealthCheckEnabled(cluster.getService().getName())) {
            // Per the original note, this probes the cluster's persistent instances
            // (cluster.allIPs(false)) and updates each instance's health from the response.
            healthCheckProcessor.process(this);
        }
    } catch (Throwable e) {
        // Log output was left out of this excerpt.
    } finally {
        // Re-submit this task so the check keeps looping.
        if (!cancelled) {
            HealthCheckReactor.scheduleCheck(this);
            // Log output was left out of this excerpt.
        }
    }
}