1.配置
"cluster_manager": {
"clusters": [
{
"name": "serverCluster",
"type": "SIMPLE",
"lb_type": "LB_RANDOM",
"max_request_per_conn": 1024,
"conn_buffer_limit_bytes": 32768,
"hosts": [
{"address":"127.0.0.1:8080"}
]
},
{
"name": "clientCluster",
"health_check": {
"protocol":"Http1",
"timeout":"10s",
"interval":"1m",
"interval_jitter":"1m",
"healthy_threshold":1,
"service_name":"test"
},
"type": "SIMPLE",
"lb_type": "LB_RANDOM",
"max_request_per_conn": 1024,
"conn_buffer_limit_bytes": 32768,
"hosts": [
{"address":"127.0.0.1:8080"}
]
}
]
}
2.源码
1.启动后,会去初始化cluster
2.初始化时,根据配置的health_check字段(service_name不为空时),给cluster设置healthChecker.
3.初始化后的一个步骤,就是UpdateHosts.这个方法里会调用healthChecker.SetHealthCheckerHostSet,先停止旧的检查,再对新的host列表重新启动健康检查.
- 3.1主要就是对每个host,启动一个协程去检测端口是否通.
- 3.2如果不通(或超时),设置host为不健康;通的话,设置为健康(healthy).无论结果如何,都会按检查间隔继续下一次健康检查.
// simpleCluster is an implementation of types.Cluster type simpleCluster struct { info *clusterInfo mutex sync.Mutex healthChecker types.HealthChecker lbInstance types.LoadBalancer // load balancer used for this cluster hostSet *hostSet snapshot atomic.Value }
func newSimpleCluster(clusterConfig v2.Cluster) types.Cluster { // TODO support original dst cluster if clusterConfig.ClusterType == v2.ORIGINALDST_CLUSTER { clusterConfig.LbType = v2.LB_ORIGINAL_DST } info := &clusterInfo{ name: clusterConfig.Name, clusterType: clusterConfig.ClusterType, maxRequestsPerConn: clusterConfig.MaxRequestPerConn, connBufferLimitBytes: clusterConfig.ConnBufferLimitBytes, stats: newClusterStats(clusterConfig.Name), lbSubsetInfo: NewLBSubsetInfo(&clusterConfig.LBSubSetConfig), // new subset load balancer info lbOriDstInfo: NewLBOriDstInfo(&clusterConfig.LBOriDstConfig), // new oridst load balancer info lbType: types.LoadBalancerType(clusterConfig.LbType), resourceManager: NewResourceManager(clusterConfig.CirBreThresholds), } // set ConnectTimeout if clusterConfig.ConnectTimeout != nil { info.connectTimeout = clusterConfig.ConnectTimeout.Duration } else { info.connectTimeout = network.DefaultConnectTimeout } // tls mng mgr, err := mtls.NewTLSClientContextManager(&clusterConfig.TLS) if err != nil { log.DefaultLogger.Alertf("cluster.config", "[upstream] [cluster] [new cluster] create tls context manager failed, %v", err) } info.tlsMng = mgr cluster := &simpleCluster{ info: info, } // init a empty hostSet := &hostSet{} cluster.snapshot.Store(&clusterSnapshot{ info: info, hostSet: hostSet, lb: NewLoadBalancer(info, hostSet), }) if clusterConfig.HealthCheck.ServiceName != "" { log.DefaultLogger.Infof("[upstream] [cluster] [new cluster] cluster %s have health check", clusterConfig.Name) cluster.healthChecker = healthcheck.CreateHealthCheck(clusterConfig.HealthCheck) } return cluster }
func (sc *simpleCluster) UpdateHosts(newHosts []types.Host) { //略略略 if sc.healthChecker != nil { sc.healthChecker.SetHealthCheckerHostSet(hostSet) } }
// only called in cluster, lock in cluster // SetHealthCheckerHostSet reset the healthchecker's hosts func (hc *healthChecker) SetHealthCheckerHostSet(hostSet types.HostSet) { hc.stop() hc.hosts = hostSet.Hosts() hc.start() }
//对每一个host进行健康检查 func (hc *healthChecker) startCheck(host types.Host) { addr := host.AddressString() if _, ok := hc.checkers[addr]; !ok { //创建一个session对象,代表一次健康检查的会话. s := hc.sessionFactory.NewSession(hc.sessionConfig, host) if s == nil { log.DefaultLogger.Alertf("healthcheck.session", "[upstream] [health check] Create Health Check Session Error, Remote Address = %s", addr) return } c := newChecker(s, host, hc) hc.checkers[addr] = c utils.GoWithRecover(func() { //=开启一个协程,去做健康检查= c.Start() }, nil) atomic.AddInt64(&hc.localProcessHealthy, 1) // default host is healthy } }
下面是关键逻辑了
/** 健康检查.每一个cluster会对应一个sessionChecker.执行OnCheck方法 */ func (c *sessionChecker) Start() { //1.创建一个定时器.定时调用check函数 c.checkTimer = utils.NewTimer(firstInterval, c.OnCheck) for { select { case <-c.stop: return default: // prepare a check currentID := atomic.AddUint64(&c.checkID, 1) select { case <-c.stop: return //====健康检查结果==== case resp := <-c.resp: // if the ID is not equal, means we receive a timeout for this ID, ignore the response if resp.ID == currentID { c.checkTimeout.Stop() //存活的话,就把节点设置为healthy if resp.Healthy { c.HandleSuccess() } else { //否则就shutdown c.HandleFailure(types.FailureActive) } // next health checker c.checkTimer = utils.NewTimer(c.HealthChecker.getCheckInterval(), c.OnCheck) if log.DefaultLogger.GetLogLevel() >= log.DEBUG { log.DefaultLogger.Debugf("[upstream] [health check] [session checker] receive a response id: %d", resp.ID) } } else { if log.DefaultLogger.GetLogLevel() >= log.DEBUG { log.DefaultLogger.Debugf("[upstream] [health check] [session checker] receive a expired id response, response id: %d, currentID: %d", resp.ID, currentID) } } case <-c.timeout: //超时后,进行失败处理 c.checkTimer.Stop() c.Session.OnTimeout() // session timeout callbacks c.HandleFailure(types.FailureNetwork) // next health checker //重新开始健康检查 c.checkTimer = utils.NewTimer(c.HealthChecker.getCheckInterval(), c.OnCheck) if log.DefaultLogger.GetLogLevel() >= log.DEBUG { log.DefaultLogger.Debugf("[upstream] [health check] [session checker] receive a timeout response at id: %d", currentID) } } } } }/** 健康检查 */ func (c *sessionChecker) OnCheck() { // record current id id := atomic.LoadUint64(&c.checkID) c.HealthChecker.stats.attempt.Inc(1) // start a timeout before check health c.checkTimeout.Stop() //设置超时定时器.超时后,就会往timeout这个channel放个true c.checkTimeout = utils.NewTimer(c.HealthChecker.timeout, c.OnTimeout) c.resp <- checkResponse{ ID: id, Healthy: c.Session.CheckHealth(), } }//实际监看检查的逻辑 func (s *TCPDialSession) CheckHealth() bool { // default dial timeout, maybe already timeout 
by checker conn, err := net.DialTimeout("tcp", s.addr, 30*time.Second) if err != nil { log.DefaultLogger.Infof("[upstream] [health check] [tcpdial session] dial tcp for host %s error: %v", s.addr, err) return false } conn.Close() return true }
注意,支持的健康检查类型
// ProtocolsSupported is a set of protocol names, keyed by the string form of
// each protocol constant. Per the surrounding notes these are the protocols for
// which health checking is supported.
var ProtocolsSupported = map[string]bool{
	string(protocol.Auto):      true,
	string(protocol.HTTP1):     true,
	string(protocol.HTTP2):     true,
	string(protocol.Xprotocol): true,
}