细谈Calico探针检查
Calico为什么NotReady/重启了
2022年了,大量互联网企业使用Kubernetes做服务发布,网络组件大多采用overlay/underlay模式,使用Calico组网的也越来越多。但是calico-node为什么NotReady、为什么是0/1状态,很多人还是傻傻分不清。
Calico探针检测方式
在Kubernetes自带的检测方式中,有Readiness跟Liveness两种探针维护组件的健康状态:Liveness检测异常会进行重启操作,Readiness检测异常会把pod置为NotReady。
Liveness
Liveness负责维护pod是否处于正常状态:根据配置的参数检测后,如果返回异常,会对pod进行重启操作。
Calico组件Liveness是怎么做的呢
检查bird状态是否正常:calico-node -bird-live
检查felix状态是否正常:calico-node -felix-live
具体干了什么呢
health检查代码入口位置 node/pkg/health/health.go
// Run executes the calico/node health checks selected by the boolean flags and
// terminates the process with exit code 1 when no check was selected or when
// any selected check fails. It returns normally only if every selected check
// passes.
//
// felixLive, birdLive and bird6Live select the liveness checks; felixReady,
// bird and bird6 select the readiness checks. thresholdTime is used both as
// the timeout of the context shared by the checks and as the threshold handed
// to the BIRD readiness checks.
func Run(bird, bird6, felixReady, felixLive, birdLive, bird6Live bool, thresholdTime time.Duration) {
	anyLiveness := felixLive || birdLive || bird6Live
	anyReadiness := bird || felixReady || bird6
	if !(anyLiveness || anyReadiness) {
		fmt.Printf("calico/node check error: must specify at least one of -bird-live, -bird6-live, -felix-live, -bird, -bird6, or -felix\n")
		os.Exit(1)
	}

	timeoutCtx, cancel := context.WithTimeout(context.Background(), thresholdTime)
	defer cancel()
	group, groupCtx := errgroup.WithContext(timeoutCtx)

	// One entry per probe, in the order in which they are started. failure is
	// the format string used to wrap the error returned by check.
	probes := []struct {
		enabled bool
		failure string
		check   func() error
	}{
		{
			enabled: felixLive,
			failure: "calico/node is not ready: Felix is not live: %+v",
			check:   func() error { return checkFelixHealth(groupCtx, felixLivenessEp, "liveness") },
		},
		{
			enabled: birdLive,
			failure: "calico/node is not ready: bird/confd is not live: %+v",
			check:   func() error { return checkServiceIsLive([]string{"confd", "bird"}) },
		},
		{
			enabled: bird6Live,
			failure: "calico/node is not ready: bird6/confd is not live: %+v",
			check:   func() error { return checkServiceIsLive([]string{"confd", "bird6"}) },
		},
		{
			enabled: felixReady,
			failure: "calico/node is not ready: felix is not ready: %+v",
			check:   func() error { return checkFelixHealth(groupCtx, felixReadinessEp, "readiness") },
		},
		{
			enabled: bird,
			failure: "calico/node is not ready: BIRD is not ready: %+v",
			check:   func() error { return checkBIRDReady("4", thresholdTime) },
		},
		{
			enabled: bird6,
			failure: "calico/node is not ready: BIRD6 is not ready: %+v",
			check:   func() error { return checkBIRDReady("6", thresholdTime) },
		},
	}

	for _, p := range probes {
		if !p.enabled {
			continue
		}
		p := p // give each goroutine its own copy of the loop variable
		group.Go(func() error {
			if err := p.check(); err != nil {
				return fmt.Errorf(p.failure, err)
			}
			return nil
		})
	}

	// Wait yields the first non-nil error reported by any probe.
	if err := group.Wait(); err != nil {
		fmt.Printf("%s\n", err)
		os.Exit(1)
	}
}
检查bird做了什么
bird分为bird和bird6进程,bird进程负责ipv4bgp邻居状态建立等内容;bird6进程负责ipv6bgp邻居状态建立;
// checkService reports whether the service named serviceName is running
// according to `sv status /etc/service/enabled/<serviceName>`.
//
// It returns nil when the command succeeds and its output starts with "run".
// Otherwise it returns either the error from running the command or an error
// that quotes the unexpected status output.
func checkService(serviceName string) error {
	out, err := exec.Command("sv", "status", fmt.Sprintf("/etc/service/enabled/%s", serviceName)).Output()
	if err != nil {
		return err
	}

	cmdOutput := string(out)
	if !strings.HasPrefix(cmdOutput, "run") {
		// Pass the values as arguments instead of pre-formatting them into the
		// format string, so a literal '%' in the service name or in the command
		// output is reproduced verbatim rather than parsed as a verb.
		return fmt.Errorf("Service %s is not running. Output << %s >>", serviceName, strings.Trim(cmdOutput, "\n"))
	}
	return nil
}
检查bird的时候会采用sv status confd/bird去检查容器内进程状态(等价于ps -ef |grep confd/bird)
检查bird6的时候会采用sv status confd/bird6去检查容器内进程状态(等价于ps -ef |grep confd/bird6)
检查felix做了什么
// checkFelixHealth sends an HTTP GET to endpoint and reports whether the
// probe succeeded.
//
// ctx is attached to the request, so cancelling it (or its deadline expiring)
// aborts the call. probeType is only used to label the returned error; the
// callers in this file pass "liveness" or "readiness".
//
// It returns nil when the response status is in [200, 400). It returns an
// error when the request cannot be built, when the request fails, or when the
// status code is outside that range.
func checkFelixHealth(ctx context.Context, endpoint, probeType string) error {
	// Build the request with the context already attached and check the error
	// before touching req: on failure req is nil, and calling a method on it
	// would panic instead of reporting the malformed endpoint.
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint, nil)
	if err != nil {
		return err
	}

	c := &http.Client{}
	resp, err := c.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 400 {
		return fmt.Errorf("%s probe reporting %d", probeType, resp.StatusCode)
	}
	return nil
}
Felix提供了一个health接口:"http://" + felixHost + ":" + felixPort + "/liveness",通过这个接口判断felix的liveness检查是否正常。
但是这个接口提供背后做了什么?这是一个持续更新的检查
在felix/daemon/daemon.go
configRetry:
for {
if numClientsCreated > 60 {
// If we're in a restart loop, periodically exit (so we can be restarted) since
// - it may solve the problem if there's something wrong with our process
// - it prevents us from leaking connections to the datastore.
exitWithCustomRC(configChangedRC, "Restarting to avoid leaking datastore connections")
}
// Make an initial report that says we're live but not yet ready.
healthAggregator.Report(healthName, &health.HealthReport{Live: true, Ready: false})
// Load locally-defined config, including the datastore connection
// parameters. First the environment variables.
configParams = config.New()
envConfig := config.LoadConfigFromEnvironment(os.Environ())
// Then, the config file.
log.Infof("Loading config file: %v", configFile)
fileConfig, err := config.LoadConfigFile(configFile)
在此循环内尝试检测并且调用handler的函数去定义http服务返回值是否正常
- 循环内初始health值为
healthAggregator.Report(healthName, &health.HealthReport{Live: true, Ready: false})
- 在循环内逻辑链接apiserver
datastoreConfig = configParams.DatastoreConfig()
- 第一个检测逻辑
for {
globalConfig, hostConfig, err := loadConfigFromDatastore(
ctx, backendClient, datastoreConfig, configParams.FelixHostname)
if err == ErrNotReady {
log.Warn("Waiting for datastore to be initialized (or migrated)")
time.Sleep(1 * time.Second)
healthAggregator.Report(healthName, &health.HealthReport{Live: true, Ready: true})
continue
} else if err != nil {
log.WithError(err).Error("Failed to get config from datastore")
time.Sleep(1 * time.Second)
continue configRetry
}
判断是否可以链接到集群datastore(APISERVER),如果链接正常,就返回健康检测true;如果有错误重新进入最开始的循环configRetry。
- 第二个到最后检测逻辑同上
Readiness
Readiness负责维护pod是否处于Ready状态:根据配置的参数检测后,如果返回异常,pod状态会变成NotReady,并且READY列显示为0/1。
Calico组件Readiness是怎么做的呢
检查bird状态是否正常:calico-node -bird-ready
检查felix状态是否正常:calico-node -felix-ready
具体干了什么呢
bird4/6
// checkBIRDReady decides whether BIRD for the given IP version ("4" or "6" in
// the callers shown here) should be reported as ready.
//
// The decision depends on how long ago /var/lib/calico/nodename was modified:
//   - with no BGP peers at all, BIRD is ready;
//   - while the file is younger than thresholdTime, every peer must be
//     Established and no graceful restart may be in progress;
//   - afterwards, a single Established peer is enough.
//
// It returns nil when ready, otherwise an error describing why not (including
// failures to stat the file or to query BIRD).
func checkBIRDReady(ipv string, thresholdTime time.Duration) error {
	// The modification time of the nodename file is the reference point for
	// the start-up window evaluated below.
	nodenameInfo, err := os.Stat("/var/lib/calico/nodename")
	if err != nil {
		return fmt.Errorf("Failed to stat() nodename file: %v", err)
	}

	peers, err := bird.GetPeers(ipv)
	log.Debugf("peers: %v", peers)
	if err != nil {
		return err
	}

	// Split the peers into a count of established sessions and the list of
	// peer IPs whose session is not established.
	pending := []string{}
	established := 0
	for _, p := range peers {
		if p.BGPState != "Established" {
			pending = append(pending, p.PeerIP)
			continue
		}
		established++
	}
	log.Infof("Number of node(s) with BGP peering established = %v", established)

	switch {
	case len(peers) == 0:
		// Nothing to peer with: treat BIRD as ready.
		log.Debugf("There are no bgp peers, returning ready.")

	case time.Since(nodenameInfo.ModTime()) < thresholdTime:
		// Shortly after start-up, insist on every peering being up so that a
		// rolling update does not proceed until BGP is back.
		if len(pending) > 0 {
			return fmt.Errorf("BGP not established with %+v", strings.Join(pending, ","))
		}
		// Also hold back readiness while a graceful restart is in progress.
		restarting, err := bird.GRInProgress(ipv)
		if err != nil {
			return err
		}
		if restarting {
			return errors.New("graceful restart in progress")
		}

	case established > 0:
		// Past the start-up window one working peering suffices, so that the
		// whole mesh does not flip to not-ready when some nodes go down.
		log.Debugf("There exist(s) %v calico node(s) with BGP peering established.", established)

	default:
		return fmt.Errorf("BGP not established with %+v", strings.Join(pending, ","))
	}
	return nil
}
- 检查/var/lib/calico/nodename是否存在,并读取其修改时间,用于判断是否仍处于thresholdTime的启动窗口内
- 通过使用/var/run/calico/bird(6).ctl sock文件建立unix链接,通过查询“show protocols”,看返回值是否有消息判断bird状态,查看状态是否是Established,有异常就notready.
Felix
通过查询 "http://" + felixHost + ":" + felixPort + "/readiness" 的返回值
if reporter.HasReadinessProblem() {
log.WithField("name", reporter.name).Warn("Reporter is not ready.")
summary.Ready = false
}
检测Ready存在问题,pod处于notready状态,原理同liveness。