querycoord启动源码分析
结构体
// Server is the grpc server of QueryCoord.
// It wraps the QueryCoord component together with the clients and the
// lifecycle state needed to serve it over grpc.
type Server struct {
// wg is incremented in init before the grpc loop goroutine is launched.
wg sync.WaitGroup
// loopCtx and loopCancel are produced by context.WithCancel in NewServer.
loopCtx context.Context
loopCancel context.CancelFunc
grpcServer *grpc.Server
serverID atomic.Int64
// grpcErrChan is read once by init after startGrpcLoop has been launched;
// a non-nil value aborts init.
grpcErrChan chan error
// queryCoord is an interface type; NewServer fills it with the value
// returned by qc.NewQueryCoord.
queryCoord types.QueryCoordComponent
// factory is the dependency factory passed to NewServer.
factory dependency.Factory
// etcdCli is assigned in init once the etcd client has been created.
etcdCli *clientv3.Client
tikvCli *txnkv.Client
// dataCoord and rootCoord are created in init when they are still nil.
dataCoord types.DataCoordClient
rootCoord types.RootCoordClient
}
分析变量dataCoord、rootCoord是何时赋予的值。
queryCoord是一个接口,实现queryCoord api功能。
// runQueryCoord registers one unit of work on wg and launches the QueryCoord
// component through runComponent, using components.NewQueryCoord as the
// creator and metrics.RegisterQueryCoord as the metrics registrar.
func (mr *MilvusRoles) runQueryCoord(ctx context.Context, localMsg bool, wg *sync.WaitGroup) component {
	wg.Add(1)
	queryCoord := runComponent(ctx, localMsg, wg, components.NewQueryCoord, metrics.RegisterQueryCoord)
	return queryCoord
}
// creator用NewQueryCoord替换
role, err = creator(ctx, factory)
components.NewQueryCoord是一个函数。
NewQueryCoord()用来创建QueryCoord结构体。
// NewQueryCoord creates a new QueryCoord.
// It builds the grpc server via grpcquerycoord.NewServer and wraps it,
// together with ctx, in a QueryCoord value. Any error from NewServer is
// returned unchanged with a nil QueryCoord.
func NewQueryCoord(ctx context.Context, factory dependency.Factory) (*QueryCoord, error) {
	grpcServer, err := grpcquerycoord.NewServer(ctx, factory)
	if err != nil {
		return nil, err
	}

	coord := &QueryCoord{ctx: ctx, svr: grpcServer}
	return coord, nil
}
grpcquerycoord.NewServer()产生的是本结构体Server。
进入NewServer:
// NewServer creates a new QueryCoord grpc server.
// A cancellable child of ctx is derived first; it is handed to
// qc.NewQueryCoord and kept on the Server as its loop context. If the
// component cannot be created the child context is cancelled and the error
// is returned.
func NewServer(ctx context.Context, factory dependency.Factory) (*Server, error) {
	loopCtx, loopCancel := context.WithCancel(ctx)

	coord, err := qc.NewQueryCoord(loopCtx)
	if err != nil {
		// Release the derived context before reporting the failure.
		loopCancel()
		return nil, err
	}

	server := &Server{
		queryCoord:  coord,
		loopCtx:     loopCtx,
		loopCancel:  loopCancel,
		factory:     factory,
		grpcErrChan: make(chan error),
	}
	return server, nil
}
qc.NewQueryCoord()返回一个结构体,是types.QueryCoordComponent接口的实现。
执行Run()
Server结构体创建后,调用结构体的Run()方法。
// runComponent creates a component with creator and runs it in a new
// goroutine. Part of the function body is elided in this excerpt ("......"),
// so only the goroutine shown here is documented.
//
// Inside the goroutine: a dependency factory is built from localMsg, creator
// is invoked, the process role is set, sign is closed, and the component's
// Run method is called. A creator or Run error panics. runWg.Done is called
// after Run returns without error.
func runComponent[T component](ctx context.Context,
localMsg bool,
runWg *sync.WaitGroup,
creator func(context.Context, dependency.Factory) (T, error),
metricRegister func(*prometheus.Registry),
) component {
var role T
// sign is closed once the component has been created successfully.
sign := make(chan struct{})
go func() {
factory := dependency.NewFactory(localMsg)
var err error
role, err = creator(ctx, factory)
// NOTE(review): err is only checked after role.GetName() may have been
// called below; confirm that creator never returns an unusable role
// together with a non-nil error.
if localMsg {
paramtable.SetRole(typeutil.StandaloneRole)
} else {
paramtable.SetRole(role.GetName())
}
if err != nil {
panic(err)
}
close(sign)
// Call the Run() method of the created component here; in this walkthrough
// the component is the QueryCoord struct.
if err := role.Run(); err != nil {
panic(err)
}
runWg.Done()
}()
......
}
runComponent是一个包裹函数。
// Run starts the service by running the wrapped grpc server.
// A failure is logged at error level and returned; success is logged at
// debug level.
func (qs *QueryCoord) Run() error {
	err := qs.svr.Run()
	if err != nil {
		log.Error("QueryCoord starts error", zap.Error(err))
		return err
	}

	log.Debug("QueryCoord successfully started")
	return nil
}
Run()方法调用qs.svr.Run()方法。svr是grpcquerycoord.NewServer()返回的Server结构体。
// Run initializes and starts QueryCoord's grpc service.
// It runs init followed by start and stops at the first error, logging a
// debug message after each phase completes.
func (s *Server) Run() error {
	err := s.init()
	if err != nil {
		return err
	}
	log.Debug("QueryCoord init done ...")

	err = s.start()
	if err != nil {
		return err
	}
	log.Debug("QueryCoord start done ...")

	return nil
}
接下来分析s.init()和s.start()方法。
s.init()
// init initializes QueryCoord's grpc service.
func (s *Server) init() error {
params := paramtable.Get()
etcdConfig := ¶ms.EtcdCfg
rpcParams := ¶ms.QueryCoordGrpcServerCfg
etcdCli, err := etcd.GetEtcdClient(
etcdConfig.UseEmbedEtcd.GetAsBool(),
etcdConfig.EtcdUseSSL.GetAsBool(),
etcdConfig.Endpoints.GetAsStrings(),
etcdConfig.EtcdTLSCert.GetValue(),
etcdConfig.EtcdTLSKey.GetValue(),
etcdConfig.EtcdTLSCACert.GetValue(),
etcdConfig.EtcdTLSMinVersion.GetValue())
if err != nil {
log.Debug("QueryCoord connect to etcd failed", zap.Error(err))
return err
}
s.etcdCli = etcdCli
s.SetEtcdClient(etcdCli)
s.queryCoord.SetAddress(rpcParams.GetAddress())
if params.MetaStoreCfg.MetaStoreType.GetValue() == util.MetaStoreTypeTiKV {
......
}
s.wg.Add(1)
// 启动grpc,默认为19531
go s.startGrpcLoop(rpcParams.Port.GetAsInt())
// wait for grpc server loop start
err = <-s.grpcErrChan
if err != nil {
return err
}
// --- Master Server Client ---
// 创建rootCoord客户端
if s.rootCoord == nil {
s.rootCoord, err = rcc.NewClient(s.loopCtx, qc.Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)
if err != nil {
log.Error("QueryCoord try to new RootCoord client failed", zap.Error(err))
panic(err)
}
}
// wait for master init or healthy
// 等待rootcoord服务正常
log.Debug("QueryCoord try to wait for RootCoord ready")
err = componentutil.WaitForComponentHealthy(s.loopCtx, s.rootCoord, "RootCoord", 1000000, time.Millisecond*200)
if err != nil {
log.Error("QueryCoord wait for RootCoord ready failed", zap.Error(err))
panic(err)
}
if err := s.SetRootCoord(s.rootCoord); err != nil {
panic(err)
}
log.Debug("QueryCoord report RootCoord ready")
// --- Data service client ---
// 创建dataCoord客户端
if s.dataCoord == nil {
s.dataCoord, err = dcc.NewClient(s.loopCtx, qc.Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)
if err != nil {
log.Error("QueryCoord try to new DataCoord client failed", zap.Error(err))
panic(err)
}
}
// 等待datacoord服务正常
log.Debug("QueryCoord try to wait for DataCoord ready")
err = componentutil.WaitForComponentHealthy(s.loopCtx, s.dataCoord, "DataCoord", 1000000, time.Millisecond*200)
if err != nil {
log.Error("QueryCoord wait for DataCoord ready failed", zap.Error(err))
panic(err)
}
if err := s.SetDataCoord(s.dataCoord); err != nil {
panic(err)
}
log.Debug("QueryCoord report DataCoord ready")
// 执行真正的初始化
if err := s.queryCoord.Init(); err != nil {
return err
}
return nil
}
这段可以看出来,创建了etcdCli并赋予给了s.etcdCli。
s.startGrpcLoop()启动grpc端口服务。
最终调用s.queryCoord.Init()进行初始化,代码位置:internal\querycoordv2\server.go
s.queryCoord是接口类型types.QueryCoordComponent,QueryCoordComponent内嵌QueryCoord接口,而QueryCoord又内嵌Component接口。
// QueryCoord is the interface `querycoord` package implements.
// It combines the Component lifecycle methods with the
// querypb.QueryCoordServer interface.
type QueryCoord interface {
Component
querypb.QueryCoordServer
}
// Component is the interface all services implement.
// In the QueryCoord grpc server shown in this file, Init is called at the end
// of the server's init, and Register is called before Start in start.
type Component interface {
Init() error
Start() error
Stop() error
Register() error
}
接口套接口:
RootCoordComponent -> RootCoord -> Component
DataCoordComponent -> DataCoord -> Component
QueryCoordComponent -> QueryCoord -> Component
ProxyComponent -> Proxy -> Component
QueryNodeComponent -> QueryNode -> Component
IndexNodeComponent -> IndexNode -> Component
DataNodeComponent -> DataNode -> Component
各组件最终的Init()初始化代码路径:
internal\rootcoord\root_coord.go->Init()
internal\datacoord\server.go->Init()
internal\querycoordv2\server.go->Init()
internal\datanode\data_node.go->Init()
internal\indexnode\indexnode.go->Init()
internal\querynodev2\server.go->Init()
internal\proxy\proxy.go->Init()
回过头来继续querycoord的init。
// Init logs the meta root path and address, initializes the session, and
// then delegates to initQueryCoord. The body of the enableActiveStandBy
// branch is elided in this excerpt ("......").
func (s *Server) Init() error {
log.Info("QueryCoord start init",
zap.String("meta-root-path", Params.EtcdCfg.MetaRootPath.GetValue()),
zap.String("address", s.address))
if err := s.initSession(); err != nil {
return err
}
if s.enableActiveStandBy {
......
}
// Perform the actual initialization.
return s.initQueryCoord()
}
继续进入s.initQueryCoord():
// initQueryCoord populates the Server with its working parts, in order: the
// meta KV store and ID allocator, the metrics cache manager, the node manager
// and meta, the cluster, the job and task schedulers, the dist controller,
// the balancers, the checker controller, the observers and the failed-load
// cache.
//
// It returns an error when the configured meta store type is neither TiKV nor
// etcd, when the ID allocator fails to initialize, or when initMeta fails.
func (s *Server) initQueryCoord() error {
s.UpdateStateCode(commonpb.StateCode_Initializing)
log.Info("QueryCoord", zap.Any("State", commonpb.StateCode_Initializing))
// Init KV and ID allocator
metaType := Params.MetaStoreCfg.MetaStoreType.GetValue()
var idAllocatorKV kv.TxnKV
log.Info(fmt.Sprintf("query coordinator connecting to %s.", metaType))
// Pick the KV backend from the configured meta store type; any other value
// is rejected.
if metaType == util.MetaStoreTypeTiKV {
s.kv = tikv.NewTiKV(s.tikvCli, Params.TiKVCfg.MetaRootPath.GetValue())
idAllocatorKV = tsoutil.NewTSOTiKVBase(s.tikvCli, Params.TiKVCfg.KvRootPath.GetValue(), "querycoord-id-allocator")
} else if metaType == util.MetaStoreTypeEtcd {
s.kv = etcdkv.NewEtcdKV(s.etcdCli, Params.EtcdCfg.MetaRootPath.GetValue())
idAllocatorKV = tsoutil.NewTSOKVBase(s.etcdCli, Params.EtcdCfg.KvRootPath.GetValue(), "querycoord-id-allocator")
} else {
return fmt.Errorf("not supported meta store: %s", metaType)
}
log.Info(fmt.Sprintf("query coordinator successfully connected to %s.", metaType))
idAllocator := allocator.NewGlobalIDAllocator("idTimestamp", idAllocatorKV)
err := idAllocator.Initialize()
if err != nil {
log.Error("query coordinator id allocator initialize failed", zap.Error(err))
return err
}
// Expose the allocator to the rest of the server as a plain function.
s.idAllocator = func() (int64, error) {
return idAllocator.AllocOne()
}
// Init metrics cache manager
s.metricsCacheManager = metricsinfo.NewMetricsCacheManager()
// Init meta
s.nodeMgr = session.NewNodeManager()
err = s.initMeta()
if err != nil {
return err
}
// Init session
log.Info("init session")
s.cluster = session.NewCluster(s.nodeMgr, s.queryNodeCreator)
// Init schedulers
log.Info("init schedulers")
s.jobScheduler = job.NewScheduler()
s.taskScheduler = task.NewScheduler(
s.ctx,
s.meta,
s.dist,
s.targetMgr,
s.broker,
s.cluster,
s.nodeMgr,
)
// Init heartbeat
log.Info("init dist controller")
s.distController = dist.NewDistController(
s.cluster,
s.nodeMgr,
s.dist,
s.targetMgr,
s.taskScheduler,
)
// Init balancer map and balancer
log.Info("init all available balancer")
s.balancerMap = make(map[string]balance.Balance)
s.balancerMap[balance.RoundRobinBalancerName] = balance.NewRoundRobinBalancer(s.taskScheduler, s.nodeMgr)
s.balancerMap[balance.RowCountBasedBalancerName] = balance.NewRowCountBasedBalancer(s.taskScheduler,
s.nodeMgr, s.dist, s.meta, s.targetMgr)
s.balancerMap[balance.ScoreBasedBalancerName] = balance.NewScoreBasedBalancer(s.taskScheduler,
s.nodeMgr, s.dist, s.meta, s.targetMgr)
// Use the balancer named in the configuration when it is registered in the
// map; otherwise fall back to the row-count based balancer.
if balancer, ok := s.balancerMap[params.Params.QueryCoordCfg.Balancer.GetValue()]; ok {
s.balancer = balancer
log.Info("use config balancer", zap.String("balancer", params.Params.QueryCoordCfg.Balancer.GetValue()))
} else {
s.balancer = s.balancerMap[balance.RowCountBasedBalancerName]
log.Info("use rowCountBased auto balancer")
}
// Init checker controller
log.Info("init checker controller")
s.checkerController = checkers.NewCheckerController(
s.meta,
s.dist,
s.targetMgr,
s.balancer,
s.nodeMgr,
s.taskScheduler,
s.broker,
)
// Init observers
s.initObserver()
// Init load status cache
meta.GlobalFailedLoadCache = meta.NewFailedLoadCache()
log.Info("QueryCoord init success")
// err is nil here: every earlier non-nil value has already been returned.
return err
}
从代码可以看出初始化是在填充querycoord结构体。
s.start()
启动组件的逻辑。
// start starts QueryCoord's grpc service.
// The component is registered first; only when that succeeds is it started,
// and the result of Start is returned.
func (s *Server) start() error {
	if err := s.queryCoord.Register(); err != nil {
		return err
	}
	return s.queryCoord.Start()
}
s.queryCoord的接口类型内嵌了Component接口,因此提供Init()、Start()、Stop()、Register()这四个方法。
Register():向元数据etcd注册。
Start():用来启动组件。
进入s.queryCoord.Start():
// Start launches QueryCoord through startQueryCoord unless active-standby is
// enabled, in which case it does nothing here and returns nil.
func (s *Server) Start() error {
	if s.enableActiveStandBy {
		return nil
	}

	if err := s.startQueryCoord(); err != nil {
		return err
	}
	log.Info("QueryCoord started")
	return nil
}
真正执行启动逻辑在s.startQueryCoord()。
// startQueryCoord brings the QueryCoord into service: it loads the existing
// QueryNode sessions, registers each node, starts the node-watching
// goroutines, syncs distribution, starts the server loop and finally marks
// the server healthy. It returns an error only when the sessions cannot be
// fetched.
func (s *Server) startQueryCoord() error {
log.Info("start watcher...")
// Fetch the current QueryNode sessions together with the revision that is
// later handed to watchNodes.
sessions, revision, err := s.session.GetSessions(typeutil.QueryNodeRole)
if err != nil {
return err
}
// Register every known node with the node manager and the task scheduler,
// and flag the ones that are already stopping.
for _, node := range sessions {
s.nodeMgr.Add(session.NewNodeInfo(node.ServerID, node.Address))
s.taskScheduler.AddExecutor(node.ServerID)
if node.Stopping {
s.nodeMgr.Stopping(node.ServerID)
}
}
s.checkReplicas()
// Run the node-up handling for each session only after all nodes have been
// registered and replicas have been checked.
for _, node := range sessions {
s.handleNodeUp(node.ServerID)
}
// One wait-group count per goroutine launched below.
s.wg.Add(2)
go s.handleNodeUpLoop()
go s.watchNodes(revision)
// Recover dist, to avoid generate too much task when dist not ready after restart
s.distController.SyncAll(s.ctx)
s.startServerLoop()
s.afterStart()
s.UpdateStateCode(commonpb.StateCode_Healthy)
sessionutil.SaveServerInfo(typeutil.QueryCoordRole, s.session.ServerID)
return nil
}
要详细知道启动querycoord组件做了什么事情,研究这个函数。