datanode启动源码分析
结构体
// DataNode is the component-level wrapper around the DataNode gRPC server.
// Source location: cmd\components\data_node.go
type DataNode struct {
	// ctx is the context supplied when the wrapper is constructed.
	ctx context.Context

	// svr is the gRPC server this wrapper delegates to.
	svr *grpcdatanode.Server
}
// Server is the gRPC-facing shell that owns a DataNode component together
// with the clients and handles it needs during startup.
type Server struct {
	// datanode is the wrapped component; it is assigned in NewServer.
	datanode types.DataNodeComponent

	// gRPC serving state.
	wg          sync.WaitGroup
	grpcErrChan chan error
	grpcServer  *grpc.Server

	// ctx and cancel are the cancellable context pair created in NewServer.
	ctx    context.Context
	cancel context.CancelFunc

	// etcdCli is filled in by init once the etcd connection is established.
	etcdCli *clientv3.Client
	// factory is the dependency factory forwarded to the DataNode component.
	factory dependency.Factory

	serverID atomic.Int64

	rootCoord types.RootCoord
	dataCoord types.DataCoord

	// Client constructors used by init; init skips the matching coordinator
	// setup when one of them is nil.
	newRootCoordClient func(string, *clientv3.Client) (types.RootCoordClient, error)
	newDataCoordClient func(string, *clientv3.Client) (types.DataCoordClient, error)
}
Server结构体中的datanode字段是接口类型types.DataNodeComponent,由它提供DataNode的API功能。
// runDataNode adds one unit to wg and launches the DataNode component via
// runComponent, passing components.NewDataNode as the creator and
// metrics.RegisterDataNode as the metrics registration hook.
func (mr *MilvusRoles) runDataNode(ctx context.Context, localMsg bool, wg *sync.WaitGroup) component {
	wg.Add(1)

	dataNode := runComponent(ctx, localMsg, wg, components.NewDataNode, metrics.RegisterDataNode)
	return dataNode
}
// creator用NewDataNode替换
role, err = creator(ctx, factory)
components.NewDataNode是一个函数。
NewDataNode()用来创建DataNode结构体。
// NewDataNode builds a DataNode wrapper around a freshly created gRPC server.
// It returns a nil wrapper together with the error when grpcdatanode.NewServer
// fails.
func NewDataNode(ctx context.Context, factory dependency.Factory) (*DataNode, error) {
	grpcSvr, err := grpcdatanode.NewServer(ctx, factory)
	if err != nil {
		return nil, err
	}

	node := &DataNode{ctx: ctx, svr: grpcSvr}
	return node, nil
}
grpcdatanode.NewServer()产生的是本结构体Server。
// NewServer creates the DataNode gRPC server.
//
// A cancellable child of ctx is derived and stored on the server; the same
// child context is captured by the RootCoord/DataCoord client constructors
// and passed to dn.NewDataNode. The returned error is always nil.
func NewServer(ctx context.Context, factory dependency.Factory) (*Server, error) {
	serverCtx, cancelFn := context.WithCancel(ctx)

	makeRootCoordClient := func(etcdMetaRoot string, client *clientv3.Client) (types.RootCoordClient, error) {
		return rcc.NewClient(serverCtx, etcdMetaRoot, client)
	}
	makeDataCoordClient := func(etcdMetaRoot string, client *clientv3.Client) (types.DataCoordClient, error) {
		return dcc.NewClient(serverCtx, etcdMetaRoot, client)
	}

	srv := &Server{
		ctx:                serverCtx,
		cancel:             cancelFn,
		factory:            factory,
		grpcErrChan:        make(chan error),
		newRootCoordClient: makeRootCoordClient,
		newDataCoordClient: makeDataCoordClient,
	}
	srv.datanode = dn.NewDataNode(serverCtx, factory)

	return srv, nil
}
dn.NewDataNode()返回datanode.DataNode结构体,它是types.DataNodeComponent接口的一个实现。
执行Run()
Server结构体创建后,调用结构体的Run()方法。
// runComponent builds a component with creator and runs it on a new goroutine.
// NOTE(review): the tail of this function is elided ("......") in this excerpt,
// so how sign and metricRegister are consumed and what is returned is not shown here.
func runComponent[T component](ctx context.Context,
localMsg bool,
runWg *sync.WaitGroup,
creator func(context.Context, dependency.Factory) (T, error),
metricRegister func(*prometheus.Registry),
) component {
var role T
// sign is closed by the goroutine below once the component has been created.
sign := make(chan struct{})
go func() {
factory := dependency.NewFactory(localMsg)
var err error
role, err = creator(ctx, factory)
// NOTE(review): role.GetName() runs before err is checked below — confirm it is safe when creator fails.
if localMsg {
paramtable.SetRole(typeutil.StandaloneRole)
} else {
paramtable.SetRole(role.GetName())
}
if err != nil {
panic(err)
}
close(sign)
// Call Run() on the created component here; in this walkthrough that is the DataNode struct.
if err := role.Run(); err != nil {
panic(err)
}
// Reached only when Run() returned nil; a failure panics above instead.
runWg.Done()
}()
......
}
runComponent是一个包裹函数。
// Run starts the DataNode service by running the wrapped gRPC server.
// A failure is logged at error level and returned unchanged; on success a
// debug message is logged and nil is returned.
func (d *DataNode) Run() error {
	err := d.svr.Run()
	if err != nil {
		log.Error("DataNode starts error", zap.Error(err))
		return err
	}

	log.Debug("Datanode successfully started")
	return nil
}
Run()方法调用d.svr.Run()方法。svr是grpcdatanode.NewServer()返回的结构体。
进入Run()方法:
// Run brings up the DataNode gRPC service in two phases: init, then start.
// The first phase that fails aborts the sequence and its error is returned;
// per the original notes, callers upstream turn such errors into a panic.
func (s *Server) Run() error {
	initErr := s.init()
	if initErr != nil {
		return initErr
	}
	log.Info("DataNode gRPC services successfully initialized")

	startErr := s.start()
	if startErr != nil {
		return startErr
	}
	log.Info("DataNode gRPC services successfully started")

	return nil
}
接下来分析s.init()和s.start()方法。
s.init()
// init initializes Datanode's grpc service.
func (s *Server) init() error {
etcdConfig := ¶mtable.Get().EtcdCfg
Params := ¶mtable.Get().DataNodeGrpcServerCfg
ctx := context.Background()
if !funcutil.CheckPortAvailable(Params.Port.GetAsInt()) {
paramtable.Get().Save(Params.Port.Key, fmt.Sprintf("%d", funcutil.GetAvailablePort()))
log.Warn("DataNode found available port during init", zap.Int("port", Params.Port.GetAsInt()))
}
etcdCli, err := etcd.GetEtcdClient(
etcdConfig.UseEmbedEtcd.GetAsBool(),
etcdConfig.EtcdUseSSL.GetAsBool(),
etcdConfig.Endpoints.GetAsStrings(),
etcdConfig.EtcdTLSCert.GetValue(),
etcdConfig.EtcdTLSKey.GetValue(),
etcdConfig.EtcdTLSCACert.GetValue(),
etcdConfig.EtcdTLSMinVersion.GetValue())
if err != nil {
log.Error("failed to connect to etcd", zap.Error(err))
return err
}
s.etcdCli = etcdCli
s.SetEtcdClient(s.etcdCli)
s.datanode.SetAddress(Params.GetAddress())
log.Info("DataNode address", zap.String("address", Params.IP+":"+strconv.Itoa(Params.Port.GetAsInt())))
// 启动gprc服务,默认端口21124
err = s.startGrpc()
if err != nil {
return err
}
// --- RootCoord Client ---
if s.newRootCoordClient != nil {
log.Info("initializing RootCoord client for DataNode")
rootCoordClient, err := s.newRootCoordClient(dn.Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)
if err != nil {
log.Error("failed to create new RootCoord client", zap.Error(err))
panic(err)
}
if err = componentutil.WaitForComponentHealthy(ctx, rootCoordClient, "RootCoord", 1000000, time.Millisecond*200); err != nil {
log.Error("failed to wait for RootCoord client to be ready", zap.Error(err))
panic(err)
}
log.Info("RootCoord client is ready for DataNode")
if err = s.SetRootCoordInterface(rootCoordClient); err != nil {
panic(err)
}
}
// --- DataCoord Client ---
if s.newDataCoordClient != nil {
log.Debug("starting DataCoord client for DataNode")
dataCoordClient, err := s.newDataCoordClient(dn.Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)
if err != nil {
log.Error("failed to create new DataCoord client", zap.Error(err))
panic(err)
}
if err = componentutil.WaitForComponentInitOrHealthy(ctx, dataCoordClient, "DataCoord", 1000000, time.Millisecond*200); err != nil {
log.Error("failed to wait for DataCoord client to be ready", zap.Error(err))
panic(err)
}
log.Info("DataCoord client is ready for DataNode")
if err = s.SetDataCoordInterface(dataCoordClient); err != nil {
panic(err)
}
}
s.datanode.UpdateStateCode(commonpb.StateCode_Initializing)
if err := s.datanode.Init(); err != nil {
log.Error("failed to init DataNode server", zap.Error(err))
return err
}
log.Info("current DataNode state", zap.Any("state", s.datanode.GetStateCode()))
return nil
}
从这段代码可以看出,init()创建了etcdCli并赋值给s.etcdCli。
s.startGrpc()启动grpc端口服务。
最终调用s.datanode.Init()进行初始化,代码位置:internal\datanode\data_node.go
s.datanode是接口类型types.DataNodeComponent;DataNodeComponent内嵌DataNode接口,DataNode又内嵌Component接口。
// DataNodeComponent extends the DataNode interface with the accessors and
// setters used while wiring a DataNode into its gRPC server.
type DataNodeComponent interface {
	DataNode

	// State code accessors.
	UpdateStateCode(stateCode commonpb.StateCode)
	GetStateCode() commonpb.StateCode

	// Address accessors.
	SetAddress(address string)
	GetAddress() string

	// Dependency injection for the etcd client and coordinator clients.
	SetEtcdClient(etcdClient *clientv3.Client)
	SetRootCoordClient(rootCoord RootCoordClient) error
	SetDataCoordClient(dataCoord DataCoordClient) error
}
// DataNode combines the Component lifecycle methods with the
// datapb.DataNodeServer method set.
type DataNode interface {
	Component
	datapb.DataNodeServer
}
// Component is the interface all services implement.
// Every method reports failure through its error result.
type Component interface {
	// Lifecycle methods.
	Init() error
	Start() error
	Stop() error

	// Registration method.
	Register() error
}
接口套接口:
RootCoordComponent -> RootCoord -> Component
DataCoordComponent -> DataCoord -> Component
QueryCoordComponent -> QueryCoord -> Component
ProxyComponent -> Proxy -> Component
QueryNodeComponent -> QueryNode -> Component
IndexNodeComponent -> IndexNode -> Component
DataNodeComponent -> DataNode -> Component
各组件最终的Init()初始化代码路径:
internal\rootcoord\root_coord.go->Init()
internal\datacoord\server.go->Init()
internal\querycoordv2\server.go->Init()
internal\datanode\data_node.go->Init()
internal\indexnode\indexnode.go->Init()
internal\querynodev2\server.go->Init()
internal\proxy\proxy.go->Init()
回过头来继续datanode的init。
// Init performs the DataNode initialization steps inside node.initOnce.Do:
// session setup, coordinator broker, rate collector, message dispatcher
// client, ID allocator, and finally the factory initialization.
// It returns the error of the first step that failed, or nil.
// NOTE(review): initOnce is presumably a sync.Once; if so, a later call skips
// the closure and returns nil even when the first attempt failed — confirm.
func (node *DataNode) Init() error {
// initError carries a failure out of the closure passed to initOnce.Do.
var initError error
node.initOnce.Do(func() {
logutil.Logger(node.ctx).Info("DataNode server initializing",
zap.String("TimeTickChannelName", Params.CommonCfg.DataCoordTimeTick.GetValue()),
)
if err := node.initSession(); err != nil {
log.Error("DataNode server init session failed", zap.Error(err))
initError = err
return
}
// The broker is built from the coordinator clients already set on the node.
node.broker = broker.NewCoordBroker(node.rootCoord, node.dataCoord)
err := node.initRateCollector()
if err != nil {
log.Error("DataNode server init rateCollector failed", zap.Int64("node ID", paramtable.GetNodeID()), zap.Error(err))
initError = err
return
}
log.Info("DataNode server init rateCollector done", zap.Int64("node ID", paramtable.GetNodeID()))
node.dispClient = msgdispatcher.NewClient(node.factory, typeutil.DataNodeRole, paramtable.GetNodeID())
log.Info("DataNode server init dispatcher client done", zap.Int64("node ID", paramtable.GetNodeID()))
// The allocator is created with a background context rather than node.ctx.
alloc, err := allocator.New(context.Background(), node.rootCoord, paramtable.GetNodeID())
if err != nil {
log.Error("failed to create id allocator",
zap.Error(err),
zap.String("role", typeutil.DataNodeRole), zap.Int64("DataNode ID", paramtable.GetNodeID()))
initError = err
return
}
node.allocator = alloc
node.factory.Init(Params)
log.Info("DataNode server init succeeded",
zap.String("MsgChannelSubName", Params.CommonCfg.DataNodeSubName.GetValue()))
})
return initError
}
从代码可以看出初始化是在填充DataNode结构体。
s.start()
启动组件的逻辑。
// start starts datanode's grpc service: it starts the wrapped DataNode
// component and then registers it. The first failing step's error is
// returned; a registration failure is additionally logged at debug level.
func (s *Server) start() error {
	if startErr := s.datanode.Start(); startErr != nil {
		return startErr
	}

	if regErr := s.datanode.Register(); regErr != nil {
		log.Debug("failed to register to Etcd", zap.Error(regErr))
		return regErr
	}

	return nil
}
s.datanode的接口类型内嵌了Component接口,因此具有Init()、Start()、Stop()、Register()方法。
Register():向元数据etcd注册。
Start():用来启动组件。
// Start will update DataNode state to HEALTHY
// The body runs inside node.startOnce.Do: it starts the ID allocator, builds
// the etcd KV used for watching, creates the persistent-storage chunk manager,
// launches the background goroutines, and finally sets the state code to
// Healthy. It returns the first error encountered, or nil.
func (node *DataNode) Start() error {
// startErr carries a failure out of the closure passed to startOnce.Do.
var startErr error
node.startOnce.Do(func() {
if err := node.allocator.Start(); err != nil {
log.Error("failed to start id allocator", zap.Error(err), zap.String("role", typeutil.DataNodeRole))
startErr = err
return
}
log.Info("start id allocator done", zap.String("role", typeutil.DataNodeRole))
// connectEtcdFn wraps the existing etcd client in an etcd KV; it always returns nil.
connectEtcdFn := func() error {
etcdKV := etcdkv.NewEtcdKV(node.etcdCli, Params.EtcdCfg.MetaRootPath.GetValue())
node.watchKv = etcdKV
return nil
}
err := retry.Do(node.ctx, connectEtcdFn, retry.Attempts(ConnectEtcdMaxRetryTime))
if err != nil {
// NOTE(review): the error from retry.Do is replaced by a fixed message, so the cause is not reported.
startErr = errors.New("DataNode fail to connect etcd")
return
}
chunkManager, err := node.factory.NewPersistentStorageChunkManager(node.ctx)
if err != nil {
startErr = err
return
}
node.chunkManager = chunkManager
// stopWaiter is incremented before each of the goroutines below except
// compactionExecutor.start and timeTickSender.start.
node.stopWaiter.Add(1)
go node.BackGroundGC(node.clearSignal)
go node.compactionExecutor.start(node.ctx)
// The time tick sender is only created when DataNodeTimeTickByRPC is enabled.
if Params.DataNodeCfg.DataNodeTimeTickByRPC.GetAsBool() {
node.timeTickSender = newTimeTickSender(node.broker, node.session.ServerID)
go node.timeTickSender.start(node.ctx)
}
node.stopWaiter.Add(1)
// Start node watch node
go node.StartWatchChannels(node.ctx)
node.stopWaiter.Add(1)
go node.flowgraphManager.start(&node.stopWaiter)
node.UpdateStateCode(commonpb.StateCode_Healthy)
})
return startErr
}
node节点都没有standby,coord节点有standby。