milvus datanode启动源码分析

datanode启动源码分析

结构体

// DataNode implements DataNode grpc server
// Source location: cmd\components\data_node.go
//
// It is a thin wrapper: Run() delegates to svr.Run().
type DataNode struct {
	// ctx is the context that was passed to NewDataNode.
	ctx context.Context
	// svr is the grpc server wrapper returned by grpcdatanode.NewServer.
	svr *grpcdatanode.Server
}

// Server is the DataNode grpc server wrapper created by NewServer.
// It owns the inner datanode component and the clients it depends on.
type Server struct {
	// datanode is the wrapped component; NewServer sets it from dn.NewDataNode,
	// and init()/start() call its Init, Start and Register methods.
	datanode    types.DataNodeComponent
	wg          sync.WaitGroup
	// grpcErrChan is created unbuffered in NewServer.
	// NOTE(review): presumably carries startGrpc errors — its use is not visible here.
	grpcErrChan chan error
	grpcServer  *grpc.Server
	// ctx and cancel come from context.WithCancel in NewServer.
	ctx         context.Context
	cancel      context.CancelFunc
	// etcdCli is assigned in init() from etcd.GetEtcdClient.
	etcdCli     *clientv3.Client
	// factory is the dependency factory passed to NewServer and forwarded to dn.NewDataNode.
	factory     dependency.Factory

	serverID atomic.Int64

	rootCoord types.RootCoord
	dataCoord types.DataCoord

	// Constructor hooks for the coordinator clients. NewServer installs the
	// defaults (rcc.NewClient / dcc.NewClient); init() skips the corresponding
	// client setup when a hook is nil.
	newRootCoordClient func(string, *clientv3.Client) (types.RootCoordClient, error)
	newDataCoordClient func(string, *clientv3.Client) (types.DataCoordClient, error)
}

Server结构体中的datanode字段是一个接口(types.DataNodeComponent),实现datanode api功能。

// runDataNode registers one pending component on wg and launches the DataNode
// through runComponent, using components.NewDataNode as the creator and
// metrics.RegisterDataNode as the metrics registration hook.
func (mr *MilvusRoles) runDataNode(ctx context.Context, localMsg bool, wg *sync.WaitGroup) component {
	wg.Add(1)
	creator, registerMetrics := components.NewDataNode, metrics.RegisterDataNode
	return runComponent(ctx, localMsg, wg, creator, registerMetrics)
}

// In this call path creator is components.NewDataNode.
role, err = creator(ctx, factory)

components.NewDataNode是一个函数。

NewDataNode()用来创建DataNode结构体。

// NewDataNode creates a new DataNode.
// It builds the grpc server via grpcdatanode.NewServer and wraps it together
// with ctx; a server construction error is returned unchanged.
func NewDataNode(ctx context.Context, factory dependency.Factory) (*DataNode, error) {
	grpcServer, err := grpcdatanode.NewServer(ctx, factory)
	if err != nil {
		return nil, err
	}

	node := &DataNode{ctx: ctx, svr: grpcServer}
	return node, nil
}

grpcdatanode.NewServer()产生的是本结构体Server。

// NewServer creates the DataNode grpc server.
// It derives a cancellable context from ctx, installs the default RootCoord and
// DataCoord client constructors, and creates the inner datanode component with
// the derived context and the given factory. The returned error is always nil.
func NewServer(ctx context.Context, factory dependency.Factory) (*Server, error) {
	serverCtx, cancelFn := context.WithCancel(ctx)

	// Default client constructors; both capture the server's cancellable context.
	rootCoordCtor := func(etcdMetaRoot string, client *clientv3.Client) (types.RootCoordClient, error) {
		return rcc.NewClient(serverCtx, etcdMetaRoot, client)
	}
	dataCoordCtor := func(etcdMetaRoot string, client *clientv3.Client) (types.DataCoordClient, error) {
		return dcc.NewClient(serverCtx, etcdMetaRoot, client)
	}

	srv := &Server{
		ctx:                serverCtx,
		cancel:             cancelFn,
		factory:            factory,
		grpcErrChan:        make(chan error),
		newRootCoordClient: rootCoordCtor,
		newDataCoordClient: dataCoordCtor,
	}
	srv.datanode = dn.NewDataNode(serverCtx, factory)

	return srv, nil
}

dn.NewDataNode()返回datanode.DataNode结构体,它是types.DataNodeComponent接口的一个实现。

执行Run()

DataNode结构体(内部持有Server)创建后,runComponent会调用它的Run()方法。

// runComponent creates a component with creator and runs it in a background
// goroutine.
//
// ctx is handed to creator. localMsg selects the factory
// (dependency.NewFactory(localMsg)) and, when true, sets the process role to
// typeutil.StandaloneRole; otherwise the role is taken from role.GetName().
// runWg.Done() is called after the component's Run() returns without error.
// An error from creator or from Run() is escalated with panic.
func runComponent[T component](ctx context.Context,
	localMsg bool,
	runWg *sync.WaitGroup,
	creator func(context.Context, dependency.Factory) (T, error),
	metricRegister func(*prometheus.Registry),
) component {
	var role T

	sign := make(chan struct{})
	go func() {
		factory := dependency.NewFactory(localMsg)
		var err error
		role, err = creator(ctx, factory)
		// Check the creation error before touching role: on failure role may be
		// the zero value of T (e.g. a nil pointer), and calling GetName() on it
		// could crash and hide the real error.
		if err != nil {
			panic(err)
		}
		if localMsg {
			paramtable.SetRole(typeutil.StandaloneRole)
		} else {
			paramtable.SetRole(role.GetName())
		}
		close(sign)
		// Call Run() on the concrete component here (the DataNode struct in this walkthrough).
		if err := role.Run(); err != nil {
			panic(err)
		}
		runWg.Done()
	}()
    ......
}

runComponent是一个包裹函数。

// Run starts the service by running the wrapped grpc server.
// A failure is logged and returned to the caller; success is logged at debug level.
func (d *DataNode) Run() error {
	err := d.svr.Run()
	if err == nil {
		log.Debug("Datanode successfully started")
		return nil
	}

	log.Error("DataNode starts error", zap.Error(err))
	return err
}

Run()方法调用d.svr.Run()方法。svr是grpcdatanode.NewServer()返回的结构体。

进入Run()方法:

// Run initializes and then starts Datanode's grpc service.
// The first failing phase aborts the sequence and its error is returned;
// callers upstream turn such errors into a panic.
func (s *Server) Run() error {
	initErr := s.init()
	if initErr != nil {
		return initErr
	}
	log.Info("DataNode gRPC services successfully initialized")

	startErr := s.start()
	if startErr != nil {
		return startErr
	}
	log.Info("DataNode gRPC services successfully started")

	return nil
}

接下来分析s.init()和s.start()方法。

s.init()

// init initializes Datanode's grpc service.
//
// In order, it: picks a usable port, connects to etcd, starts the grpc
// service, creates the RootCoord and DataCoord clients (when the constructor
// hooks are set) and waits for them, then marks the datanode as Initializing
// and calls its Init().
//
// Errors from the etcd connection, startGrpc and datanode.Init are returned;
// failures while creating or waiting for the coordinator clients panic instead.
func (s *Server) init() error {
	etcdConfig := &paramtable.Get().EtcdCfg
	Params := &paramtable.Get().DataNodeGrpcServerCfg
	// Background context used for the readiness waits below; it is not derived from s.ctx.
	ctx := context.Background()
	// If the configured port is not available, save an available one back into the param table.
	if !funcutil.CheckPortAvailable(Params.Port.GetAsInt()) {
		paramtable.Get().Save(Params.Port.Key, fmt.Sprintf("%d", funcutil.GetAvailablePort()))
		log.Warn("DataNode found available port during init", zap.Int("port", Params.Port.GetAsInt()))
	}

	etcdCli, err := etcd.GetEtcdClient(
		etcdConfig.UseEmbedEtcd.GetAsBool(),
		etcdConfig.EtcdUseSSL.GetAsBool(),
		etcdConfig.Endpoints.GetAsStrings(),
		etcdConfig.EtcdTLSCert.GetValue(),
		etcdConfig.EtcdTLSKey.GetValue(),
		etcdConfig.EtcdTLSCACert.GetValue(),
		etcdConfig.EtcdTLSMinVersion.GetValue())
	if err != nil {
		log.Error("failed to connect to etcd", zap.Error(err))
		return err
	}
	// Keep the client on the server and hand the same client to SetEtcdClient.
	s.etcdCli = etcdCli
	s.SetEtcdClient(s.etcdCli)
	s.datanode.SetAddress(Params.GetAddress())
	log.Info("DataNode address", zap.String("address", Params.IP+":"+strconv.Itoa(Params.Port.GetAsInt())))
	// Start the grpc service (the original note gives 21124 as the default port).
	err = s.startGrpc()
	if err != nil {
		return err
	}

	// --- RootCoord Client ---
	if s.newRootCoordClient != nil {
		log.Info("initializing RootCoord client for DataNode")
		rootCoordClient, err := s.newRootCoordClient(dn.Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)
		if err != nil {
			log.Error("failed to create new RootCoord client", zap.Error(err))
			panic(err)
		}

		// NOTE(review): 1000000 and 200ms are presumably the attempt count and retry interval — confirm against componentutil.
		if err = componentutil.WaitForComponentHealthy(ctx, rootCoordClient, "RootCoord", 1000000, time.Millisecond*200); err != nil {
			log.Error("failed to wait for RootCoord client to be ready", zap.Error(err))
			panic(err)
		}
		log.Info("RootCoord client is ready for DataNode")
		if err = s.SetRootCoordInterface(rootCoordClient); err != nil {
			panic(err)
		}
	}

	// --- DataCoord Client ---
	if s.newDataCoordClient != nil {
		log.Debug("starting DataCoord client for DataNode")
		dataCoordClient, err := s.newDataCoordClient(dn.Params.EtcdCfg.MetaRootPath.GetValue(), s.etcdCli)
		if err != nil {
			log.Error("failed to create new DataCoord client", zap.Error(err))
			panic(err)
		}

		// Unlike RootCoord above, this uses WaitForComponentInitOrHealthy.
		if err = componentutil.WaitForComponentInitOrHealthy(ctx, dataCoordClient, "DataCoord", 1000000, time.Millisecond*200); err != nil {
			log.Error("failed to wait for DataCoord client to be ready", zap.Error(err))
			panic(err)
		}
		log.Info("DataCoord client is ready for DataNode")
		if err = s.SetDataCoordInterface(dataCoordClient); err != nil {
			panic(err)
		}
	}

	s.datanode.UpdateStateCode(commonpb.StateCode_Initializing)

	// Delegate to the inner datanode's own initialization.
	if err := s.datanode.Init(); err != nil {
		log.Error("failed to init DataNode server", zap.Error(err))
		return err
	}
	log.Info("current DataNode state", zap.Any("state", s.datanode.GetStateCode()))
	return nil
}

这段可以看出来,创建了etcdCli并赋值给了s.etcdCli。

s.startGrpc()启动grpc端口服务。

最终调用s.datanode.Init()进行初始化,代码位置:internal\datanode\data_node.go

s.datanode是接口类型types.DataNodeComponent。DataNodeComponent内嵌DataNode接口,DataNode又内嵌Component接口(Go没有继承,这里是接口嵌套)。

// DataNodeComponent is the interface type of the grpc Server's datanode field.
// It embeds DataNode and adds state-code, address and client setters/getters.
type DataNodeComponent interface {
    DataNode
    UpdateStateCode(stateCode commonpb.StateCode)
    GetStateCode() commonpb.StateCode
    SetAddress(address string)
    GetAddress() string
    SetEtcdClient(etcdClient *clientv3.Client)
    SetRootCoordClient(rootCoord RootCoordClient) error
    SetDataCoordClient(dataCoord DataCoordClient) error
}

// DataNode combines the Component lifecycle methods with the
// datapb.DataNodeServer interface.
type DataNode interface {
    Component
    datapb.DataNodeServer
}

// Component is the interface all services implement.
// For the DataNode, Init is called from the grpc server's init(), and Start
// followed by Register from its start().
type Component interface {
	Init() error
	Start() error
	Stop() error
	Register() error
}

接口套接口:

RootCoordComponent -> RootCoord -> Component
DataCoordComponent -> DataCoord -> Component
QueryCoordComponent -> QueryCoord -> Component
ProxyComponent -> Proxy -> Component
QueryNodeComponent -> QueryNode -> Component
IndexNodeComponent -> IndexNode -> Component
DataNodeComponent -> DataNode -> Component

各组件最终的Init()初始化代码路径:

internal\rootcoord\root_coord.go->Init()
internal\datacoord\server.go->Init()
internal\querycoordv2\server.go->Init()
internal\datanode\data_node.go->Init()
internal\indexnode\indexnode.go->Init()
internal\querynodev2\server.go->Init()
internal\proxy\proxy.go->Init()

回过头来继续datanode的init。

// Init initializes the DataNode inside node.initOnce.Do.
//
// In order, it: initializes the session, builds the coordinator broker from
// node.rootCoord and node.dataCoord, initializes the rate collector, creates
// the message dispatcher client, creates the id allocator, and initializes the
// factory with Params. The first failing step is logged and its error returned.
//
// NOTE(review): initError is a local variable, so if initOnce is a sync.Once,
// a second call after a failed first call would return nil — confirm.
func (node *DataNode) Init() error {
	var initError error
	node.initOnce.Do(func() {
		logutil.Logger(node.ctx).Info("DataNode server initializing",
			zap.String("TimeTickChannelName", Params.CommonCfg.DataCoordTimeTick.GetValue()),
		)
		if err := node.initSession(); err != nil {
			log.Error("DataNode server init session failed", zap.Error(err))
			initError = err
			return
		}

		// The broker wraps both coordinator clients.
		node.broker = broker.NewCoordBroker(node.rootCoord, node.dataCoord)

		err := node.initRateCollector()
		if err != nil {
			log.Error("DataNode server init rateCollector failed", zap.Int64("node ID", paramtable.GetNodeID()), zap.Error(err))
			initError = err
			return
		}
		log.Info("DataNode server init rateCollector done", zap.Int64("node ID", paramtable.GetNodeID()))

		node.dispClient = msgdispatcher.NewClient(node.factory, typeutil.DataNodeRole, paramtable.GetNodeID())
		log.Info("DataNode server init dispatcher client done", zap.Int64("node ID", paramtable.GetNodeID()))

		// The allocator is created here with a background context; Start() is what starts it.
		alloc, err := allocator.New(context.Background(), node.rootCoord, paramtable.GetNodeID())
		if err != nil {
			log.Error("failed to create id allocator",
				zap.Error(err),
				zap.String("role", typeutil.DataNodeRole), zap.Int64("DataNode ID", paramtable.GetNodeID()))
			initError = err
			return
		}
		node.allocator = alloc

		node.factory.Init(Params)
		log.Info("DataNode server init succeeded",
			zap.String("MsgChannelSubName", Params.CommonCfg.DataNodeSubName.GetValue()))
	})
	return initError
}

从代码可以看出初始化是在填充DataNode结构体。

s.start()

启动组件的逻辑。

// start starts datanode's grpc service.
//
// It starts the wrapped datanode component and then registers it. The first
// error encountered is returned and the remaining step is skipped.
func (s *Server) start() error {
	if err := s.datanode.Start(); err != nil {
		return err
	}
	if err := s.datanode.Register(); err != nil {
		// A failed registration aborts startup, so log it at error level like
		// the other failure paths rather than at debug level.
		log.Error("failed to register to Etcd", zap.Error(err))
		return err
	}
	return nil
}

s.datanode的接口类型内嵌了Component接口,因此具有Init()、Start()、Stop()、Register()方法。

Register():向元数据etcd注册。

Start():用来启动组件。

// Start will update DataNode state to HEALTHY
//
// The work runs inside node.startOnce.Do. In order, it: starts the id
// allocator, builds the etcd KV used as node.watchKv, creates the persistent
// storage chunk manager, launches the background goroutines (GC, compaction
// executor, optional time tick sender, channel watcher, flowgraph manager),
// and finally sets the state code to Healthy. The first failing step stops the
// sequence and its error is returned.
func (node *DataNode) Start() error {
	var startErr error
	node.startOnce.Do(func() {
		if err := node.allocator.Start(); err != nil {
			log.Error("failed to start id allocator", zap.Error(err), zap.String("role", typeutil.DataNodeRole))
			startErr = err
			return
		}
		log.Info("start id allocator done", zap.String("role", typeutil.DataNodeRole))

		// NOTE(review): connectEtcdFn always returns nil, so as written it never
		// gives retry.Do a failure to retry on.
		connectEtcdFn := func() error {
			etcdKV := etcdkv.NewEtcdKV(node.etcdCli, Params.EtcdCfg.MetaRootPath.GetValue())
			node.watchKv = etcdKV
			return nil
		}
		err := retry.Do(node.ctx, connectEtcdFn, retry.Attempts(ConnectEtcdMaxRetryTime))
		if err != nil {
			// The underlying err is replaced by a fixed message here.
			startErr = errors.New("DataNode fail to connect etcd")
			return
		}

		chunkManager, err := node.factory.NewPersistentStorageChunkManager(node.ctx)
		if err != nil {
			startErr = err
			return
		}

		node.chunkManager = chunkManager

		// stopWaiter is incremented before each goroutine below that is tracked by it.
		// NOTE(review): presumably each such goroutine calls Done on exit — confirm.
		node.stopWaiter.Add(1)
		go node.BackGroundGC(node.clearSignal)

		go node.compactionExecutor.start(node.ctx)

		// The time tick sender is only started when time ticks are configured to go over RPC.
		if Params.DataNodeCfg.DataNodeTimeTickByRPC.GetAsBool() {
			node.timeTickSender = newTimeTickSender(node.broker, node.session.ServerID)
			go node.timeTickSender.start(node.ctx)
		}

		node.stopWaiter.Add(1)
		// Start watching channels in the background.
		go node.StartWatchChannels(node.ctx)

		node.stopWaiter.Add(1)
		go node.flowgraphManager.start(&node.stopWaiter)

		node.UpdateStateCode(commonpb.StateCode_Healthy)
	})
	return startErr
}

node节点都没有standby,coord节点有standby。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

shulu

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值