etcd源码raft

raft算法细节部分的文档还未整理好,之后找时间再更新。

创建etcdserver

创建一个etcdserver的实例
etcdserver.NewServer
启动服务
e.Server.Start()

etcd/embed/etcd.go

func StartEtcd(inCfg *Config) (e *Etcd, err error) {
	…省略其它代码
    if e.Server, err = etcdserver.NewServer(srvcfg); err != nil {
        return e, err
    }

    …省略其它代码
    e.Server.Start()
	…省略其它代码
}

创建节点,初始化节点信息,初始化http服务

创建节点startNode
初始化http服务Transport
并且添加其它节点
etcd/etcdserver/server.go

func NewServer(cfg ServerConfig) (srv *EtcdServer, err error) {
	...省略其它代码
	id, n, s, w = startNode(cfg, cl, nil)
	...省略其它代码
	srv = &EtcdServer{
        readych:     make(chan struct{}),
        Cfg:         cfg,
        lgMu:        new(sync.RWMutex),
        lg:          cfg.Logger,
        errorc:      make(chan error, 1),
        v2store:     st,
        snapshotter: ss,

	//创建raftNode
        r: *newRaftNode(
            raftNodeConfig{
                lg:          cfg.Logger,
                isIDRemoved: func(id uint64) bool { return cl.IsIDRemoved(types.ID(id)) },
                Node:        n,
                heartbeat:   heartbeat,
                raftStorage: s,
                storage:     NewStorage(w, ss),
            },
        ),
        id:               id,
        attributes:       membership.Attributes{Name: cfg.Name, ClientURLs: cfg.ClientURLs.StringSlice()},
        cluster:          cl,
        stats:            sstats,
        lstats:           lstats,
        SyncTicker:       time.NewTicker(500 * time.Millisecond),
        peerRt:           prt,
        reqIDGen:         idutil.NewGenerator(uint16(id), time.Now()),
        forceVersionC:    make(chan struct{}),
        AccessController: &AccessController{CORS: cfg.CORS, HostWhitelist: cfg.HostWhitelist},
    }

	…省略其它代码

// TODO: move transport initialization near the definition of remote
    tr := &rafthttp.Transport{
        Logger:      cfg.Logger,
        TLSInfo:     cfg.PeerTLSInfo,
        DialTimeout: cfg.peerDialTimeout(),
        ID:          id,    //当前节点自己的ID
        URLs:        cfg.PeerURLs,  //当前节点与集群中其他节点交互时使用的URL地址
        ClusterID:   cl.ID(),   //当前节点所在的集群的ID
        Raft:        srv,   //raft状态机
        Snapshotter: ss,    //负责管理快照文件
        ServerStats: sstats,    //用于统计一般的transportation统计
        LeaderStats: lstats,    //raft协议中的leader节点统计followers节点的 transportation 状态
        ErrorC:      srv.errorc,
    }
    if err = tr.Start(); err != nil {
        return nil, err
    }
    // add all remotes into transport
    for _, m := range remotes {
        if m.ID != id {
            tr.AddRemote(m.ID, m.PeerURLs)
        }
    }
    for _, m := range cl.Members() {
        if m.ID != id {
            tr.AddPeer(m.ID, m.PeerURLs)
        }
    }
    srv.r.transport = tr

    return srv, nil


}

根据配置启动一个node

etcd/etcdserver/raft.go

func startNode(cfg ServerConfig, cl *membership.RaftCluster, ids []types.ID) (id types.ID, n raft.Node, s *raft.MemoryStorage, w *wal.WAL) {
    if len(peers) == 0 {
        n = raft.RestartNode(c)
    } else {
        n = raft.StartNode(c, peers)
    }
}

node提供了一个基础对外接口,并且同时启一个goroutine 处理各个状态机之间通信
etcd/raft/node.go

// StartNode builds a Node from the given configuration and initial peer list
// and launches the node's event loop in its own goroutine.
//
// It panics when peers is empty (the panic message points callers at
// RestartNode for that case) or when NewRawNode returns an error.
func StartNode(c *Config, peers []Peer) Node {
    if len(peers) == 0 {
        panic("no peers given; use RestartNode instead")
    }

    rawNode, err := NewRawNode(c)
    if err != nil {
        panic(err)
    }
    // Bootstrap with the initial peers before the event loop starts.
    rawNode.Bootstrap(peers)

    nd := newNode(rawNode)
    go nd.run()
    return &nd
}


// run is the node's event loop. It multiplexes proposals, messages received
// from peers, configuration changes, ticks, Ready hand-off, Advance
// notifications, status queries and the stop signal over a single select.
//
// propc, readyc and advancec are local copies of the node's channels that are
// set to nil to disable the corresponding select case (a nil channel never
// becomes ready in a select).
func (n *node) run() {
    var propc chan msgWithResult
    var readyc chan Ready
    var advancec chan struct{}
    var rd Ready

    r := n.rn.raft

    lead := None

    for {
        if advancec != nil {
            // A Ready has been accepted but not advanced yet: do not offer
            // another one until Advance arrives.
            readyc = nil
        } else if n.rn.HasReady() { // there is something to hand to the application
            rd = n.rn.readyWithoutAccept() // obtain the Ready that will be offered on readyc
            readyc = n.readyc
        }

        // Track leader changes; proposals are only accepted while a leader is known.
        if lead != r.lead {
            if r.hasLeader() {
                if lead == None {
                    r.logger.Infof("raft.node: %x elected leader %x at term %d", r.id, r.lead, r.Term)
                } else {
                    r.logger.Infof("raft.node: %x changed leader from %x to %x at term %d", r.id, lead, r.lead, r.Term)
                }
                propc = n.propc
            } else {
                r.logger.Infof("raft.node: %x lost leader %x at term %d", r.id, lead, r.Term)
                propc = nil
            }
            lead = r.lead
        }

        select {
        case pm := <-propc:
            // A proposal submitted to this node: stamp it with the local id,
            // step it, and report Step's error to the submitter if it asked
            // for a result.
            m := pm.m
            m.From = r.id
            err := r.Step(m)
            if pm.result != nil {
                pm.result <- err
                close(pm.result)
            }
        case m := <-n.recvc:
            // A message delivered on recvc. It is stepped unless it is a
            // response message whose sender has no Progress entry.
            if pr := r.prs.Progress[m.From]; pr != nil || !IsResponseMsg(m.Type) {
                r.Step(m)
            }
        case cc := <-n.confc:
            // A configuration change. Remember whether this node was tracked
            // before applying it so a self-removal can be detected afterwards.
            _, okBefore := r.prs.Progress[r.id]
            cs := r.applyConfChange(cc)
            if _, okAfter := r.prs.Progress[r.id]; okBefore && !okAfter {
                var found bool
                for _, sl := range [][]uint64{cs.Voters, cs.VotersOutgoing} {
                    for _, id := range sl {
                        if id == r.id {
                            found = true
                        }
                    }
                }
                // This node is no longer listed as a voter (incoming or
                // outgoing): stop accepting proposals.
                if !found {
                    propc = nil
                }
            }
            // Hand the resulting ConfState back unless the node is shutting down.
            select {
            case n.confstatec <- cs:
            case <-n.done:
            }
        case <-n.tickc:
            // A tick arrived; forward it to the RawNode.
            // NOTE(review): presumably this drives the election/heartbeat timers - confirm.
            n.rn.Tick()
        case readyc <- rd:
            // The application took the Ready: mark it accepted and wait for Advance.
            n.rn.acceptReady(rd)
            advancec = n.advancec
        case <-advancec:
            // The application finished with the last Ready: advance and reset.
            n.rn.Advance(rd)
            rd = Ready{}
            advancec = nil
        case c := <-n.status:
            // Status query: reply on the channel supplied by the caller.
            c <- getStatus(r)
        case <-n.stop:
            // Stop requested: signal completion through done and exit the loop.
            close(n.done)
            return
        }
    }
}
//触发时钟事件
func (n *node) Tick() {
	…略
}

//TODO 为外界提供了日志提交接口 Propose,客户端写请求的消息类型为 pb.MsgProp
//阻塞等待该用户请求被RAFT状态机接受
func (n *node) Propose(ctx context.Context, data []byte) error {
    …略
}


状态传输服务

  1. Transport 结构体,主要是封装了各个节点之间状态传输
  2. 创建多路复用器 ServeMux
  3. 创建pipelineHandler、streamHandler 、snapHandler 三个实例,这三个实例都实现了Handler接口

etcd/etcdserver/api/rafthttp/transport.go

// Transport bundles the configuration and per-member state used to exchange
// raft messages with the other members of the cluster over HTTP.
type Transport struct {
    Logger *zap.Logger
    DialTimeout time.Duration // maximum duration before timing out dial of the request
    DialRetryFrequency rate.Limit // rate handed to rate.NewLimiter for each peer's stream readers
    TLSInfo transport.TLSInfo // TLS information used when creating connection
    ID          types.ID           // local member ID
    URLs        types.URLs         // local peer URLs, i.e. the addresses this member uses to talk to other members
    ClusterID   types.ID           // raft cluster ID for request validation
    Raft        Raft               // raft state machine, to which the Transport forwards received messages and reports status
    Snapshotter *snap.Snapshotter  // manages snapshot files
    ServerStats *stats.ServerStats // used to record general transportation statistics
    LeaderStats *stats.LeaderStats // used by the leader to record transportation statistics about its followers
    ErrorC chan error

    streamRt   http.RoundTripper // roundTripper used by streams (long-lived HTTP connections)
    pipelineRt http.RoundTripper // roundTripper used by pipelines; per the original note, meant for large, infrequent messages such as MsgSnap

    mu      sync.RWMutex         // protect the remote and peer map
    // A remote only wraps a pipeline; per the original note it is mainly used
    // to send snapshot data so that a newly joined member can catch up quickly.
    remotes map[types.ID]*remote // remotes map that helps newly joined member to catch up
    // Peer is the local representation of another cluster member: every other
    // member has one Peer instance on this node, and peers maps the member ID
    // to that instance.
    peers   map[types.ID]Peer    // peers map
    // Probers used to check whether the pipeline and stream channels are usable.
    pipelineProber probing.Prober
    streamProber   probing.Prober
}


//TODO 启动HTTP服务
func (t *Transport) Start() error {
    …省略其它代码
}

// Handler returns the HTTP handler that serves this Transport's endpoints.
//
// It creates the pipeline, stream and snapshot handlers and registers them,
// together with the probing handler, on a fresh http.ServeMux under their
// respective URL prefixes.
func (t *Transport) Handler() http.Handler {
    pipelineH := newPipelineHandler(t, t.Raft, t.ClusterID)
    streamH := newStreamHandler(t, t, t.Raft, t.ID, t.ClusterID)
    snapshotH := newSnapshotHandler(t, t.Raft, t.Snapshotter, t.ClusterID)

    // The mux keeps the mapping from URL pattern to handler.
    router := http.NewServeMux()
    routes := []struct {
        pattern string
        handler http.Handler
    }{
        {RaftPrefix, pipelineH},
        {RaftStreamPrefix + "/", streamH},
        {RaftSnapshotPrefix, snapshotH},
        {ProbingPrefix, probing.NewHandler()},
    }
    for _, route := range routes {
        router.Handle(route.pattern, route.handler)
    }
    return router
}


…省略其它代码


// AddPeer registers the member identified by id, reachable at the URLs in us,
// as a peer of this transport. With a three-member cluster it is therefore
// called twice, once per other member.
//
// The call is a no-op when the peer is already registered. It panics when the
// transport has been stopped (peers map is nil) or when us cannot be parsed.
func (t *Transport) AddPeer(id types.ID, us []string) {
    t.mu.Lock()
    defer t.mu.Unlock()

    if t.peers == nil {
        panic("transport stopped")
    }
    if _, exists := t.peers[id]; exists {
        return
    }

    peerURLs, err := types.NewURLs(us)
    if err != nil {
        if t.Logger == nil {
            plog.Panicf("newURLs %+v should never fail: %+v", us, err)
        } else {
            t.Logger.Panic("failed NewURLs", zap.Strings("urls", us), zap.Error(err))
        }
    }

    peerName := id.String()
    followerStats := t.LeaderStats.Follower(peerName)
    // Start the peer and make both probers aware of it.
    t.peers[id] = startPeer(t, peerURLs, id, followerStats)
    addPeerToProber(t.Logger, t.pipelineProber, peerName, us, RoundTripperNameSnapshot, rttSec)
    addPeerToProber(t.Logger, t.streamProber, peerName, us, RoundTripperNameRaftMessage, rttSec)

    if t.Logger == nil {
        plog.Infof("added peer %s", id)
        return
    }
    t.Logger.Info(
        "added remote peer",
        zap.String("local-member-id", t.ID.String()),
        zap.String("remote-peer-id", peerName),
        zap.Strings("remote-peer-urls", us),
    )
}


在startPeer中,取得Transport持有的Raft接口实例(t.Raft),并启动goroutine通过channel与其通信
etcd/etcdserver/api/rafthttp/peer.go

// peer holds everything this node needs to exchange raft messages with one
// remote member: its stream writers/readers, its pipeline, its snapshot
// sender, and the channels that feed received messages to the Raft interface.
type peer struct {
    lg *zap.Logger

    localID types.ID // ID of the local member
    // id of the remote raft peer node
    id types.ID

    r Raft

    status *peerStatus
    // A member may expose several URLs; when one of them fails another
    // should be tried. picker is responsible for switching between them.
    picker *urlPicker

    msgAppV2Writer *streamWriter
    writer         *streamWriter   // writes messages to the stream channel
    pipeline       *pipeline       // pipeline message channel
    snapSender     *snapshotSender // snapshot sender to send v3 snapshot messages
    msgAppV2Reader *streamReader
    msgAppReader   *streamReader // reads messages from the stream channel

    recvc chan raftpb.Message // messages read from the stream channel are passed through here to the Raft interface
    propc chan raftpb.Message // MsgProp messages read from the stream channel are passed through here to the Raft interface

    mu     sync.Mutex
    paused bool // whether sending messages to the remote member is paused

    cancel context.CancelFunc // cancel pending works in go routine created by peer.
    stopc  chan struct{}
}



// startPeer creates and starts the peer that represents the remote member
// peerID, reachable at urls.
//
// It starts the peer's pipeline, two stream writers and two stream readers,
// and launches two goroutines that drain recvc and propc and hand every
// message to t.Raft.Process. Both goroutines exit when p.stopc is closed.
// A failed Process call is logged through t.Logger when it is set and through
// plog otherwise, on both paths.
func startPeer(t *Transport, urls types.URLs, peerID types.ID, fs *stats.FollowerStats) *peer {
    if t.Logger != nil {
        t.Logger.Info("starting remote peer", zap.String("remote-peer-id", peerID.String()))
    } else {
        plog.Infof("starting peer %s...", peerID)
    }
    defer func() {
        if t.Logger != nil {
            t.Logger.Info("started remote peer", zap.String("remote-peer-id", peerID.String()))
        } else {
            plog.Infof("started peer %s", peerID)
        }
    }()

    status := newPeerStatus(t.Logger, t.ID, peerID) // status tracker for this peer
    picker := newURLPicker(urls)                    // picks among the URLs the peer exposes
    errorc := t.ErrorC
    r := t.Raft // the Raft interface that receives messages and reports
    pipeline := &pipeline{
        peerID:        peerID,
        tr:            t,
        picker:        picker,
        status:        status,
        followerStats: fs,
        raft:          r,
        errorc:        errorc,
    }
    pipeline.start() // launches the pipeline's sender goroutines

    p := &peer{
        lg:             t.Logger,
        localID:        t.ID,
        id:             peerID,
        r:              r,
        status:         status,
        picker:         picker,
        msgAppV2Writer: startStreamWriter(t.Logger, t.ID, peerID, status, fs, r), // create and start the stream writers
        writer:         startStreamWriter(t.Logger, t.ID, peerID, status, fs, r),
        pipeline:       pipeline,
        snapSender:     newSnapshotSender(t, picker, peerID, status),
        recvc:          make(chan raftpb.Message, recvBufSize),
        propc:          make(chan raftpb.Message, maxPendingProposals),
        stopc:          make(chan struct{}),
    }

    // warnProcessFailure reports a failed Process call through the structured
    // logger when one is configured and through plog otherwise, so that both
    // processing goroutines log the same way.
    warnProcessFailure := func(err error) {
        if t.Logger != nil {
            t.Logger.Warn("failed to process Raft message", zap.Error(err))
        } else {
            plog.Warningf("failed to process raft message (%v)", err)
        }
    }

    // Messages read from the remote member arrive on recvc; a dedicated
    // goroutine hands each of them to the Raft interface.
    ctx, cancel := context.WithCancel(context.Background())
    p.cancel = cancel
    go func() {
        for {
            select {
            case mm := <-p.recvc:
                if err := r.Process(ctx, mm); err != nil {
                    warnProcessFailure(err)
                }
            case <-p.stopc:
                return
            }
        }
    }()

    // r.Process might block for processing proposal when there is no leader.
    // Thus propc must be put into a separate routine with recvc to avoid blocking
    // processing other raft messages.
    go func() {
        for {
            select {
            case mm := <-p.propc:
                if err := r.Process(ctx, mm); err != nil {
                    warnProcessFailure(err)
                }
            case <-p.stopc:
                return
            }
        }
    }()

    // Create and start the stream readers; they deliver what they read from
    // the stream channels on recvc and propc.
    p.msgAppV2Reader = &streamReader{
        lg:     t.Logger,
        peerID: peerID,
        typ:    streamTypeMsgAppV2,
        tr:     t,
        picker: picker,
        status: status,
        recvc:  p.recvc,
        propc:  p.propc,
        rl:     rate.NewLimiter(t.DialRetryFrequency, 1),
    }
    p.msgAppReader = &streamReader{
        lg:     t.Logger,
        peerID: peerID,
        typ:    streamTypeMessage,
        tr:     t,
        picker: picker,
        status: status,
        recvc:  p.recvc,
        propc:  p.propc,
        rl:     rate.NewLimiter(t.DialRetryFrequency, 1),
    }

    p.msgAppV2Reader.start()
    p.msgAppReader.start()

    return p
}

上面创建了一个pipeline,并调用了它的start方法;pipeline负责把raft消息发送给对端节点,并把发送结果报告给raft状态机

// pipeline sends raft messages to one remote member with individual HTTP
// POST requests and reports the outcome of each send to the Raft interface.
type pipeline struct {
    peerID types.ID // ID of the remote member this pipeline sends to

    tr     *Transport  // the owning rafthttp.Transport
    picker *urlPicker  // selects which of the peer's URLs to use
    status *peerStatus // status tracker of the peer
    raft   Raft
    errorc chan error
    // deprecate when we deprecate v2 API
    followerStats *stats.FollowerStats

    msgc chan raftpb.Message // queue of messages waiting to be sent
    // wait for the handling routines
    wg    sync.WaitGroup // counts the connPerPipeline handler goroutines launched by start
    stopc chan struct{}
}

// start allocates the pipeline's stop and message channels and launches
// connPerPipeline handler goroutines that send queued messages to the peer.
func (p *pipeline) start() {
    p.stopc = make(chan struct{})
    // msgc is buffered with pipelineBufSize slots.
    p.msgc = make(chan raftpb.Message, pipelineBufSize)

    p.wg.Add(connPerPipeline)
    for remaining := connPerPipeline; remaining > 0; remaining-- {
        go p.handle()
    }

    if p.tr == nil || p.tr.Logger == nil {
        plog.Infof("started HTTP pipelining with peer %s", p.peerID)
        return
    }
    p.tr.Logger.Info(
        "started HTTP pipelining with remote peer",
        zap.String("local-member-id", p.tr.ID.String()),
        zap.String("remote-peer-id", p.peerID.String()),
    )
}
 …省略其它代码
//下面是发送消息相关
// handle is the body of one pipeline worker. It takes messages from msgc,
// posts each one to the peer, and reports the outcome: peer status, follower
// statistics for MsgApp, unreachable/snapshot reports to the Raft interface,
// and the sent-bytes / sent-failures metrics. It returns when stopc is closed.
func (p *pipeline) handle() {
    defer p.wg.Done()

    for {
        var m raftpb.Message
        select {
        case <-p.stopc:
            return
        case m = <-p.msgc: // next message waiting to be sent
        }

        // Serialize the message and send it as an HTTP request, timing the send.
        began := time.Now()
        err := p.post(pbutil.MustMarshal(&m))
        finished := time.Now()

        if err == nil {
            p.status.activate() // the peer is reachable
            if m.Type == raftpb.MsgApp && p.followerStats != nil {
                p.followerStats.Succ(finished.Sub(began))
            }
            if isMsgSnap(m) { // tell the Raft interface the snapshot was delivered
                p.raft.ReportSnapshot(m.To, raft.SnapshotFinish)
            }
            sentBytes.WithLabelValues(types.ID(m.To).String()).Add(float64(m.Size()))
            continue
        }

        // The send failed: mark the peer inactive and report the failure.
        p.status.deactivate(failureType{source: pipelineMsg, action: "write"}, err.Error())
        if m.Type == raftpb.MsgApp && p.followerStats != nil {
            p.followerStats.Fail()
        }
        p.raft.ReportUnreachable(m.To)
        if isMsgSnap(m) { // a snapshot additionally gets a failure report
            p.raft.ReportSnapshot(m.To, raft.SnapshotFailure)
        }
        sentFailures.WithLabelValues(types.ID(m.To).String()).Inc()
    }
}

// post POSTs a data payload to a url. Returns nil if the POST succeeds,
// error on any failure.
//
// The URL is chosen by the pipeline's picker and is marked unreachable on
// every failure path. The request is cancelled if the pipeline is stopped
// while it is in flight. errMemberRemoved is additionally reported on errorc.
func (p *pipeline) post(data []byte) (err error) {
    u := p.picker.pick() // URL of the remote member to post to
    req := createPostRequest(u, RaftPrefix, bytes.NewBuffer(data), "application/protobuf", p.tr.URLs, p.tr.ID, p.tr.ClusterID)

    done := make(chan struct{}, 1) // tells the watcher goroutine that RoundTrip has returned
    ctx, cancel := context.WithCancel(context.Background())
    // Always release the context once the exchange is over. Without this the
    // CancelFunc was only ever called when the pipeline was stopped mid-request.
    // Calling it more than once is harmless.
    defer cancel()
    req = req.WithContext(ctx)
    go func() { // cancels the request if the pipeline stops while it is in flight
        select {
        case <-done:
        case <-p.stopc:
            waitSchedule()
            cancel()
        }
    }()

    resp, err := p.tr.pipelineRt.RoundTrip(req)
    done <- struct{}{} // buffered, so this never blocks even if the watcher already exited
    if err != nil {
        p.picker.unreachable(u)
        return err
    }
    defer resp.Body.Close()
    b, err := ioutil.ReadAll(resp.Body)
    if err != nil {
        p.picker.unreachable(u)
        return err
    }

    err = checkPostResponse(resp, b, req, p.peerID)
    if err != nil {
        p.picker.unreachable(u)
        // errMemberRemoved is a critical error since a removed member should
        // always be stopped. So we use reportCriticalError to report it to errorc.
        if err == errMemberRemoved {
            reportCriticalError(err, p.errorc)
        }
        return err
    }

    return nil
}

…省略其它代码

接收消息流程

  1. 实现ServeHTTP方法,
func newPipelineHandler(t *Transport, r Raft, cid types.ID) http.Handler {
	…省略其它代码
}
func (h *snapshotHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
	…省略其它代码
//TODO 调用Process 同步raft状态机
    if err := h.r.Process(context.TODO(), m); err != nil {
		…省略其它代码
	}
…省略其它代码
	
}

Raft接口
etcd/etcdserver/api/rafthttp/transport.go

// Raft is what the transport needs from the raft state machine: a way to
// deliver received messages and to report peer and snapshot status.
type Raft interface {
    Process(ctx context.Context, m raftpb.Message) error  // hands the message to the raft module for processing
    IsIDRemoved(id uint64) bool                           // reports whether the member with the given id has been removed from the cluster
    ReportUnreachable(id uint64)                          // tells the raft module that the member with the given id cannot be reached
    ReportSnapshot(id uint64, status raft.SnapshotStatus) // tells the raft module whether a snapshot was sent successfully
}

2.EtcdServer是核心结构体,实现了transport里面的Raft接口,然后再通过自己实现的Process方法调用底层raft状态机的Step方法(整体的逻辑都在这个结构体,具体实现再通过其它子模块来完成)
etcd/etcdserver/server.go

func (s *EtcdServer) Start() {
    s.start()
     …省略其它代码
}
func (s *EtcdServer) start() {
	…省略其它代码
	go s.run()

}

func (s *EtcdServer) run() {
	…省略其它代码
	//真正去启动raft
	s.r.start(rh)
	…省略其它代码
}

func (s *EtcdServer) Process(ctx context.Context, m raftpb.Message) error {
    …省略其它代码
    return s.r.Step(ctx, m)
}

// IsIDRemoved reports whether the cluster records the member with the given
// id as removed. It delegates to the cluster's own IsIDRemoved.
func (s *EtcdServer) IsIDRemoved(id uint64) bool {
    memberID := types.ID(id)
    return s.cluster.IsIDRemoved(memberID)
}

// ReportUnreachable forwards the report that the member with the given id
// cannot be reached to the server's raft node.
func (s *EtcdServer) ReportUnreachable(id uint64) {
    s.r.ReportUnreachable(id)
}



节点投票过程

  1. raft结构体实现的是状态机的核心逻辑,具体细节部分得再开一篇文章来回顾了

etcd/raft/raft.go

// Step feeds one message into the raft state machine.
//
// It first reconciles the message term with the local term (possibly turning
// this node into a follower, or dropping/answering a stale message), then
// handles MsgHup and MsgVote/MsgPreVote itself and passes every other message
// type to the role-specific r.step function. This is where a node decides
// whether to start an election and whether to grant its vote.
func (r *raft) Step(m pb.Message) error {
    // Handle the message term, which may result in our stepping down to a follower.
    switch {
    case m.Term == 0:
        // local message
    case m.Term > r.Term: // e.g. a message from a node that has started an election
        // A message with a higher term normally makes this node, whatever its
        // current role, become a follower (see the default branch below).
        // Vote and pre-vote requests are filtered first.

        if m.Type == pb.MsgVote || m.Type == pb.MsgPreVote {
            // The Context field tells whether the request was produced by a
            // leadership transfer; if so this node is forced to take part.
            force := bytes.Equal(m.Context, []byte(campaignTransfer))
            // In lease: checkQuorum is on, a leader is known, and the election
            // timer has not yet reached the election timeout.
            inLease := r.checkQuorum && r.lead != None && r.electionElapsed < r.electionTimeout
            if !force && inLease { // ignore the request: this node does not take part in the election
                return nil
            }
        }
        switch { // decide, based on the message type, whether to change state
        case m.Type == pb.MsgPreVote: // a pre-vote request never changes this node's state
            // Never change our term in response to a PreVote
        case m.Type == pb.MsgPreVoteResp && !m.Reject:
            // A granted pre-vote response does not change the term here either.
        default:
            r.logger.Infof("%x [term: %d] received a %s message with higher term from %x [term: %d]",
                r.id, r.Term, m.Type, m.From, m.Term)
            // MsgApp, MsgHeartbeat and MsgSnap identify the sender as leader;
            // for any other type the leader is left unknown.
            if m.Type == pb.MsgApp || m.Type == pb.MsgHeartbeat || m.Type == pb.MsgSnap {
                r.becomeFollower(m.Term, m.From)
            } else {
                r.becomeFollower(m.Term, None)
            }
        }

    case m.Term < r.Term:
        if (r.checkQuorum || r.preVote) && (m.Type == pb.MsgHeartbeat || m.Type == pb.MsgApp) {
            // Answer a stale heartbeat/append with an MsgAppResp.
            // NOTE(review): presumably so that the stale sender learns about the newer term - confirm.
            r.send(pb.Message{To: m.From, Type: pb.MsgAppResp})
        } else if m.Type == pb.MsgPreVote {
            // Explicitly reject a pre-vote request from a lower term.
            r.send(pb.Message{To: m.From, Term: r.Term, Type: pb.MsgPreVoteResp, Reject: true})
        } else {
            // ignore other cases
            r.logger.Infof("%x [term: %d] ignored a %s message with lower term from %x [term: %d]",
                r.id, r.Term, m.Type, m.From, m.Term)
        }
        return nil
    }

    switch m.Type {
    case pb.MsgHup: // request to start an election (sent when a follower wants to campaign)
        if r.state != StateLeader { // only a non-leader acts on MsgHup
            // A node that is not promotable cannot campaign.
            if !r.promotable() {
                r.logger.Warningf("%x is unpromotable and can not campaign; ignoring MsgHup", r.id)
                return nil
            }
            // Fetch the entries that are committed but not yet applied.
            ents, err := r.raftLog.slice(r.raftLog.applied+1, r.raftLog.committed+1, noLimit)
            if err != nil {
                r.logger.Panicf("unexpected error getting unapplied entries (%v)", err)
            }
            // Do not campaign while configuration changes are still waiting to be applied.
            if n := numOfPendingConf(ents); n != 0 && r.raftLog.committed > r.raftLog.applied {
                r.logger.Warningf("%x cannot campaign at term %d since there are still %d pending configuration changes to apply", r.id, r.Term, n)
                return nil
            }
            // Start the election.
            r.logger.Infof("%x is starting a new election at term %d", r.id, r.Term)
            if r.preVote {
                // PreVote is enabled: campaign with a pre-election first.
                r.campaign(campaignPreElection)
            } else {
                r.campaign(campaignElection)
            }
        } else { // already leader: only emit a debug log
            r.logger.Debugf("%x ignoring MsgHup because already leader", r.id)
        }

    case pb.MsgVote, pb.MsgPreVote: // vote and pre-vote requests
        // We can vote if this is a repeat of a vote we've already cast...
        // The vote may be granted when any of the following holds:
        //  1. the recorded Vote equals the sender (a repeated request);
        //  2. no vote has been cast yet and no leader is known;
        //  3. it is a pre-vote request for a term higher than the local one.
        canVote := r.Vote == m.From ||
            (r.Vote == None && r.lead == None) ||
            (m.Type == pb.MsgPreVote && m.Term > r.Term)
        // The candidate's log position (m.Index, m.LogTerm) is also checked against the local log.
        if canVote && r.raftLog.isUpToDate(m.Index, m.LogTerm) {
            // Grant the vote: reply with the matching response type, carrying
            // the term of the request.
            r.send(pb.Message{To: m.From, Term: m.Term, Type: voteRespMsgType(m.Type)})
            if m.Type == pb.MsgVote { // a real vote is recorded and resets the election timer
                // Only record real votes.
                r.electionElapsed = 0
                r.Vote = m.From
            }
        } else {
            // Otherwise reject, replying with the local term.
            r.send(pb.Message{To: m.From, Term: r.Term, Type: voteRespMsgType(m.Type), Reject: true})
        }

    default:
        // Every other message type is handled by the role-specific step function.
        err := r.step(r, m)
        if err != nil {
            return err
        }
    }
    return nil
}

参照
https://raft.github.io/
https://blog.csdn.net/skh2015java/category_9284671.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值