ETCD源码分析---put流程

最新推荐文章于 2023-07-10 07:45:00 发布

souy_c

最新推荐文章于 2023-07-10 07:45:00 发布

阅读量952

点赞数

分类专栏： ETCD 文章标签： etcd

本文链接：https://blog.csdn.net/cyq6239075/article/details/108985051

版权

ETCD 专栏收录该内容

20 篇文章 7 订阅

订阅专栏

Put 流程涉及到的模块如下所示：

直接从processInternalRaftRequestOnce函数开始看：

// processInternalRaftRequestOnce submits r as a single raft proposal and
// blocks until its outcome is known.
//
// The steps are, in order:
//  1. Reject the request with ErrTooManyRequests when the committed index is
//     more than maxGapBetweenApplyAndCommitIndex ahead of the applied index.
//  2. Give the request a header carrying a freshly generated ID and, when ctx
//     yields auth info, the user name and auth revision.
//  3. Marshal the request and reject it with ErrRequestTooLarge when the
//     encoded size exceeds s.Cfg.MaxRequestBytes.
//  4. Register the request ID with s.w and propose the encoded bytes under a
//     context limited by s.Cfg.ReqTimeout().
//  5. Wait for whichever comes first: a result on the registered channel, the
//     proposal context finishing, or s.done.
//
// It returns the *applyResult received on the registered channel, or a non-nil
// error for every other outcome (ErrStopped when s.done fires).
func (s *EtcdServer) processInternalRaftRequestOnce(ctx context.Context, r pb.InternalRaftRequest) (*applyResult, error) {
	ai := s.getAppliedIndex()
	ci := s.getCommittedIndex()
	if ci > ai+maxGapBetweenApplyAndCommitIndex {
		return nil, ErrTooManyRequests
	}

	r.Header = &pb.RequestHeader{
		ID: s.reqIDGen.Next(),
	}

	authInfo, err := s.AuthInfoFromCtx(ctx)
	if err != nil {
		return nil, err
	}
	if authInfo != nil {
		r.Header.Username = authInfo.Username
		r.Header.AuthRevision = authInfo.Revision
	}

	data, err := r.Marshal()
	if err != nil {
		return nil, err
	}

	if len(data) > int(s.Cfg.MaxRequestBytes) {
		return nil, ErrRequestTooLarge
	}

	// Prefer an ID already set on the request; otherwise fall back to the ID
	// generated for the header above.
	id := r.ID
	if id == 0 {
		id = r.Header.ID
	}
	ch := s.w.Register(id)

	cctx, cancel := context.WithTimeout(ctx, s.Cfg.ReqTimeout())
	defer cancel()

	start := time.Now()
	if err := s.r.Propose(cctx, data); err != nil {
		// The proposal was not accepted, so no result will be delivered for
		// id. Fail right away instead of waiting for the timeout below, and
		// drop the registration so it does not linger.
		proposalsFailed.Inc()
		s.w.Trigger(id, nil) // GC wait
		if cerr := cctx.Err(); cerr != nil {
			// Report an expired or cancelled context exactly as the
			// timeout branch of the select below does.
			return nil, s.parseProposeCtxErr(cerr, start)
		}
		return nil, err
	}
	proposalsPending.Inc()
	defer proposalsPending.Dec()

	select {
	case x := <-ch:
		return x.(*applyResult), nil
	case <-cctx.Done():
		proposalsFailed.Inc()
		s.w.Trigger(id, nil) // GC wait
		return nil, s.parseProposeCtxErr(cctx.Err(), start)
	case <-s.done:
		return nil, ErrStopped
	}
}

processInternalRaftRequestOnce() 函数中需要注意 s.reqIDGen.Next() 生成的请求 id：函数用它调用 s.w.Register(id) 注册等待通道，当该请求处理完成时，会根据这个 id 唤醒等待者继续后续处理。然后进入 (n *node) Propose() 函数，该函数将用户请求包装成 pb.Message 结构，消息类型是 pb.MsgProp。接着进入 (n *node) step() 函数，把消息发送到 propc 这个 chan 进行处理。

(n *node) run() 函数接收 propc 里面发来的 MsgProp 消息，然后送入 raft 状态机的 (r *raft) Step() 处理。以三节点集群为例，raft StateMachine 处理完这个 MsgProp 消息后，会产生 1 个 Op log entry 和 2 个发送给另外两个副本的 Append entries 消息（MsgApp）；node 模块的 (n *node) run 会将这两类输出打包成 Ready，通过 readyc <- rd 提交给 etcdserver 处理。

(r *raftNode) start() 函数使用 rd := <-r.Ready() 接收 readyc 传递过来的事件请求：
// start launches the goroutine that drives this raftNode and returns
// immediately.
//
// The goroutine loops until r.stopped fires (r.onStop runs on exit). On each
// iteration it either forwards a ticker tick to r.tick() or handles one Ready
// value received from r.Ready(). rh supplies the callbacks used while handling
// a Ready: updateLeadership is called here, and rh is also passed to
// updateCommittedIndex.
func (r *raftNode) start(rh *raftReadyHandler) {
    // Upper bound on how long a read state send may block (see below).
    internalTimeout := time.Second

    go func() {
        defer r.onStop()
        // Remembered across iterations; refreshed whenever a Ready carries a
        // SoftState.
        islead := false

        for {
            select {
            case <-r.ticker.C:
                r.tick()
            case rd := <-r.Ready():
                // When the Ready carries a SoftState, refresh the leader
                // metrics and the stored leader ID, then notify rh.
                if rd.SoftState != nil {
                    // True when the Ready names a leader (not raft.None) that
                    // differs from the leader currently stored in r.lead.
                    newLeader := rd.SoftState.Lead != raft.None && atomic.LoadUint64(&r.lead) != rd.SoftState.Lead
                    if newLeader {
                        leaderChanges.Inc()
                    }

                    if rd.SoftState.Lead == raft.None {
                        hasLeader.Set(0)
                    } else {
                        hasLeader.Set(1)
                    }

                    atomic.StoreUint64(&r.lead, rd.SoftState.Lead)
                    islead = rd.RaftState == raft.StateLeader
                    if islead {
                        isLeader.Set(1)
                    } else {
                        isLeader.Set(0)
                    }
                    rh.updateLeadership(newLeader)
                    r.td.Reset()
                }

                // Forward only the last ReadState of the batch. The send gives
                // up with a warning after internalTimeout, and the goroutine
                // exits if the node is stopped meanwhile.
                if len(rd.ReadStates) != 0 {
                    select {
                    case r.readStateC <- rd.ReadStates[len(rd.ReadStates)-1]:
                    case <-time.After(internalTimeout):
                        plog.Warningf("timed out sending read state")
                    case <-r.stopped:
                        return
                    }
                }

                // Bundle the committed entries and the snapshot into an apply
                // value. notifyc has capacity 1, so one signal can be queued
                // on it without a receiver being ready.
                notifyc := make(chan struct{}, 1)
                ap := apply{
                    entries:  rd.CommittedEntries,
                    snapshot: rd.Snapshot,
                    notifyc:  notifyc,
                }

                updateCommittedIndex(&ap, rh)

                // Hand the apply value to whoever reads r.applyc; blocks until
                // it is taken or the node is stopped.
                select {
                case r.applyc <- ap:
                case <-r.stopped:
                    return
                }

                // the leader can write to its disk in parallel with replicating to the followers and them
                // writing to their disks.
                // For more details, check raft thesis 10.2.1
                if islead {
                    // gofail: var raftBeforeLeaderSend struct{}
                    r.transport.Send(r.processMessages(rd.Messages))
                }

                // Save the hard state and the new entries through r.storage;
                // a failure here is fatal.
                // gofail: var raftBeforeSave struct{}
                if err := r.storage.Save(rd.HardState, rd.Entries); err != nil {
                    plog.Fatalf("raft save state and entries error: %v", err)
                }
                if !raft.IsEmptyHardState(rd.HardState) {
                    proposalsCommitted.Set(float64(rd.HardState.Commit))
                }
                // gofail: var raftAfterSave struct{}

                // A non-empty snapshot is saved through r.storage first, then
                // signalled on notifyc, then applied to r.raftStorage.
                if !raft.IsEmptySnap(rd.Snapshot) {
                    // gofail: var raftBeforeSaveSnap struct{}
                    if err := r.storage.SaveSnap(rd.Snapshot); err != nil {
                        plog.Fatalf("raft save snapshot error: %v", err)
                    }
                    // etcdserver now claim the snapshot has been persisted onto the disk
                    notifyc <- struct{}{}

                    // gofail: var raftAfterSaveSnap struct{}
                    r.raftStorage.ApplySnapshot(rd.Snapshot)
                    plog.Infof("raft applied incoming snapshot at index %d", rd.Snapshot.Metadata.Index)
                    // gofail: var raftAfterApplySnap struct{}
                }

                // Append the new entries to r.raftStorage after they have been
                // saved above.
                r.raftStorage.Append(rd.Entries)

                if !islead {
                    // finish processing incoming messages before we signal raftdone chan
                    msgs := r.processMessages(rd.Messages)

                    // now unblocks 'applyAll' that waits on Raft log disk writes before triggering snapshots
                    notifyc <- struct{}{}

                    // Candidate or follower needs to wait for all pending configuration
                    // changes to be applied before sending messages.
                    // Otherwise we might incorrectly count votes (e.g. votes from removed members).
                    // Also slow machine's follower raft-layer could proceed to become the leader
                    // on its own single-node cluster, before apply-layer applies the config change.
                    // We simply wait for ALL pending entries to be applied for now.
                    // We might improve this later on if it causes unnecessary long blocking issues.
                    waitApply := false
                    for _, ent := range rd.CommittedEntries {
                        if ent.Type == raftpb.EntryConfChange {
                            waitApply = true
                            break
                        }
                    }
                    if waitApply {
                        // blocks until 'applyAll' calls 'applyWait.Trigger'
                        // to be in sync with scheduled config-change job
                        // (assume notifyc has cap of 1)
                        select {
                        case notifyc <- struct{}{}:
                        case <-r.stopped:
                            return
                        }
                    }

                    // gofail: var raftBeforeFollowerSend struct{}
                    r.transport.Send(msgs)
                } else {
                    // leader already processed 'MsgSnap' and signaled
                    notifyc <- struct{}{}
                }

                // This Ready has been fully handled. Calling Advance() tells
                // the raft module that processing of this Ready is complete,
                // so that it can update its own bookkeeping (for example the
                // highest applied entry index) and go on to return the next
                // Ready.
                r.Advance()

            case <-r.stopped:
                return
            }
        }
    }()
}

raftNode对Ready实例中各个字段的处理：
softstate：1、更新raftNode.lead字段。2、根据leader节点的变化情况调用updateLeadership（）回调函数
readStates：readStateC通道
CommittedEntries：封装成apply实例，送入applyc通道
Snapshot：1、封装成apply实例，送入applyc通道。2、将快照数据保存到本地盘。3、保存到MemoryStorage中
Messages：1、目标节点不存在的，剔除。2、如果有多条 MsgAppResp 消息，只保留最后一条。3、如果有 MsgSnap 消息，送入 raftNode.msgSnapC 中。
Entries：保存到MemoryStorage中

· raftNode 模块的 coroutine 通过 readyc 读取到 Ready，首先通过网络层将 2 个 append entries 的 messages 发送给两个副本(PS:这里是异步发送的)；

· raftNode 模块的 coroutine 自己将 Op log entry 通过持久化层的 WAL 接口同步的写入 WAL 文件中 :err := r.storage.Save(rd.HardState, rd.Entries);

· 让raft模块把entry从unstable移动到storage中保存:

r.raftStorage.Append(rd.Entries)

· raftNode 模块的 coroutine 通过 advancec Channel 通知当前 Ready 已经处理完，请给我准备下一个待处理的 raft StateMachine 输出 Ready；

· 其他副本的返回 Append entries 的 response： MsgAppResp message，会通过 node 模块的接口经过 recvc Channel（case m := <-n.recvc:）提交给 node 模块的 coroutine；

· node 模块 coroutine 从 recvc Channel 读取到 MsgAppResp，然后提交给 raft StateMachine 处理。node 模块 coroutine 会驱动 raft StateMachine 得到这条日志对应的 committedEntries，也就是说一旦大多数副本返回了就可以 commit 了；node 模块会新建一个包含 committedEntries 的 Ready，通过 readyc Channel 传递给 raftNode 模块 coroutine 处理；

· raftNode 模块 coroutine 从 readyc Channel 中读取 Ready结构

· 取出已经 commit 的 committedEntries 通过 applyc 传递给另外一个 etcd server coroutine 处理

case ap := <-s.r.apply():

f := func(context.Context) { s.applyAll(&ep, &ap) }

sched.Schedule(f)

其会将每个apply 任务提交给 FIFOScheduler 调度异步处理，这个调度器可以保证 apply 任务按照顺序被执行，因为 apply 的执行是不能乱的；

· raftNode 模块的 coroutine 通过 advancec Channel 通知当前 Ready 已经处理完，请给我准备下一个待处理的 raft StateMachine 输出Ready；

· FIFOScheduler 调度执行 apply 已经提交的 committedEntries

· AppliedIndex 推进，通知 ReadLoop coroutine，满足 applied index>= commit index 的 read request 可以返回；

· server调用网络层接口返回 client 成功。

总之最终进入：applyEntryNormal（）函数：

// applyEntryNormal applies the single raft entry e to the server.
//
// The applied index is always advanced to e.Index when the function returns.
// Depending on the entry it then takes one of these paths:
//   - empty entry: pokes forceVersionC without blocking and, when this member
//     is the leader, promotes the lessor;
//   - payload that does not decode as pb.InternalRaftRequest: decoded as
//     pb.Request and run through applyV2Request;
//   - InternalRaftRequest carrying a V2 request: run through applyV2Request;
//   - anything else (v3): applied through s.applyV3.Apply, but only when the
//     entry index is past the recorded consistent index.
//
// Results are handed to the waiter registered under the request ID through
// s.w.Trigger.
func (s *EtcdServer) applyEntryNormal(e *raftpb.Entry) {
	// Only entries past the recorded consistent index are eligible for the
	// v3 apply path below; record the index of the entry now being executed.
	shouldApplyV3 := e.Index > s.consistIndex.ConsistentIndex()
	if shouldApplyV3 {
		s.consistIndex.setConsistentIndex(e.Index)
	}
	// Update the applied index when this method finishes, whichever path
	// returns.
	defer s.setAppliedIndex(e.Index)

	// raft state machine may generate noop entry when leader confirmation.
	// skip it in advance to avoid some potential bug in the future
	if len(e.Data) == 0 {
		// Non-blocking signal: dropped when nobody is ready to receive.
		select {
		case s.forceVersionC <- struct{}{}:
		default:
		}
		// promote lessor when the local member is leader and finished
		// applying all entries from the last term.
		if s.isLeader() {
			s.lessor.Promote(s.Cfg.electionTimeout())
		}
		return
	}

	// Try the current wire format first; fall back to the older pb.Request
	// encoding for compatibility when that fails.
	var raftReq pb.InternalRaftRequest
	if ok := pbutil.MaybeUnmarshal(&raftReq, e.Data); !ok {
		legacy := new(pb.Request)
		pbutil.MustUnmarshal(legacy, e.Data)
		s.w.Trigger(legacy.ID, s.applyV2Request((*RequestV2)(legacy)))
		return
	}
	if v2 := raftReq.V2; v2 != nil {
		req := (*RequestV2)(v2)
		s.w.Trigger(req.ID, s.applyV2Request(req))
		return
	}

	// do not re-apply applied entries.
	if !shouldApplyV3 {
		return
	}

	// Everything below handles v3 requests.
	id := raftReq.ID
	if id == 0 {
		id = raftReq.Header.ID
	}

	// Nothing to do when nobody is waiting for the result and the request
	// has no side effects.
	waiting := s.w.IsRegistered(id)
	if !waiting && noSideEffect(&raftReq) {
		return
	}
	if !waiting && raftReq.Txn != nil {
		removeNeedlessRangeReqs(raftReq.Txn)
	}
	// Apply dispatches on the kind of request carried by raftReq.
	ar := s.applyV3.Apply(&raftReq)
	if ar == nil {
		return
	}

	// The alarm path below is only for an ErrNoSpace result while no NOSPACE
	// alarm is recorded yet; every other result goes straight to the waiter.
	firstNoSpace := ar.err == ErrNoSpace && len(s.alarmStore.Get(pb.AlarmType_NOSPACE)) == 0
	if !firstNoSpace {
		s.w.Trigger(id, ar)
		return
	}

	plog.Errorf("applying raft message exceeded backend quota")
	// Raise the NOSPACE alarm in the background through a new raft request,
	// then hand the original result to the waiter.
	s.goAttach(func() {
		alarm := &pb.AlarmRequest{
			MemberID: uint64(s.ID()),
			Action:   pb.AlarmRequest_ACTIVATE,
			Alarm:    pb.AlarmType_NOSPACE,
		}
		s.raftRequest(s.ctx, pb.InternalRaftRequest{Alarm: alarm})
		s.w.Trigger(id, ar)
	})
}

applyEntryNormal 函数会唤醒等待的client。

souy_c

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
ETCD源码分析---put流程

Put 流程涉及到的模块如下所示：直接从processInternalRaftRequestOnce函数开始看：func (s *EtcdServer) processInternalRaftRequestOnce(ctx context.Context, r pb.InternalRaftRequest) (*applyResult, error) { ai := s.getAppliedIndex() ci := s.getCommittedIndex() if c
复制链接

扫一扫