Put 流程涉及到的模块如下所示:
直接从processInternalRaftRequestOnce函数开始看:
// processInternalRaftRequestOnce proposes r to raft a single time and waits
// for the outcome of applying it.
//
// The request is rejected up front with ErrTooManyRequests when the committed
// index is more than maxGapBetweenApplyAndCommitIndex ahead of the applied
// index, and with ErrRequestTooLarge when the marshaled request is larger than
// s.Cfg.MaxRequestBytes. Errors from s.AuthInfoFromCtx and r.Marshal are
// returned unchanged.
//
// On success the *applyResult delivered through the wait channel registered
// for the request ID is returned. If the proposal cannot be submitted, or the
// request context (bounded by s.Cfg.ReqTimeout()) expires first, the wait
// registration is released and an error is returned. ErrStopped is returned
// when s.done fires while waiting.
func (s *EtcdServer) processInternalRaftRequestOnce(ctx context.Context, r pb.InternalRaftRequest) (*applyResult, error) {
	// Back-pressure: refuse new proposals while apply lags too far behind commit.
	ai := s.getAppliedIndex()
	ci := s.getCommittedIndex()
	if ci > ai+maxGapBetweenApplyAndCommitIndex {
		return nil, ErrTooManyRequests
	}

	// Every request gets a freshly generated header ID; it doubles as the wait
	// key below when r.ID is unset.
	r.Header = &pb.RequestHeader{
		ID: s.reqIDGen.Next(),
	}

	authInfo, err := s.AuthInfoFromCtx(ctx)
	if err != nil {
		return nil, err
	}
	if authInfo != nil {
		r.Header.Username = authInfo.Username
		r.Header.AuthRevision = authInfo.Revision
	}

	data, err := r.Marshal()
	if err != nil {
		return nil, err
	}
	if len(data) > int(s.Cfg.MaxRequestBytes) {
		return nil, ErrRequestTooLarge
	}

	id := r.ID
	if id == 0 {
		id = r.Header.ID
	}
	// Register before proposing so the result cannot be triggered before the
	// channel exists.
	ch := s.w.Register(id)

	cctx, cancel := context.WithTimeout(ctx, s.Cfg.ReqTimeout())
	defer cancel()

	start := time.Now()
	// Do not ignore a failed proposal: without this check the caller would sit
	// in the select below until the timeout even though the failure is
	// already known.
	if err = s.r.Propose(cctx, data); err != nil {
		proposalsFailed.Inc()
		s.w.Trigger(id, nil) // GC wait
		// Keep the error mapping callers already see for context expiry.
		if ctxErr := cctx.Err(); ctxErr != nil {
			return nil, s.parseProposeCtxErr(ctxErr, start)
		}
		return nil, err
	}
	proposalsPending.Inc()
	defer proposalsPending.Dec()

	select {
	case x := <-ch:
		return x.(*applyResult), nil
	case <-cctx.Done():
		proposalsFailed.Inc()
		s.w.Trigger(id, nil) // GC wait
		return nil, s.parseProposeCtxErr(cctx.Err(), start)
	case <-s.done:
		return nil, ErrStopped
	}
}
processInternalRaftRequestOnce()函数中需要注意s.reqIDGen.Next()生成的请求ID:函数用该ID调用s.w.Register(id)注册等待通道,当该请求被应用(apply)完成时,会根据该ID唤醒等待的调用方继续后续处理。然后进入(n *node) Propose()函数,该函数将用户请求包装成pb.Message结构,消息类型是pb.MsgProp。接着进入(n *node) step()函数,将消息发送到propc这个chan进行处理。
(n *node) run()函数接收propc里面发来的MsgProp消息,然后送入raft状态机(r *raft) Step()处理。以三副本集群为例,raft StateMachine 处理完这个 MsgProp 消息会产生 1 个 Op log entry 和 2 个发送给另外两个副本的 MsgApp(Append entries)消息,node 模块的(n *node) run()会将这两类输出打包成 Ready,通过readyc <- rd 提交给etcdserver处理。
(r *raftNode) start()函数使用case rd := <-r.Ready()接收readyc 传递过来的Ready实例:
// start launches the background goroutine that drives this raftNode. The
// goroutine loops until r.stopped fires, calling r.tick() on every ticker tick
// and processing each Ready received from r.Ready(). r.onStop() runs when the
// goroutine exits. rh carries the callbacks used while processing a Ready
// (rh.updateLeadership is called here, and rh is also passed to
// updateCommittedIndex).
func (r *raftNode) start(rh *raftReadyHandler) {
// Upper bound on how long the loop blocks while handing a read state to
// readStateC.
internalTimeout := time.Second
go func() {
defer r.onStop()
// islead remembers the leadership status from the most recent SoftState;
// it is kept across iterations because a Ready may carry a nil SoftState.
islead := false
for {
select {
case <-r.ticker.C:
r.tick()
case rd := <-r.Ready():
// SoftState is only inspected when present: update the leader metrics,
// store the leader ID atomically and notify the handler.
if rd.SoftState != nil {
// A leader change is counted only when a leader is known and it differs
// from the one currently recorded in r.lead.
newLeader := rd.SoftState.Lead != raft.None && atomic.LoadUint64(&r.lead) != rd.SoftState.Lead
if newLeader {
leaderChanges.Inc()
}
if rd.SoftState.Lead == raft.None {
hasLeader.Set(0)
} else {
hasLeader.Set(1)
}
atomic.StoreUint64(&r.lead, rd.SoftState.Lead)
islead = rd.RaftState == raft.StateLeader
if islead {
isLeader.Set(1)
} else {
isLeader.Set(0)
}
rh.updateLeadership(newLeader)
// NOTE(review): r.td is not defined in this excerpt — presumably a timing
// detector that is restarted on every leadership update; confirm.
r.td.Reset()
}
// Only the last read state is forwarded. The send gives up after
// internalTimeout (logging a warning) and aborts the loop if the node is
// stopped.
if len(rd.ReadStates) != 0 {
select {
case r.readStateC <- rd.ReadStates[len(rd.ReadStates)-1]:
case <-time.After(internalTimeout):
plog.Warningf("timed out sending read state")
case <-r.stopped:
return
}
}
// notifyc (capacity 1) is attached to the apply request and signaled further
// down once this Ready's persistence steps have been carried out.
notifyc := make(chan struct{}, 1)
ap := apply{
entries: rd.CommittedEntries,
snapshot: rd.Snapshot,
notifyc: notifyc,
}
updateCommittedIndex(&ap, rh)
// Hand the committed entries and snapshot to the consumer of applyc; this
// happens before the new entries are saved below.
select {
case r.applyc <- ap:
case <-r.stopped:
return
}
// the leader can write to its disk in parallel with replicating to the followers and them
// writing to their disks.
// For more details, check raft thesis 10.2.1
if islead {
// gofail: var raftBeforeLeaderSend struct{}
r.transport.Send(r.processMessages(rd.Messages))
}
// Persist the hard state and the new entries; a failure here is fatal.
// gofail: var raftBeforeSave struct{}
if err := r.storage.Save(rd.HardState, rd.Entries); err != nil {
plog.Fatalf("raft save state and entries error: %v", err)
}
// Export the commit index as a metric whenever the hard state is non-empty.
if !raft.IsEmptyHardState(rd.HardState) {
proposalsCommitted.Set(float64(rd.HardState.Commit))
}
// gofail: var raftAfterSave struct{}
if !raft.IsEmptySnap(rd.Snapshot) {
// gofail: var raftBeforeSaveSnap struct{}
if err := r.storage.SaveSnap(rd.Snapshot); err != nil {
plog.Fatalf("raft save snapshot error: %v", err)
}
// etcdserver now claim the snapshot has been persisted onto the disk
notifyc <- struct{}{}
// gofail: var raftAfterSaveSnap struct{}
r.raftStorage.ApplySnapshot(rd.Snapshot)
plog.Infof("raft applied incoming snapshot at index %d", rd.Snapshot.Metadata.Index)
// gofail: var raftAfterApplySnap struct{}
}
// Append the entries to raftStorage after they have been saved above.
r.raftStorage.Append(rd.Entries)
if !islead {
// finish processing incoming messages before we signal raftdone chan
msgs := r.processMessages(rd.Messages)
// now unblocks 'applyAll' that waits on Raft log disk writes before triggering snapshots
notifyc <- struct{}{}
// Candidate or follower needs to wait for all pending configuration
// changes to be applied before sending messages.
// Otherwise we might incorrectly count votes (e.g. votes from removed members).
// Also slow machine's follower raft-layer could proceed to become the leader
// on its own single-node cluster, before apply-layer applies the config change.
// We simply wait for ALL pending entries to be applied for now.
// We might improve this later on if it causes unnecessary long blocking issues.
waitApply := false
for _, ent := range rd.CommittedEntries {
if ent.Type == raftpb.EntryConfChange {
waitApply = true
break
}
}
if waitApply {
// blocks until 'applyAll' calls 'applyWait.Trigger'
// to be in sync with scheduled config-change job
// (assume notifyc has cap of 1)
select {
case notifyc <- struct{}{}:
case <-r.stopped:
return
}
}
// gofail: var raftBeforeFollowerSend struct{}
r.transport.Send(msgs)
} else {
// leader already processed 'MsgSnap' and signaled
notifyc <- struct{}{}
}
// This Ready has been fully handled: call Advance to tell the raft module so.
// Per the original author's note, raft then updates its own bookkeeping (for
// example the highest applied entry index) and can hand out the next Ready.
r.Advance()
case <-r.stopped:
return
}
}
}()
}
raftNode对Ready实例中各个字段的处理:
SoftState:1、更新raftNode.lead字段。2、根据leader节点的变化情况调用updateLeadership()回调函数
ReadStates:将最后一项写入readStateC通道
CommittedEntries:封装成apply实例,送入applyc通道
Snapshot:1、封装成apply实例,送入applyc通道。2、将快照数据保存到本地盘。3、保存到MemoryStorage中
Messages:1、目标节点不存在的,剔除。2、如果有多条MsgAppResp消息,只保留最后一条。3、如果有MsgSnap消息,送入raftNode.msgSnapC中。
Entries:通过r.storage.Save()持久化后,再通过r.raftStorage.Append()保存到MemoryStorage中
· raftNode 模块的 coroutine 通过 readyc 读取到 Ready,首先通过网络层将 2 个 append entries 的 messages 发送给两个副本(PS:这里是异步发送的);
· raftNode 模块的 coroutine 自己将 Op log entry 通过持久化层的 WAL 接口同步的写入 WAL 文件中 :err := r.storage.Save(rd.HardState, rd.Entries);
· 让raft模块把entry从unstable移动到storage中保存:
r.raftStorage.Append(rd.Entries)
· raftNode 模块的 coroutine 通过 advancec Channel 通知当前 Ready 已经处理完,请给我准备下一个待处理的 raft StateMachine 输出Ready;
· 其他副本的返回 Append entries 的 response: MsgAppResp message,会通过 node 模块的接口经过 recvc Channel(case m := <-n.recvc:) 提交给 node 模块的 coroutine;
· node 模块 coroutine 从 recvc Channel 读取到 MsgAppResp,然后提交给 raft StateMachine 处理。node 模块 coroutine 会驱动 raft StateMachine 得到 committedEntries,也就是一旦大多数副本返回了就可以 commit 了;node 模块 new 一个新的 Ready,其中包含了 committedEntries,通过 readyc Channel 传递给 raftNode 模块 coroutine 处理;
· raftNode 模块 coroutine 从 readyc Channel 中读取 Ready结构
· 取出已经 commit 的 committedEntries 通过 applyc 传递给另外一个 etcd server coroutine 处理
case ap := <-s.r.apply():
f := func(context.Context) { s.applyAll(&ep, &ap) }
sched.Schedule(f)
其会将每个apply 任务提交给 FIFOScheduler 调度异步处理,这个调度器可以保证 apply 任务按照顺序被执行,因为 apply 的执行是不能乱的;
· raftNode 模块的 coroutine 通过 advancec Channel 通知当前 Ready 已经处理完,请给我准备下一个待处理的 raft StateMachine 输出Ready;
· FIFOScheduler 调度执行 apply 已经提交的 committedEntries
· AppliedIndex 推进,通知 ReadLoop coroutine,满足 applied index>= commit index 的 read request 可以返回;
· server调用网络层接口返回 client 成功。
总之最终进入:applyEntryNormal()函数:
// applyEntryNormal applies one raft log entry to the server and wakes up any
// caller registered in s.w for the request the entry carries.
//
// The entry's index is always recorded as applied when the function returns.
// Empty entries are treated as leader-confirmation no-ops. Entries that decode
// as v2 requests are applied via applyV2Request. v3 requests are applied via
// s.applyV3.Apply only when the entry index is beyond the consistent index.
// The first ErrNoSpace result additionally raises a NOSPACE alarm through raft
// before the waiting caller is woken.
func (s *EtcdServer) applyEntryNormal(e *raftpb.Entry) {
	// Only entries past the recorded consistent index go through the v3 apply
	// path; for those, advance the consistent index to the executing entry.
	shouldApplyV3 := e.Index > s.consistIndex.ConsistentIndex()
	if shouldApplyV3 {
		s.consistIndex.setConsistentIndex(e.Index)
	}
	// Whichever path is taken below, mark the entry as applied on return.
	defer s.setAppliedIndex(e.Index)

	// raft state machine may generate noop entry when leader confirmation.
	// skip it in advance to avoid some potential bug in the future
	if len(e.Data) == 0 {
		// Non-blocking signal on forceVersionC.
		select {
		case s.forceVersionC <- struct{}{}:
		default:
		}
		// promote lessor when the local member is leader and finished
		// applying all entries from the last term.
		if s.isLeader() {
			s.lessor.Promote(s.Cfg.electionTimeout())
		}
		return
	}

	// Try to decode the payload as an InternalRaftRequest, the envelope that
	// wraps client requests.
	var raftReq pb.InternalRaftRequest
	if !pbutil.MaybeUnmarshal(&raftReq, e.Data) {
		// Compatibility path: the payload is not an InternalRaftRequest, so
		// decode it as a bare pb.Request and apply it as a v2 request.
		legacy := new(pb.Request)
		pbutil.MustUnmarshal(legacy, e.Data)
		s.w.Trigger(legacy.ID, s.applyV2Request((*RequestV2)(legacy)))
		return
	}
	if raftReq.V2 != nil {
		v2Req := (*RequestV2)(raftReq.V2)
		s.w.Trigger(v2Req.ID, s.applyV2Request(v2Req))
		return
	}

	// do not re-apply applied entries.
	if !shouldApplyV3 {
		return
	}

	// v3 request handling starts here. The wait key is the request ID, falling
	// back to the header ID when the former is zero.
	reqID := raftReq.ID
	if reqID == 0 {
		reqID = raftReq.Header.ID
	}

	// Skip the apply entirely when nobody is waiting for the result and the
	// request has no side effect.
	waiting := s.w.IsRegistered(reqID)
	if !waiting && noSideEffect(&raftReq) {
		return
	}
	if !waiting && raftReq.Txn != nil {
		removeNeedlessRangeReqs(raftReq.Txn)
	}

	// Apply dispatches on the request type.
	result := s.applyV3.Apply(&raftReq)
	if result == nil {
		return
	}

	// Special handling is needed only the first time ErrNoSpace shows up, i.e.
	// when no NOSPACE alarm is stored yet. In every other case hand the result
	// straight to the waiter.
	firstNoSpace := result.err == ErrNoSpace && len(s.alarmStore.Get(pb.AlarmType_NOSPACE)) == 0
	if !firstNoSpace {
		s.w.Trigger(reqID, result)
		return
	}

	plog.Errorf("applying raft message exceeded backend quota")
	// Raise the NOSPACE alarm from an attached goroutine by sending an
	// AlarmRequest through raft, then wake the waiter with the original result.
	s.goAttach(func() {
		s.raftRequest(s.ctx, pb.InternalRaftRequest{
			Alarm: &pb.AlarmRequest{
				MemberID: uint64(s.ID()),
				Action:   pb.AlarmRequest_ACTIVATE,
				Alarm:    pb.AlarmType_NOSPACE,
			},
		})
		s.w.Trigger(reqID, result)
	})
}
applyEntryNormal 函数会唤醒等待的client。