参考
感谢 jingxindeyi 提供的思考方向。
title: mit 6.824 RAFT 实验过程记录
author: jingxindeyi
url: https://blog.csdn.net/zslngu/article/details/118619372
问题描述
日志内容解释
start xx
表示 Start()方法接收到的command
[DEBUG] Start(): 10
apply xx
表示通过applyCh通知service
apply index:1 cmd:10
cfg.logs
的每个元素对应一个节点(共5个节点,下标为0~4);
map[1:10 2:20]
表示节点通过applyCh发送的commandIndex和command.
如commandIndex为1的command为10。
如commandIndex为2的command为20。
cfg.logs:[map[1:10 2:20] map[1:10 2:20] map[1:10 2:20 4:1000] map[1:10 2:20] map[1:10 2:20]]
详细日志
Test (2B): no agreement if too many followers disconnect ...
23:03:33.601880 term[1] who[2] role[2] voteFor[2] next:[1 1 1 1 1] match:[0 0 0 0 0] log:[0]|| become leader
23:03:33.641373 term[1] who[2] role[2] voteFor[2] next:[1 1 2 1 1] match:[0 0 0 0 0] log:[1]|| [DEBUG] Start(): 10
23:03:33.658977 term[1] who[2] role[2] voteFor[2] next:[2 2 2 2 2] match:[0 0 1 0 0] log:[1]|| apply index:1 cmd:10
23:03:34.195905 term[1] who[2] role[2] voteFor[2] next:[2 2 3 2 2] match:[1 1 1 1 1] log:[2]|| [DEBUG] Start(): 20
//... 模拟网络波动,导致 2 联系不上 0 3 4
23:03:36.236427 term[1] who[2] role[2] voteFor[2] next:[3 3 3 3 3] match:[1 1 2 1 1] log:[2]|| apply index:2 cmd:20
23:03:36.726001 term[1] who[2] role[2] voteFor[2] next:[3 3 4 3 3] match:[2 2 2 2 2] log:[3]|| [DEBUG] Start(): 30
23:03:36.726059 term[1] who[2] role[2] voteFor[2] next:[3 3 5 3 3] match:[2 2 2 2 2] log:[4]|| [DEBUG] Start(): 1000
23:03:36.737508 term[1] who[2] role[2] voteFor[2] next:[5 5 5 5 5] match:[2 2 4 2 2] log:[4]|| apply index:4 cmd:1000
23:03:36.737594 cfg.logs:[map[1:10 2:20] map[1:10 2:20] map[1:10 2:20 4:1000] map[1:10 2:20] map[1:10 2:20]]
23:03:36.737613 apply error: server 2 apply out of order 4
exit status 1
FAIL 6.824/raft 3.566s
定位问题
根据日志map[1:10 2:20 4:1000]
可以看出问题出在:
节点2 通过applyCh发送的commandIndex为1,2,4。这是不正确的,3必须在4之前发送。
正确的顺序应该是1,2,3,4。
问题代码
// judgeAndCommit is the original, faulty commit routine kept here to
// illustrate the bug.
//
// It sorts the slice returned by rf.CloneNext(), takes the element at
// len(rf.peers)/2, and treats the index just below it (majorityCur) as the
// newest majority-replicated entry. If that index is above rf.GetMatch(rf.me)
// and the entry's term equals the current term, it records the index with
// rf.SetMatch and sends exactly one ApplyMsg on rf.applyCh.
//
// Known defect: only majorityCur itself is sent. In the captured log
// majorityCur was 4 while rf.GetMatch(rf.me) was 2, so index 4 was delivered
// and index 3 never was.
func judgeAndCommit(rf *Raft) {
	sortedNext := rf.CloneNext()
	sort.Ints(sortedNext)
	majorityCur := sortedNext[len(rf.peers)/2] - 1

	entry, err := rf.GetLog(majorityCur)
	if err != nil {
		return
	}
	// Nothing new to deliver.
	if majorityCur <= rf.GetMatch(rf.me) {
		return
	}
	// Only an entry written in the current term is delivered here.
	if entry.Term != rf.St.GetCurrentTerm() {
		return
	}

	rf.SetMatch(rf.me, majorityCur)
	DPrintf("%v apply index:%v cmd:%v", LogBaseInfo(rf), majorityCur, entry.Command)
	// Note: applyCh is written exactly once, for majorityCur only; any index
	// between the previous match value and majorityCur is skipped.
	rf.applyCh <- ApplyMsg{
		CommandValid: true,
		Command:      entry.Command,
		CommandIndex: majorityCur,
	}
}
问题解决
更新代码
增加一个for循环,将majorityCur与rf.GetMatch(rf.me)之间缺少的command依次提交。 最后更新代码如下
// judgeAndCommit advances this node's commit point and delivers every newly
// committed entry to the service over rf.applyCh, strictly in index order.
//
// It sorts the slice returned by rf.CloneNext(), takes the element at
// len(rf.peers)/2, and treats the index just below it (majorityCur) as the
// newest majority-replicated entry. rf.GetMatch(rf.me) is used as the index
// of the last entry already delivered.
//
// The current-term check is made once, on the entry at majorityCur. When it
// passes, every entry in (rf.GetMatch(rf.me), majorityCur] is delivered
// regardless of its own term: committing a current-term entry also commits
// all entries before it. Checking the term per entry would skip older-term
// entries and reintroduce gaps in the delivered index sequence.
//
// NOTE(review): the commit point is derived from the next-index slice; the
// Raft paper derives it from matchIndex. Confirm that next indexes only
// advance after a successful replication reply.
func judgeAndCommit(rf *Raft) {
	next := rf.CloneNext()
	sort.Ints(next)
	majorityCur := next[len(rf.peers)/2] - 1

	top, err := rf.GetLog(majorityCur)
	if err != nil || top.Term != rf.St.GetCurrentTerm() {
		return
	}

	for cmtIndx := rf.GetMatch(rf.me) + 1; cmtIndx <= majorityCur; cmtIndx++ {
		le, err := rf.GetLog(cmtIndx)
		if err != nil {
			// Stop instead of skipping: delivering a later index after a
			// missing one would break the in-order guarantee.
			return
		}
		rf.SetMatch(rf.me, cmtIndx)
		DPrintf("%v apply index:%v cmd:%v", LogBaseInfo(rf), cmtIndx, le.Command)
		rf.applyCh <- ApplyMsg{
			CommandValid: true,
			Command:      le.Command,
			CommandIndex: cmtIndx,
		}
	}
}
单元测试
// TestApplyOutOfOrder checks that judgeAndCommit delivers every newly
// committed command on applyCh, in order.
//
// The fixture is a five-peer Raft at term 1 whose log holds four term-1
// entries carrying commands 10, 20, 30 and 40. Every nextIndex is 5 and every
// matchIndex is 2, and the test expects commands 30 and 40, in that order.
func TestApplyOutOfOrder(t *testing.T) {
	rf := prepareRaft(5)
	rf.me = 3
	rf.nextIndex = []int{5, 5, 5, 5, 5}
	rf.matchIndex = []int{2, 2, 2, 2, 2}
	rf.St.Log = []LogEntry{NewLogEntry(1, 10), NewLogEntry(1, 20), NewLogEntry(1, 30), NewLogEntry(1, 40)}
	rf.St.SetCurrentTerm(1)

	var receiveCmds []interface{}
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		// Done fires only after the channel is drained, so every append
		// happens before wg.Wait returns and the slice is read.
		defer wg.Done()
		for msg := range rf.applyCh {
			receiveCmds = append(receiveCmds, msg.Command)
		}
	}()

	judgeAndCommit(rf)
	// Closing the channel ends the collector's range loop, so the goroutine
	// exits no matter how many messages were sent.
	close(rf.applyCh)
	wg.Wait()

	expectedCmds := []interface{}{30, 40}
	if len(expectedCmds) != len(receiveCmds) {
		AssertInt(t, len(expectedCmds), len(receiveCmds), "TestApplyOutOfOrder")
		// Stop here: indexing receiveCmds below could go out of range.
		return
	}
	for i := range expectedCmds {
		AssertInterface(t, expectedCmds[i], receiveCmds[i], "TestApplyOutOfOrder")
	}
}
// prepareRaft builds a minimal Raft value for unit tests: an unbuffered
// applyCh, n zero-valued labrpc.ClientEnd peers, and a zero-valued State.
func prepareRaft(n int) *Raft {
	rf := &Raft{
		applyCh: make(chan ApplyMsg),
		St:      State{},
	}
	for remaining := n; remaining > 0; remaining-- {
		rf.peers = append(rf.peers, &labrpc.ClientEnd{})
	}
	return rf
}
// AssertInt compares two ints inside a test.
//
// When expected and actual differ it marks t as failed and reports msg along
// with both values; the test keeps running. When they are equal it logs a
// PASS line for msg.
func AssertInt(t *testing.T, expected, actual int, msg string) {
	// Attribute failure output to the calling test line, not this helper.
	t.Helper()
	if expected != actual {
		// Errorf already marks the test as failed.
		t.Errorf("%v: FAIL", msg)
		t.Errorf("expected: %v", expected)
		t.Errorf("actual: %v", actual)
		return
	}
	t.Logf("%v: PASS", msg)
}
// AssertInterface compares two interface values with != inside a test.
//
// When expected and actual differ it marks t as failed and reports msg along
// with both values; the test keeps running. When they are equal it logs a
// PASS line for msg.
//
// The comparison is Go's built-in interface inequality: values with different
// dynamic types are unequal, and comparing two values that share an
// uncomparable dynamic type (slice, map, function) panics at run time.
func AssertInterface(t *testing.T, expected, actual interface{}, msg string) {
	// Attribute failure output to the calling test line, not this helper.
	t.Helper()
	if expected != actual {
		// Errorf already marks the test as failed.
		t.Errorf("%v: FAIL", msg)
		t.Errorf("expected: %v", expected)
		t.Errorf("actual: %v", actual)
		return
	}
	t.Logf("%v: PASS", msg)
}