目录
Etcd is a distributed, consistent key-value store for shared configuration and service discovery.
ETCD 有 v2和 v3,api 和内部存储都不一样。这里只分析 v2。
基于源码git tag v2.0.0 :git checkout -b version2 v2.0.0
一致性协议使用 raft,raft 协议不在本文叙述范围内。raft 协议相关见ETCD - raft
v2版的 API 文档
etcd API - v2
ETCD 整体架构
Store 是内存kv数据库。存储所有kv数据。
HTTP Server接受来自客户端的请求。如果是数据变更的请求(PUT、POST、QGET(即带 quorum=true 的 GET)等),Server 会将请求结构体序列化成一个raft Entry,然后提交给Raft传播。在每次处理raft的Ready时,EtcdServer先将其中的HardState和Entries(或者快照)存储到WAL中并刷盘(用于重启的时候恢复raft日志);等到raft集群达成一致以后(CommittedEntries),再调用
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState)
使用CommittedEntries对Store的数据实施变更。每一条Entry都是对Store的操作命令。
EtcdServer接收raft通知的核心代码如下:
// run is the EtcdServer loop that consumes batches from s.r.Ready() and, for
// each batch, in this order:
//  1. records the current leader and enables/disables syncC,
//  2. saves and applies an incoming snapshot to raft storage if it is newer
//     than snapi,
//  3. saves HardState and Entries via s.r.storage and appends the entries to
//     s.r.raftStorage,
//  4. sends the outgoing messages,
//  5. recovers the store from the snapshot if it is newer than appliedi,
//  6. applies the not-yet-applied committed entries,
//  7. calls s.r.Advance(),
//  8. triggers a new snapshot once appliedi-snapi exceeds s.r.snapCount.
//
// NOTE(review): syncC, snapi, appliedi, confState and shouldstop are used here
// but not declared in this excerpt; presumably they are declared in a part of
// the function that was elided, together with any other select cases.
func (s *EtcdServer) run() {
// TODO: make raft loop a method on raftNode
for {
select {
case rd := <-s.r.Ready():
// A non-nil SoftState carries leader information: publish the leader id
// atomically, then switch syncC on only while this node is the leader.
if rd.SoftState != nil {
atomic.StoreUint64(&s.r.lead, rd.SoftState.Lead)
if rd.RaftState == raft.StateLeader {
syncC = s.SyncTicker
// TODO: remove the nil checking
// current test utility does not provide the stats
if s.stats != nil {
s.stats.BecomeLeader()
}
} else {
syncC = nil
}
}
// apply snapshot to storage if it is more updated than current snapi
if !raft.IsEmptySnap(rd.Snapshot) && rd.Snapshot.Metadata.Index > snapi {
// Failing to save the snapshot is fatal for the process.
if err := s.r.storage.SaveSnap(rd.Snapshot); err != nil {
log.Fatalf("etcdserver: save snapshot error: %v", err)
}
s.r.raftStorage.ApplySnapshot(rd.Snapshot)
snapi = rd.Snapshot.Metadata.Index
log.Printf("etcdserver: saved incoming snapshot at index %d", snapi)
}
// Save the hard state and entries through s.r.storage first (fatal on
// error); only then append the entries to raftStorage and send messages.
if err := s.r.storage.Save(rd.HardState, rd.Entries); err != nil {
log.Fatalf("etcdserver: save state and entries error: %v", err)
}
s.r.raftStorage.Append(rd.Entries)
s.send(rd.Messages)
// recover from snapshot if it is more updated than current applied
if !raft.IsEmptySnap(rd.Snapshot) && rd.Snapshot.Metadata.Index > appliedi {
if err := s.store.Recovery(rd.Snapshot.Data); err != nil {
log.Panicf("recovery store error: %v", err)
}
s.Cluster.Recover()
appliedi = rd.Snapshot.Metadata.Index
// NOTE(review): this message prints snapi rather than appliedi; the two
// differ when the snapshot is newer than appliedi but not newer than
// snapi — confirm whether appliedi was intended.
log.Printf("etcdserver: recovered from incoming snapshot at index %d", snapi)
}
// TODO(bmizerany): do this in the background, but take
// care to apply entries in a single goroutine, and not
// race them.
if len(rd.CommittedEntries) != 0 {
firsti := rd.CommittedEntries[0].Index
// The committed batch must start at or before appliedi+1; a gap between
// what has been applied and the first committed entry is a panic.
if firsti > appliedi+1 {
log.Panicf("etcdserver: first index of committed entry[%d] should <= appliedi[%d] + 1", firsti, appliedi)
}
// Skip the prefix that has already been applied: the entry at offset
// appliedi+1-firsti is the first one whose index is appliedi+1.
var ents []raftpb.Entry
if appliedi+1-firsti < uint64(len(rd.CommittedEntries)) {
ents = rd.CommittedEntries[appliedi+1-firsti:]
}
if len(ents) > 0 {
// Handle the requests that have already been committed by the raft cluster.
// apply returns the new applied index and a stop flag; when the flag is
// set, the server is stopped from a separate goroutine after a delay of
// 10*100ms (one second).
if appliedi, shouldstop = s.apply(ents, &confState); shouldstop {
go s.stopWithDelay(10*100*time.Millisecond, fmt.Errorf("the member has been permanently removed from the cluster"))
}
}
}
// Presumably tells the raft node that this Ready batch has been fully
// handled — confirm against the raft package documentation.
s.r.Advance()
// Take a new snapshot once more than snapCount entries have been applied
// since the last snapshot index.
if appliedi-snapi > s.r.snapCount {
log.Printf("etcdserver: start to snapshot (applied: %d, lastsnap: %d)", appliedi, snapi)
s.snapshot(appliedi, &confState)
snapi = appliedi
}
}
}
}
raft部分
// raftNode embeds a raft.Node and carries the configuration, storage and
// transport that the server uses alongside it.
type raftNode struct {
	raft.Node

	// snapCount is the configured number of entries that triggers a snapshot.
	snapCount uint64

	// ticker is a receive-only channel of time values (utility).
	ticker <-chan time.Time

	// raftStorage is the in-memory raft storage (utility).
	raftStorage *raft.MemoryStorage

	// storage is the Storage implementation used by the node (utility).
	storage Storage

	// transport is used to send messages to, and receive messages from,
	// the other members. Sending MUST NOT block; dropping messages is
	// acceptable because clients are expected to time out and reissue
	// them. The server panics if transport is nil.
	transport rafthttp.Transporter

	// index and term cache the latest raft index and raft term the server
	// has seen.
	index, term uint64

	// lead is a cached uint64 kept next to index and term.
	lead uint64
}
storage字段的类型是etcdserver.Storage接口,该接口只有etcdserver.storage一种实现。该实现基于WAL(磁盘预写式日志):数据在运行过程中持续写入,但只在raft节点重启恢复的时候才会读出之前存储的快照和entry列表。
raftStorage是raftLog.storage,raftLog.storage的类型是raft.Storage接口,其实现只有raft.MemoryStorage一种,用来存储日志和快照,也就是存储在内存中。
// Storage is the etcdserver.Storage interface: the stable-storage
// operations for raft state, entries and snapshots.
type Storage interface {
	// Save writes ents and st to the underlying stable storage. It MUST
	// block until both st and ents are on stable storage.
	Save(st raftpb.HardState, ents []raftpb.Entry) error

	// SaveSnap writes snap to the underlying stable storage.
	SaveSnap(snap raftpb.Snapshot) error

	// Cut cuts out a new wal file for saving new state and entries.
	//
	// TODO: WAL should be able to control cut itself. After implementing
	// self-controlled cut, remove this method from the interface.
	Cut() error

	// Close closes the Storage and performs finalization.
	Close() error
}
// storage implements Storage by embedding a WAL and a Snapshotter.
//
// Per the author's notes: an entry is written to the wal.WAL; for a
// snapshot, the snapshot metadata is written to the wal.WAL and the snapshot
// file is written to the location represented by the snap.Snapshotter.
type storage struct {
	*wal.WAL
	*snap.Snapshotter
}
type raft struct {
pb.HardState
id uint64
// the log
raftLog *raftLog
prs map