ETCD v2 源码分析

Etcd is a distributed, consistent key-value store for shared configuration and service discovery.

ETCD 有 v2和 v3,api 和内部存储都不一样。这里只分析 v2。
基于源码 git tag v2.0.0:`git checkout -b version2 v2.0.0`

一致性协议使用 raft,raft 协议不在本文叙述范围内。raft 协议相关见ETCD - raft

v2版的 API 文档
etcd API - v2

ETCD 整体架构

在这里插入图片描述
Store 是内存kv数据库。存储所有kv数据。
HTTP Server 接受来自客户端的请求。如果是数据变更的请求(PUT、POST、QGET(即带 quorum=true 的 GET)等),Server 会将请求结构体序列化成一个 raft Entry,然后提交给 Raft 传播。每次收到 Ready 时,EtcdServer 先将其中的 HardState 和 Entries(以及快照)存储到 WAL 中并刷盘(用于重启的时候恢复 raft 日志);等到 raft 集群达成一致以后,已提交的日志会出现在 CommittedEntries 中,然后调用
func (s *EtcdServer) apply(es []raftpb.Entry, confState *raftpb.ConfState)
使用CommittedEntries对Store的数据实施变更。每一条Entry都是对Store的操作命令。
EtcdServer接收raft通知的核心代码如下:


// run is the server loop shown in this excerpt: it receives each Ready batch
// from the raft node and handles it in a fixed order — record leadership
// changes, persist the snapshot, persist hard state and entries, send
// messages, recover the store from the snapshot, apply committed entries,
// call Advance, and finally trigger a local snapshot when enough entries
// have been applied.
//
// NOTE(review): syncC, snapi, appliedi, shouldstop and confState are used but
// not declared in this excerpt, and only one select case is visible;
// presumably the declarations and the remaining cases were elided from the
// original function — confirm against the full source.
func (s *EtcdServer) run() {
   
	// TODO: make raft loop a method on raftNode
	for {
   
		select {
   
		case rd := <-s.r.Ready():
			// A non-nil SoftState carries the current leader ID and this
			// node's raft state.
			if rd.SoftState != nil {
   
				// lead is written atomically.
				// NOTE(review): presumably because other goroutines read it — confirm.
				atomic.StoreUint64(&s.r.lead, rd.SoftState.Lead)
				if rd.RaftState == raft.StateLeader {
   
					// Only the leader gets a live sync channel.
					syncC = s.SyncTicker
					// TODO: remove the nil checking
					// current test utility does not provide the stats
					if s.stats != nil {
   
						s.stats.BecomeLeader()
					}
				} else {
   
					// Not the leader: clear the sync channel.
					syncC = nil
				}
			}

			// apply snapshot to storage if it is more updated than current snapi
			if !raft.IsEmptySnap(rd.Snapshot) && rd.Snapshot.Metadata.Index > snapi {
   
				// Persist the snapshot first; a failure here is fatal.
				if err := s.r.storage.SaveSnap(rd.Snapshot); err != nil {
   
					log.Fatalf("etcdserver: save snapshot error: %v", err)
				}
				s.r.raftStorage.ApplySnapshot(rd.Snapshot)
				snapi = rd.Snapshot.Metadata.Index
				log.Printf("etcdserver: saved incoming snapshot at index %d", snapi)
			}

			// Persist the hard state and the new entries before anything is
			// sent or applied; a failure here is fatal.
			if err := s.r.storage.Save(rd.HardState, rd.Entries); err != nil {
   
				log.Fatalf("etcdserver: save state and entries error: %v", err)
			}
			// Mirror the same entries into the in-memory raft storage.
			s.r.raftStorage.Append(rd.Entries)

			// Messages go out only after the save above has returned.
			s.send(rd.Messages)

			// recover from snapshot if it is more updated than current applied
			if !raft.IsEmptySnap(rd.Snapshot) && rd.Snapshot.Metadata.Index > appliedi {
   
				if err := s.store.Recovery(rd.Snapshot.Data); err != nil {
   
					log.Panicf("recovery store error: %v", err)
				}
				s.Cluster.Recover()
				appliedi = rd.Snapshot.Metadata.Index
				// NOTE(review): this logs snapi although the index just
				// recovered to is stored in appliedi — confirm which is intended.
				log.Printf("etcdserver: recovered from incoming snapshot at index %d", snapi)
			}
			// TODO(bmizerany): do this in the background, but take
			// care to apply entries in a single goroutine, and not
			// race them.
			if len(rd.CommittedEntries) != 0 {
   
				firsti := rd.CommittedEntries[0].Index
				// A first index beyond appliedi+1 would leave a gap of
				// unapplied entries, so the loop panics.
				if firsti > appliedi+1 {
   
					log.Panicf("etcdserver: first index of committed entry[%d] should <= appliedi[%d] + 1", firsti, appliedi)
				}
				var ents []raftpb.Entry
				// Skip the leading entries whose index is <= appliedi; ents
				// stays nil when every committed entry is already applied.
				if appliedi+1-firsti < uint64(len(rd.CommittedEntries)) {
   
					ents = rd.CommittedEntries[appliedi+1-firsti:]
				}
				if len(ents) > 0 {
   
					// Apply the requests that the raft cluster has already committed.
					if appliedi, shouldstop = s.apply(ents, &confState); shouldstop {
   
						// Stop in a separate goroutine after a delay of
						// 10*100ms (one second).
						go s.stopWithDelay(10*100*time.Millisecond, fmt.Errorf("the member has been permanently removed from the cluster"))
					}
				}
			}

			// NOTE(review): presumably tells the raft node that this Ready
			// batch has been fully handled — confirm against the raft package.
			s.r.Advance()

			// Snapshot once more than snapCount entries have been applied
			// since the last snapshot index.
			if appliedi-snapi > s.r.snapCount {
   
				log.Printf("etcdserver: start to snapshot (applied: %d, lastsnap: %d)", appliedi, snapi)
				s.snapshot(appliedi, &confState)
				snapi = appliedi
			}
		}
	}
}

raft部分

// raftNode embeds raft.Node and groups the configuration, storage, transport
// and cached raft state that go with it.
type raftNode struct {
   
	raft.Node

	// config
	snapCount uint64 // number of entries to trigger a snapshot

	// utility
	// ticker is a receive-only channel of time values.
	// NOTE(review): presumably drives raft ticks — confirm against the caller.
	ticker      <-chan time.Time
	// raftStorage is the in-memory raft storage (raft.MemoryStorage).
	raftStorage *raft.MemoryStorage
	// storage is the Storage implementation used to save state, entries and snapshots.
	storage     Storage
	// transport specifies the transport to send and receive msgs to members.
	// Sending messages MUST NOT block. It is okay to drop messages, since
	// clients should timeout and reissue their messages.
	// If transport is nil, server will panic.
	transport rafthttp.Transporter

	// Cache of the latest raft index and raft term the server has seen
	index uint64
	term  uint64
	// lead is stored with atomic.StoreUint64 when a Ready carries a SoftState.
	lead  uint64
}

storage 字段的类型是 etcdserver.Storage 接口,只有 etcdserver.storage 一种实现。该实现基于 WAL(磁盘预写式日志):运行时负责把 HardState、Entry 和快照落盘,只有在 raft 节点重启恢复的时候才会被读取,读出之前存储的快照和 entry 列表。
raftStorage 对应 raftLog.storage,其类型是 raft.Storage 接口,实现只有 raft.MemoryStorage 一种,用来存储日志和快照,也就是存储在内存中。

// Storage (etcdserver.Storage) is the interface for saving raft hard state,
// entries and snapshots to stable storage.
type Storage interface {
   
	// Save function saves ents and state to the underlying stable storage.
	// Save MUST block until st and ents are on stable storage.
	Save(st raftpb.HardState, ents []raftpb.Entry) error
	// SaveSnap function saves snapshot to the underlying stable storage.
	SaveSnap(snap raftpb.Snapshot) error

	// TODO: WAL should be able to control cut itself. After implement self-controlled cut,
	// remove it in this interface.
	// Cut cuts out a new wal file for saving new state and entries.
	Cut() error
	// Close closes the Storage and performs finalization.
	Close() error
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值