I ran into some issues with etcd's watch (subscription) feature recently. I had never paid much attention to this part of the codebase before, so I took some time to walk through how it works.
Watch monitors a single key or a range of keys, and any change to a watched key produces a message. The concrete implementation of the KV interface is the store struct; Watch is implemented as a layer wrapped around it, called watchableStore, which overrides store's Write method.
As covered in the MVCC articles, every write to store goes through the TxnWrite returned by the Write method, so overriding Write means every write operation passes through watchableStore. From the earlier mvcc post, the most direct entry point is the Put function:
func (wv *writeView) Put(key, value []byte, lease lease.LeaseID) (rev int64) {
	tw := wv.kv.Write()
	defer tw.End()
	return tw.Put(key, value, lease)
}
Since watch is just a layer around store (see the Write function further below), when the Put finishes and the deferred End() runs, it necessarily lands in watch's End:
func (tw *watchableStoreTxnWrite) End() {
	changes := tw.Changes()
	if len(changes) == 0 {
		tw.TxnWrite.End()
		return
	}

	rev := tw.Rev() + 1
	evs := make([]mvccpb.Event, len(changes))
	for i, change := range changes {
		evs[i].Kv = &changes[i]
		if change.CreateRevision == 0 {
			evs[i].Type = mvccpb.DELETE
			evs[i].Kv.ModRevision = rev
		} else {
			evs[i].Type = mvccpb.PUT
		}
	}

	// end write txn under watchable store lock so the updates are visible
	// when asynchronous event posting checks the current store revision
	tw.s.mu.Lock()
	tw.s.notify(rev, evs) // enter the watch machinery
	tw.TxnWrite.End()     // then finish the normal transaction
	tw.s.mu.Unlock()
}
type watchableStoreTxnWrite struct {
	TxnWrite
	s *watchableStore
}

func (s *watchableStore) Write() TxnWrite { return &watchableStoreTxnWrite{s.store.Write(), s} }
When the transaction commits, watchableStoreTxnWrite first packs this batch of changes into Events (a change whose CreateRevision is 0 is the tombstone left by a delete, hence the DELETE event type), then calls notify to publish them, and only then actually commits with TxnWrite.End(). At this point the pending messages (Events) have entered the watch machinery through notify. A few concepts before going deeper:
(1) Event: changes are published as Events; an Event carries the KeyValue plus the operation type (Put, Delete, etc.).
(2) watcher: watches a single key or a range of keys; when something changes, the watcher pushes the change out through a channel.
(3) watcherGroup: manages multiple watchers and can quickly find, by key, the watcher or watchers listening on that key.
(4) watchableStore: embeds store and implements the watch capability on top of it. It manages two watcherGroups, synced and unsynced, plus a victims buffer that caches Events that could not be sent out yet.
(5) watchStream: a wrapper around watchableStore. Because watchableStore embeds store, it exposes many methods that have nothing to do with Watch, so watchStream wraps it once more and exposes only the watch-related ones. A simplified sketch of these types follows the list.
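For reference, the core definitions behind these concepts look roughly like this (abridged from mvcc/watchable_store.go and mvcc/watcher.go; the exact field set varies a little between etcd versions, so treat this as a sketch rather than the verbatim source):

// watcher: one subscription on a key, or on the range [key, end).
type watcher struct {
	key []byte // the watched key, or the start of the range
	end []byte // nil for a single key, otherwise the end of the range

	victim bool  // set when a send on ch failed because it was full
	minRev int64 // the oldest revision this watcher still needs

	id  WatchID
	fcs []FilterFunc         // event filters applied in send()
	ch  chan<- WatchResponse // shared output channel, drained by watchStream
}

// watchableStore: the plain store plus the two watcher groups and victims.
type watchableStore struct {
	*store

	mu sync.RWMutex

	victims []watcherBatch // event batches whose channel send blocked
	victimc chan struct{}

	unsynced watcherGroup // watchers that still need to catch up
	synced   watcherGroup // watchers fully caught up with store progress
}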
func (s *watchableStore) notify(rev int64, evs []mvccpb.Event) {
	var victim watcherBatch
	// find every watcher affected by these events; synced uses a map plus
	// a red-black-tree-based interval tree to locate them quickly
	for w, eb := range newWatcherBatch(&s.synced, evs) {
		if eb.revs != 1 {
			plog.Panicf("unexpected multiple revisions in notification")
		}
		// push the response out through the watcher's channel
		if w.send(WatchResponse{WatchID: w.id, Events: eb.evs, Revision: rev}) {
			pendingEventsGauge.Add(float64(len(eb.evs)))
		} else {
			// move slow watcher to victims
			w.minRev = rev + 1
			if victim == nil {
				victim = make(watcherBatch)
			}
			w.victim = true
			victim[w] = eb
			s.synced.delete(w)
			slowWatcherGauge.Inc()
		}
	}
	// cache the events that could not be sent because the channel was full;
	// they are retried and eventually resent via unsynced
	s.addVictim(victim)
}
Once the affected watchers are found, notify calls each watcher's send() method to push the Events out:
func (w *watcher) send(wr WatchResponse) bool {
	progressEvent := len(wr.Events) == 0

	if len(w.fcs) != 0 {
		ne := make([]mvccpb.Event, 0, len(wr.Events))
		for i := range wr.Events {
			filtered := false
			for _, filter := range w.fcs {
				if filter(wr.Events[i]) {
					filtered = true
					break
				}
			}
			if !filtered {
				ne = append(ne, wr.Events[i])
			}
		}
		wr.Events = ne
	}

	// if all events are filtered out, we should send nothing.
	if !progressEvent && len(wr.Events) == 0 {
		return true
	}
	select {
	case w.ch <- wr:
		return true
	default:
		return false
	}
}
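The w.fcs loop in send() above is where event filters run: a FilterFunc returns true when the event should be dropped. As a sketch, here is a filter that hides DELETE events, which is effectively what the client-side WithFilterDelete option installs (putOnly is a name made up here for illustration):

// putOnly reports whether the event should be filtered out; dropping
// DELETE events leaves the watcher with PUTs only.
func putOnly(e mvccpb.Event) bool {
	return e.Type == mvccpb.DELETE
}

// A filter is installed when the watcher is created through
// watchStream.Watch (shown further below):
//   id := ws.Watch(key, nil, startRev, putOnly)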
synced uses both a map and the adt package (a red-black-tree-based interval tree) to quickly find the watchers whose conditions match a key. A map alone is not enough, because a watch can cover a whole range of keys.
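To make that dual lookup concrete, here is a minimal, self-contained sketch. It is not etcd's adt package (a linear scan stands in for the interval tree), but the containment test for a range watcher is the same:

package main

import (
	"bytes"
	"fmt"
)

// watcherSketch watches a single key when end is nil, otherwise the
// half-open range [begin, end).
type watcherSketch struct {
	id         int
	begin, end []byte
}

func (w *watcherSketch) matches(key []byte) bool {
	if len(w.end) == 0 {
		return bytes.Equal(w.begin, key) // exact-key watcher
	}
	return bytes.Compare(w.begin, key) <= 0 && bytes.Compare(key, w.end) < 0
}

// groupSketch plays the role of watcherGroup: exact-key watchers sit in a
// map for O(1) lookup; range watchers live in an interval tree in etcd,
// for which a linear scan stands in here.
type groupSketch struct {
	keyWatchers map[string][]*watcherSketch
	ranges      []*watcherSketch
}

func (g *groupSketch) watchersFor(key []byte) []*watcherSketch {
	ws := append([]*watcherSketch(nil), g.keyWatchers[string(key)]...)
	for _, w := range g.ranges {
		if w.matches(key) {
			ws = append(ws, w)
		}
	}
	return ws
}

func main() {
	g := &groupSketch{keyWatchers: map[string][]*watcherSketch{}}
	g.keyWatchers["foo"] = []*watcherSketch{{id: 1, begin: []byte("foo")}}
	g.ranges = append(g.ranges, &watcherSketch{id: 2, begin: []byte("f"), end: []byte("g")})

	// both the exact-key watcher and the range watcher match "foo"
	for _, w := range g.watchersFor([]byte("foo")) {
		fmt.Printf("watcher %d matches key foo\n", w.id)
	}
}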
As mentioned above, victims stores the messages that could not be sent out. The watcher records the current revision and marks itself as a victim; the change batch from this write is stashed in watchableStore's victims, and the watcher is kicked out of synced. From that point on, the watcher no longer records further changes to the key directly; for example, once a watcher for some key lands in victims, subsequent changes to that key never reach it through notify. Are those messages simply lost, then? Of course not: when the watcher turned victim it recorded the revision of that moment, and every change since that revision will later be replayed to the user.
When a watchableStore is created, two worker goroutines are started alongside it:

go s.syncWatchersLoop()
go s.syncVictimsLoop()
The first goroutine promotes unsynced watchers to synced.

The second loops over watchableStore's victims and tries to drain them. As described above, when a watcher's channel was full, the pending Events were parked in victims; this goroutine keeps retrying the send, and as soon as the channel has room and the Event goes out, the watcher is moved into unsynced and is no longer a victim. Once it is in unsynced, syncWatchersLoop takes over: since the watcher missed messages while it was a victim, the loop uses the revision the watcher saved to find every change since then, hands all events for the watched keys to the watcher, and once they have all been sent, moves the watcher from unsynced back to synced. That resolves the full-channel problem. One question remains, though: what if the node crashes right at this point, are the subscribed messages lost? Not really: the events are reconstructed from the MVCC history persisted in the backend, so a client that reconnects and re-watches from the last revision it received can replay everything it missed, provided that revision has not been compacted away.
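The catch-up step can be pictured like this; a minimal sketch, not etcd's code, assuming a hypothetical helper rangeEvents that replays every event for the watcher's keys since minRev out of the persisted MVCC history (the real logic lives in syncWatchers):

// catchUp promotes one unsynced watcher back to synced once its missed
// history has been delivered. Sketch only; rangeEvents is hypothetical.
func catchUp(s *watchableStore, w *watcher) {
	curRev := s.store.Rev()
	// replay all events with ModRevision >= w.minRev for w's keys
	evs := rangeEvents(s, w, w.minRev, curRev)
	if !w.send(WatchResponse{WatchID: w.id, Events: evs, Revision: curRev}) {
		return // channel full again: stay unsynced, retry on the next loop
	}
	w.minRev = curRev + 1
	s.unsynced.delete(w)
	s.synced.add(w)
}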
Back to the normal flow. We said the WatchResponse is sent out through a channel, but who receives it? That is the job of the watchStream structure mentioned earlier; its Watch function is fairly simple:
func (ws *watchStream) Watch(key, end []byte, startRev int64, fcs ...FilterFunc) WatchID {
	// prevent wrong range where key >= end lexicographically
	// watch request with 'WithFromKey' has empty-byte range end
	if len(end) != 0 && bytes.Compare(key, end) != -1 {
		return -1
	}

	ws.mu.Lock()
	defer ws.mu.Unlock()
	if ws.closed {
		return -1
	}

	id := ws.nextID
	ws.nextID++

	w, c := ws.watchable.watch(key, end, startRev, id, ws.ch, fcs...)

	ws.cancels[id] = c
	ws.watchers[id] = w
	return id
}
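From a client's point of view, all of this machinery hides behind a single call. A minimal example with the clientv3 package (the endpoint is a placeholder; newer releases import go.etcd.io/etcd/clientv3 instead of the coreos path):

package main

import (
	"context"
	"fmt"

	"github.com/coreos/etcd/clientv3"
)

func main() {
	cli, err := clientv3.New(clientv3.Config{Endpoints: []string{"127.0.0.1:2379"}})
	if err != nil {
		panic(err)
	}
	defer cli.Close()

	// Every WatchResponse received here was produced by the server-side
	// sendLoop shown later and carries one or more mvccpb.Events.
	for wresp := range cli.Watch(context.Background(), "foo", clientv3.WithPrefix()) {
		for _, ev := range wresp.Events {
			fmt.Printf("%s %q -> %q\n", ev.Type, ev.Kv.Key, ev.Kv.Value)
		}
	}
}

On the server side, each client watch stream is served by watchServer.Watch, which builds a serverWatchStream and starts a recvLoop (reading watch create/cancel requests) and a sendLoop (pushing events back):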
func (ws *watchServer) Watch(stream pb.Watch_WatchServer) (err error) {
	sws := serverWatchStream{
		clusterID:   ws.clusterID,
		memberID:    ws.memberID,
		raftTimer:   ws.raftTimer,
		watchable:   ws.watchable,
		gRPCStream:  stream,
		watchStream: ws.watchable.NewWatchStream(), // create the watchStream
		// chan for sending control response like watcher created and canceled.
		ctrlStream: make(chan *pb.WatchResponse, ctrlStreamBufLen),
		progress:   make(map[mvcc.WatchID]bool),
		prevKV:     make(map[mvcc.WatchID]bool),
		closec:     make(chan struct{}),
		ag:         ws.ag,
	}

	sws.wg.Add(1)
	go func() {
		sws.sendLoop()
		sws.wg.Done()
	}()

	errc := make(chan error, 1)
	// Ideally recvLoop would also use sws.wg to signal its completion
	// but when stream.Context().Done() is closed, the stream's recv
	// may continue to block since it uses a different context, leading to
	// deadlock when calling sws.close().
	go func() {
		if rerr := sws.recvLoop(); rerr != nil {
			if isClientCtxErr(stream.Context().Err(), rerr) {
				plog.Debugf("failed to receive watch request from gRPC stream (%q)", rerr.Error())
			} else {
				plog.Warningf("failed to receive watch request from gRPC stream (%q)", rerr.Error())
			}
			errc <- rerr
		}
	}()

	select {
	case err = <-errc:
		close(sws.ctrlStream)
	case <-stream.Context().Done():
		err = stream.Context().Err()
		// the only server-side cancellation is noleader for now.
		if err == context.Canceled {
			err = rpctypes.ErrGRPCNoLeader
		}
	}
	sws.close()
	return err
}
The responses pushed into the channel are finally consumed here, from sws.watchStream.Chan():
func (sws *serverWatchStream) sendLoop() {
	// watch ids that are currently active
	ids := make(map[mvcc.WatchID]struct{})
	// watch responses pending on a watch id creation message
	pending := make(map[mvcc.WatchID][]*pb.WatchResponse)

	interval := GetProgressReportInterval()
	progressTicker := time.NewTicker(interval)

	defer func() {
		progressTicker.Stop()
		// drain the chan to clean up pending events
		for ws := range sws.watchStream.Chan() {
			mvcc.ReportEventReceived(len(ws.Events))
		}
		for _, wrs := range pending {
			for _, ws := range wrs {
				mvcc.ReportEventReceived(len(ws.Events))
			}
		}
	}()

	for {
		select {
		case wresp, ok := <-sws.watchStream.Chan():
			if !ok {
				return
			}

			// TODO: evs is []mvccpb.Event type
			// either return []*mvccpb.Event from the mvcc package
			// or define protocol buffer with []mvccpb.Event.
			evs := wresp.Events
			events := make([]*mvccpb.Event, len(evs))
			sws.mu.Lock()
			needPrevKV := sws.prevKV[wresp.WatchID]
			sws.mu.Unlock()
			for i := range evs {
				events[i] = &evs[i]
				if needPrevKV {
					opt := mvcc.RangeOptions{Rev: evs[i].Kv.ModRevision - 1}
					r, err := sws.watchable.Range(evs[i].Kv.Key, nil, opt)
					if err == nil && len(r.KVs) != 0 {
						events[i].PrevKv = &(r.KVs[0])
					}
				}
			}

			canceled := wresp.CompactRevision != 0
			wr := &pb.WatchResponse{
				Header:          sws.newResponseHeader(wresp.Revision),
				WatchId:         int64(wresp.WatchID),
				Events:          events,
				CompactRevision: wresp.CompactRevision,
				Canceled:        canceled,
			}

			if _, hasId := ids[wresp.WatchID]; !hasId {
				// buffer if id not yet announced
				wrs := append(pending[wresp.WatchID], wr)
				pending[wresp.WatchID] = wrs
				continue
			}

			mvcc.ReportEventReceived(len(evs))
			if err := sws.gRPCStream.Send(wr); err != nil {
				if isClientCtxErr(sws.gRPCStream.Context().Err(), err) {
					plog.Debugf("failed to send watch response to gRPC stream (%q)", err.Error())
				} else {
					plog.Warningf("failed to send watch response to gRPC stream (%q)", err.Error())
				}
				return
			}

			sws.mu.Lock()
			if len(evs) > 0 && sws.progress[wresp.WatchID] {
				// elide next progress update if sent a key update
				sws.progress[wresp.WatchID] = false
			}
			sws.mu.Unlock()

		case c, ok := <-sws.ctrlStream:
			if !ok {
				return
			}

			if err := sws.gRPCStream.Send(c); err != nil {
				if isClientCtxErr(sws.gRPCStream.Context().Err(), err) {
					plog.Debugf("failed to send watch control response to gRPC stream (%q)", err.Error())
				} else {
					plog.Warningf("failed to send watch control response to gRPC stream (%q)", err.Error())
				}
				return
			}

			// track id creation
			wid := mvcc.WatchID(c.WatchId)
			if c.Canceled {
				delete(ids, wid)
				continue
			}
			if c.Created {
				// flush buffered events
				ids[wid] = struct{}{}
				for _, v := range pending[wid] {
					mvcc.ReportEventReceived(len(v.Events))
					if err := sws.gRPCStream.Send(v); err != nil {
						if isClientCtxErr(sws.gRPCStream.Context().Err(), err) {
							plog.Debugf("failed to send pending watch response to gRPC stream (%q)", err.Error())
						} else {
							plog.Warningf("failed to send pending watch response to gRPC stream (%q)", err.Error())
						}
						return
					}
				}
				delete(pending, wid)
			}

		case <-progressTicker.C:
			sws.mu.Lock()
			for id, ok := range sws.progress {
				if ok {
					sws.watchStream.RequestProgress(id)
				}
				sws.progress[id] = true
			}
			sws.mu.Unlock()

		case <-sws.closec:
			return
		}
	}
}
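One last detail: the progressTicker branch above is what backs the client's WithProgressNotify option. A watcher created with it periodically receives an empty WatchResponse whose header carries the current revision, so a client watching an idle key can still persist a safe resume point. A short sketch, reusing cli from the earlier example:

// Progress notifications carry no events, only the current revision.
rch := cli.Watch(context.Background(), "idle-key", clientv3.WithProgressNotify())
for wresp := range rch {
	if wresp.IsProgressNotify() {
		fmt.Println("progress notify at revision", wresp.Header.Revision)
	}
}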