记一次线上nsq积压问题排查经历

现象

sre同事提醒线上环境nsq队列积压报警,具体表现为nsq下某个topic的其中一个channel消息积压严重,平常都是0的,现在积压到12万,初步怀疑,channel里面的数据没有被消费,或者消费不及时

登录线上环境查看

curl http://nsqdipaddr:port/stats
​```
   [xx         ] depth: 0     be-depth: 0     msgs: 26067    e2e%:
      [xx_nsq          ] depth: 25    be-depth: 25    inflt: 18   def: 0    re-q: 36420098 timeout: 0     msgs: 26067    e2e%:
        [V2 node17               ] state: 3 inflt: 6    rdy: 6    fin: 8746     re-q: 11142859 msgs: 11151611 connected: 503h50m18s
        [V2 node16               ] state: 3 inflt: 6    rdy: 6    fin: 8542     re-q: 11123032 msgs: 11131580 connected: 503h50m17s
        [V2 node15               ] state: 3 inflt: 6    rdy: 6    fin: 8395     re-q: 10965582 msgs: 10973983 connected: 503h50m17s
      [xx_tc1             ] depth: 0     be-depth: 0     inflt: 210  def: 20334 re-q: 2225699170 timeout: 0     msgs: 26067    e2e%:
        [V2 node17               ] state: 3 inflt: 6    rdy: 341  fin: 112      re-q: 43524556 msgs: 43524674 connected: 503h43m2s
        [V2 node17               ] state: 3 inflt: 6    rdy: 341  fin: 97       re-q: 43968929 msgs: 43969032 connected: 503h43m2s
        [V2 node16               ] state: 3 inflt: 0    rdy: 341  fin: 157      re-q: 47159484 msgs: 47159641 connected: 503h43m7s
      [xx_tc2             ] depth: 0     be-depth: 0     inflt: 210  def: 20334 re-q: 2225699170 timeout: 0     msgs: 26067    e2e%:
        [V2 node17               ] state: 3 inflt: 6    rdy: 341  fin: 112      re-q: 43524556 msgs: 43524674 connected: 503h43m2s
        [V2 node17               ] state: 3 inflt: 6    rdy: 341  fin: 97       re-q: 43968929 msgs: 43969032 connected: 503h43m2s
        [V2 node16               ] state: 3 inflt: 0    rdy: 341  fin: 157      re-q: 47159484 msgs: 47159641 connected: 503h43m7s
​```
# 可以看到有3个channel
# 通过prometheus 查询,第二个channel积压过大

原因分析

首先从nsqd的代码里面,确定下关于topic这块nsqd都做了哪些,nsqd代码位于nsqd/nsqd.go nsqd/topic.go

  1. Topic的获取
// GetTopic returns the *Topic for topicName, creating and registering it
// on first access. Safe for concurrent callers.
func (n *NSQD) GetTopic(topicName string) *Topic {
	// most likely, we already have this topic, so try read lock first.
	n.RLock()
	t, ok := n.topicMap[topicName]
	n.RUnlock()
	if ok {
		return t
	}

	n.Lock()

	// double-check under the write lock: another goroutine may have created
	// the topic between RUnlock and Lock above.
	t, ok = n.topicMap[topicName]
	if ok {
		n.Unlock()
		return t
	}
	// when a topic deletes itself, remove it from the nsqd-level map
	deleteCallback := func(t *Topic) {
		n.DeleteExistingTopic(t.name)
	}
	t = NewTopic(topicName, &context{n}, deleteCallback)
	n.topicMap[topicName] = t

	n.Unlock()

	n.logf(LOG_INFO, "TOPIC(%s): created", t.name)
	// topic is created but messagePump not yet started

	// if loading metadata at startup, no lookupd connections yet, topic started after load
	if atomic.LoadInt32(&n.isLoading) == 1 {
		return t
	}

	// if using lookupd, make a blocking call to get the topics, and immediately create them.
	// this makes sure that any message received is buffered to the right channels
	lookupdHTTPAddrs := n.lookupdHTTPAddrs()
	if len(lookupdHTTPAddrs) > 0 {
		channelNames, err := n.ci.GetLookupdTopicChannels(t.name, lookupdHTTPAddrs)
		if err != nil {
			n.logf(LOG_WARN, "failed to query nsqlookupd for channels to pre-create for topic %s - %s", t.name, err)
		}
		for _, channelName := range channelNames {
			if strings.HasSuffix(channelName, "#ephemeral") {
				continue // do not create ephemeral channel with no consumer client
			}
			t.GetChannel(channelName)
		}
	} else if len(n.getOpts().NSQLookupdTCPAddresses) > 0 {
		n.logf(LOG_ERROR, "no available nsqlookupd to query for channels to pre-create for topic %s", t.name)
	}

	// now that all channels are added, start topic messagePump
	t.Start()
	return t
}

GetTopic函数用于获取topic对象:首先尝试从topicMap表中获取,如果指定的topic已存在,则直接返回该topic对象;当topic不存在时,调用NewTopic创建新的topic并加入topicMap表。如果启用了nsqlookupd,还会向lookupd查询该topic的所有channel,并逐个调用GetChannel预先创建这些channel,最后调用Start启动topic的messagePump。

  1. Topic创建
// NewTopic constructs a Topic, wires up its backend queue (disk-backed
// unless the name marks it ephemeral), starts its messagePump goroutine
// and notifies nsqd so metadata can be persisted.
func NewTopic(topicName string, ctx *context, deleteCallback func(*Topic)) *Topic {
	t := &Topic{
		name:              topicName,
		channelMap:        make(map[string]*Channel),
		memoryMsgChan:     nil,
		startChan:         make(chan int, 1),
		exitChan:          make(chan int),
		channelUpdateChan: make(chan int),
		ctx:               ctx,
		paused:            0,
		pauseChan:         make(chan int),
		deleteCallback:    deleteCallback,
		idFactory:         NewGUIDFactory(ctx.nsqd.getOpts().ID),
	}
	// create mem-queue only if size > 0 (do not use unbuffered chan)
	if ctx.nsqd.getOpts().MemQueueSize > 0 {
		t.memoryMsgChan = make(chan *Message, ctx.nsqd.getOpts().MemQueueSize)
	}
	// a name ending in "#ephemeral" marks a memory-only topic: it gets a
	// no-op dummy backend instead of a disk queue
	if strings.HasSuffix(topicName, "#ephemeral") {
		t.ephemeral = true
		t.backend = newDummyBackendQueue()
	} else {
		// adapt diskqueue's log levels onto nsqd's logger
		dqLogf := func(level diskqueue.LogLevel, f string, args ...interface{}) {
			opts := ctx.nsqd.getOpts()
			lg.Logf(opts.Logger, opts.LogLevel, lg.LogLevel(level), f, args...)
		}
		// durable topic: messages that overflow memoryMsgChan spill to disk
		t.backend = diskqueue.New(
			topicName,
			ctx.nsqd.getOpts().DataPath,
			ctx.nsqd.getOpts().MaxBytesPerFile,
			int32(minValidMsgLength),
			int32(ctx.nsqd.getOpts().MaxMsgSize)+minValidMsgLength,
			ctx.nsqd.getOpts().SyncEvery,
			ctx.nsqd.getOpts().SyncTimeout,
			dqLogf,
		)
	}

	// run the loop that fans messages out to this topic's channels
	t.waitGroup.Wrap(t.messagePump)

	// notify nsqd so topic metadata is persisted (PersistMetadata)
	t.ctx.nsqd.Notify(t)

	return t
}

NewTopic函数会首先创建一个topic结构,然后判断这个topic是否为临时的topic。topic里面有个backend变量,负责对topic消息进行持久化处理:对于临时的topic来讲,消息只保存在内存中,因此backend使用newDummyBackendQueue函数初始化,该函数生成一个无任何功能的dummyBackendQueue结构;对于永久的topic,backend使用diskqueue.New返回的diskQueue类型赋值,并开启新的goroutine来进行数据的持久化。dummyBackendQueue和diskQueue都实现了Backend接口,因此,在之后可以使用backend统一处理。

随后,NewTopic函数开启一个新的goroutine来执行messagePump函数,该函数负责消息循环,将进入topic中的消息投递到channel中。

最后,NewTopic函数执行t.ctx.nsqd.Notify(t),该函数在topic和channel创建、停止的时候调用, Notify函数通过执行PersistMetadata函数,将topic和channel的信息写到文件中。

  1. 消息投递
// messagePump is the topic's message loop: it reads messages from the
// in-memory queue and the backend (disk) queue and copies each one to
// every channel of this topic.
func (t *Topic) messagePump() {
	var msg *Message
	var buf []byte
	var err error
	var chans []*Channel
	var memoryMsgChan chan *Message
	var backendChan chan []byte

	// do not pass messages before Start(), but avoid blocking Pause() or GetChannel()
	for {
		select {
		case <-t.channelUpdateChan:
			continue
		case <-t.pauseChan:
			continue
		case <-t.exitChan:
			goto exit
		case <-t.startChan:
		}
		break
	}
	// snapshot the current channel set under the read lock
	t.RLock()
	for _, c := range t.channelMap {
		chans = append(chans, c)
	}
	t.RUnlock()
	// only subscribe to the message sources when there is at least one
	// channel and the topic is not paused; nil channels simply block in
	// the select below, effectively pausing consumption
	if len(chans) > 0 && !t.IsPaused() {
		memoryMsgChan = t.memoryMsgChan
		backendChan = t.backend.ReadChan()
	}

	// main message loop
	for {
		select {
		case msg = <-memoryMsgChan:
		case buf = <-backendChan:
			// disk-backed messages are stored encoded; decode before delivery
			msg, err = decodeMessage(buf)
			if err != nil {
				t.ctx.nsqd.logf(LOG_ERROR, "failed to decode message - %s", err)
				continue
			}
		case <-t.channelUpdateChan:
			// channel set changed: rebuild the snapshot and re-evaluate
			// whether the message sources should stay subscribed
			chans = chans[:0]
			t.RLock()
			for _, c := range t.channelMap {
				chans = append(chans, c)
			}
			t.RUnlock()
			if len(chans) == 0 || t.IsPaused() {
				memoryMsgChan = nil
				backendChan = nil
			} else {
				memoryMsgChan = t.memoryMsgChan
				backendChan = t.backend.ReadChan()
			}
			continue
		case <-t.pauseChan:
			// pause/unpause signal: stop or resume consumption by
			// nil-ing / restoring the source channels
			if len(chans) == 0 || t.IsPaused() {
				memoryMsgChan = nil
				backendChan = nil
			} else {
				memoryMsgChan = t.memoryMsgChan
				backendChan = t.backend.ReadChan()
			}
			continue
		case <-t.exitChan:
			goto exit
		}

		// fan the message out to every channel of this topic
		for i, channel := range chans {
			chanMsg := msg
			// copy the message because each channel
			// needs a unique instance but...
			// fastpath to avoid copy if its the first channel
			// (the topic already created the first copy)
			if i > 0 {
				chanMsg = NewMessage(msg.ID, msg.Body)
				chanMsg.Timestamp = msg.Timestamp
				chanMsg.deferred = msg.deferred
			}
			if chanMsg.deferred != 0 {
				// deferred messages go to the channel's deferred queue
				channel.PutMessageDeferred(chanMsg, chanMsg.deferred)
				continue
			}
			err := channel.PutMessage(chanMsg)
			if err != nil {
				t.ctx.nsqd.logf(LOG_ERROR,
					"TOPIC(%s) ERROR: failed to put msg(%s) to channel(%s) - %s",
					t.name, msg.ID, channel.name, err)
			}
		}
	}

exit:
	t.ctx.nsqd.logf(LOG_INFO, "TOPIC(%s): closing ... messagePump", t.name)
}

messagePump函数会获取channelMap表中的所有channel信息,调用PutMessage往所有channel投递信息,我们来看下nsq代码中对channel的定义如下:

  • Channel表示NSQ通道的具体类型(并实现队列接口),每个topic可以有多个通道,每个通道都有自己唯一的消费者。对于发往topic的信息,nsqd向该Topic下的所有Channel投递消息,而同一个Channel只投递一次,Channel下如果存在多个消费者,则随机选择一个消费者做投递。

img

  • channel的初始化流程
// NewChannel initializes a channel bound to topicName/channelName with the
// nsqd context and a callback invoked when the channel deletes itself.
func NewChannel(topicName string, channelName string, ctx *context,
	deleteCallback func(*Channel)) *Channel {

	c := &Channel{
		topicName:      topicName,
		name:           channelName,
		memoryMsgChan:  nil,
		clients:        make(map[int64]Consumer),
		deleteCallback: deleteCallback,
		ctx:            ctx,
	}
	// create the in-memory queue only when mem-queue-size > 0
	if ctx.nsqd.getOpts().MemQueueSize > 0 {
		c.memoryMsgChan = make(chan *Message, ctx.nsqd.getOpts().MemQueueSize)
	}
	// set up e2eProcessingLatencyStream for end-to-end latency stats;
	// E2EProcessingLatencyPercentiles lists the message-processing-time
	// percentiles to track (comma-separated, default none)
	if len(ctx.nsqd.getOpts().E2EProcessingLatencyPercentiles) > 0 {
		c.e2eProcessingLatencyStream = quantile.New(
			ctx.nsqd.getOpts().E2EProcessingLatencyWindowTime,
			ctx.nsqd.getOpts().E2EProcessingLatencyPercentiles,
		)
	}

	c.initPQ() // initPQ is explained below

	// a name ending in "#ephemeral" marks a temporary channel that goes
	// away after the last client disconnects; even when the queue grows
	// past mem-queue-size it is never flushed to disk
	if strings.HasSuffix(channelName, "#ephemeral") {
		c.ephemeral = true
		c.backend = newDummyBackendQueue()
	} else {
		dqLogf := func(level diskqueue.LogLevel, f string, args ...interface{}) {
			opts := ctx.nsqd.getOpts()
			lg.Logf(opts.Logger, opts.LogLevel, lg.LogLevel(level), f, args...)
		}
		// durable channel: use diskqueue to persist overflow to local disk
		backendName := getBackendName(topicName, channelName)
		c.backend = diskqueue.New(
			backendName,
			ctx.nsqd.getOpts().DataPath,
			ctx.nsqd.getOpts().MaxBytesPerFile,
			int32(minValidMsgLength),
			int32(ctx.nsqd.getOpts().MaxMsgSize)+minValidMsgLength,
			ctx.nsqd.getOpts().SyncEvery,
			ctx.nsqd.getOpts().SyncTimeout,
			dqLogf,
		)
	}
	// register the channel with nsqd (persists metadata)
	c.ctx.nsqd.Notify(c)

	return c
}

initPQ作用

// initPQ (re)initializes the channel's bookkeeping for the two special
// message classes — in-flight (delivered, awaiting acknowledgement) and
// deferred (scheduled for later redelivery) — each backed by an
// ID-indexed map plus a priority queue.
func (c *Channel) initPQ() {
	// size each priority queue at one tenth of the in-memory queue,
	// but never smaller than 1
	capacity := int(math.Max(1, float64(c.ctx.nsqd.getOpts().MemQueueSize)/10))

	c.inFlightMutex.Lock()
	c.inFlightMessages = make(map[MessageID]*Message)
	c.inFlightPQ = newInFlightPqueue(capacity)
	c.inFlightMutex.Unlock()

	c.deferredMutex.Lock()
	c.deferredMessages = make(map[MessageID]*pqueue.Item)
	c.deferredPQ = pqueue.New(capacity)
	c.deferredMutex.Unlock()
}

initPQ函数创建了两个字典inFlightMessages、deferredMessages和两个队列inFlightPQ、deferredPQ。在nsq中inFlight指的是正在投递但还没确认投递成功的消息,deferred指的是投递失败、等待重新投递的消息。initPQ创建的字典和队列主要用于索引和存放这两类消息,其中两个字典使用消息ID作索引。

inFlightPQ使用newInFlightPqueue初始化,其实现位于nsqd/in_flight_pqueue.go,这是nsq实现的一个优先级队列,提供了常用的队列操作,值得学习。

deferredPQ使用pqueue.New初始化,pqueue位于nsqd/pqueue.go,也是一个优先级队列。

  • 分析到这块原因已经大致清晰,上次的升级扩容过程中由于多加了一个机房,导致多创建了一个channel,又没有配置消费者,导致队列积压

解决方法如下

  1. 安全起见,先把消费者的配置,修改正确,然后,调接口,暂停多余的channel,最后删除channel
curl -X POST "http://127.0.0.1:4151/channel/pause?topic=name&channel=name"
curl -X POST "http://127.0.0.1:4151/channel/delete?topic=name&channel=name"
  1. 得到的教训:下发配置的时候,一定要仔细。
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值