rcu nocb特性浅析

博客http://t.csdnimg.cn/EKdgI介绍了rcu的基本原理,但是在实际应用中,rcu的nocb特性也经常被使用,那nocb是个什么东西呢?
本文代码基于linux内核4.19.195

nocb的背景

一开始是没有nocb特性的,原先的rcu实现也能够在实际业务场景中正常使用。不过细心的开发者总是不满足于当前的现状,他们发现,如果能把处理rcu回调的资源占用,从nohz_full的核转移出去的话,nohz_full的核就能获得一个更加纯粹的执行环境,从而,rcu nocb诞生了,这里我理解nocb指的是对应被配置的cpu上,没有call back的损耗(简写为nocb)。

nocb的实现

既然都没有call back了,那call_rcu怎么处理呢?

/*
 * Queue an RCU callback for invocation after a grace period.
 * (Quoted excerpt from Linux 4.19; the "******" lines mark code
 * elided by the article's author.)
 *
 * For a no-CBs CPU the callback is not queued on the per-CPU
 * segmented callback list; instead it is handed to the nocb path
 * via __call_rcu_nocb() below.
 */
static void
__call_rcu(struct rcu_head *head, rcu_callback_t func,
	   struct rcu_state *rsp, int cpu, bool lazy)
{
	******
	head->func = func;
	head->next = NULL;
	local_irq_save(flags);
	rdp = this_cpu_ptr(rsp->rda);

	/* Add the callback to our list. */
	if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
		int offline;

		if (cpu != -1)
			rdp = per_cpu_ptr(rsp->rda, cpu);
		if (likely(rdp->mynode)) {
			/* Post-boot, so this should be for a no-CBs CPU. */
			offline = !__call_rcu_nocb(rdp, head, lazy, flags); //main handling function of the nocb path
			WARN_ON_ONCE(offline);
			/* Offline CPU, _call_rcu() illegal, leak callback.  */
			local_irq_restore(flags);
			return;
		}
		/*
		 * Very early boot, before rcu_init().  Initialize if needed
		 * and then drop through to queue the callback.
		 */
		BUG_ON(cpu != -1);
		WARN_ON_ONCE(!rcu_is_watching());
		if (rcu_segcblist_empty(&rdp->cblist))
			rcu_segcblist_init(&rdp->cblist);
	}
	*******
}

与普通核的call_rcu执行流程不一样,nocb核调用call_rcu会走进__call_rcu_nocb,__call_rcu_nocb又会调用__call_rcu_nocb_enqueue

/*
 * Enqueue the specified string of rcu_head structures onto the specified
 * CPU's no-CBs lists.  The CPU is specified by rdp, the head of the
 * string by rhp, and the tail of the string by rhtp.  The non-lazy/lazy
 * counts are supplied by rhcount and rhcount_lazy.
 *
 * If warranted, also wake up the kthread servicing this CPUs queues.
 */
static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
				    struct rcu_head *rhp,
				    struct rcu_head **rhtp,
				    int rhcount, int rhcount_lazy,
				    unsigned long flags)
{
	int len;
	struct rcu_head **old_rhpp;
	struct task_struct *t;

	/* Enqueue the callback on the nocb list and update counts. */
	atomic_long_add(rhcount, &rdp->nocb_q_count); //update the queued-callback count
	/* rcu_barrier() relies on ->nocb_q_count add before xchg. */
	old_rhpp = xchg(&rdp->nocb_tail, rhtp); //atomically claim the tail slot
	WRITE_ONCE(*old_rhpp, rhp); //link the new string at the tail of the list
	atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
	smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */

	/* If we are not being polled and there is a kthread, awaken it ... */
	t = READ_ONCE(rdp->nocb_kthread);
	if (rcu_nocb_poll || !t) { //polling mode (or no kthread yet): no wakeup needed
		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
				    TPS("WakeNotPoll"));
		return;
	}
	len = atomic_long_read(&rdp->nocb_q_count);
	if (old_rhpp == &rdp->nocb_head) { //old tail pointed at the head: queue was empty
		if (!irqs_disabled_flags(flags)) {
			/* ... if queue was empty ... */
			wake_nocb_leader(rdp, false);
			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
					    TPS("WakeEmpty"));
		} else {
			/* irqs off: defer the wakeup to a safe context */
			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
					       TPS("WakeEmptyIsDeferred"));
		}
		rdp->qlen_last_fqs_check = 0;
	} else if (len > rdp->qlen_last_fqs_check + qhimark) { //callback list has grown too long
		/* ... or if many callbacks queued. */
		if (!irqs_disabled_flags(flags)) {
			wake_nocb_leader(rdp, true); //force an immediate wakeup
			trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
					    TPS("WakeOvf"));
		} else {
			wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE_FORCE,
					       TPS("WakeOvfIsDeferred"));
		}
		rdp->qlen_last_fqs_check = LONG_MAX / 2;
	} else {
		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
	}
	return;
}

__call_rcu_nocb_enqueue函数使用了典型的单链表尾插的代码编写手法。从代码中我们可以看到,nocb核的rcu_head被插入到了以rdp->nocb_head为头的链表尾部(rdp->nocb_tail是指向链表尾部指针的二级指针,通过xchg原子地占据尾部位置);这里不考虑rcu_nocb_poll的情况,在完成链表插入动作后,接下来会唤醒rcu nocb的后台线程完成call back的处理。需要注意的是,这里的唤醒动作只是通知后台线程开始等待一个宽限期,此时宽限期尚未结束,必然还无法去调用call back。
rcu nocb的后台线程由rcu_nocb_kthread实现。此外对于一个配置了nohz_full及isolcpus的cmdline的系统来说(通常来说一个系统的nohz_full、isolcpus、nocb配置的参数是一样的),rcu nocb的后台线程的亲和性同2号线程(kthreadd)一样,只允许运行在非nohz_full的核,这也就解释了rcu nocb是如何将nocb cpu的rcu call back的消耗转移到其他cpu的原理。

/*
 * Per-CPU kthread that invokes RCU callbacks on behalf of a no-CBs CPU.
 * Each pass through the outer loop waits for a grace period (as leader
 * or follower), detaches this CPU's ready-to-invoke callback list, and
 * invokes the callbacks one by one.
 */
static int rcu_nocb_kthread(void *arg)
{
	int c, cl;
	unsigned long flags;
	struct rcu_head *list;
	struct rcu_head *next;
	struct rcu_head **tail;
	struct rcu_data *rdp = arg;

	/* Each pass through this loop invokes one batch of callbacks */
	for (;;) {
		/* Wait for callbacks. */
		if (rdp->nocb_leader == rdp)
			nocb_leader_wait(rdp); //leader: also checks its followers' queues
		else
			nocb_follower_wait(rdp);

		/* Pull the ready-to-invoke callbacks onto local list. */ 
		raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
		list = rdp->nocb_follower_head;
		rdp->nocb_follower_head = NULL; //detach the list: reset head to empty
		tail = rdp->nocb_follower_tail;
		rdp->nocb_follower_tail = &rdp->nocb_follower_head;
		raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
		BUG_ON(!list);
		trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));

		/* Each pass through the following loop invokes a callback. */
		trace_rcu_batch_start(rdp->rsp->name,
				      atomic_long_read(&rdp->nocb_q_count_lazy),
				      atomic_long_read(&rdp->nocb_q_count), -1);
		c = cl = 0;
		while (list) {
			next = list->next;
			/* Wait for enqueuing to complete, if needed. */ //an enqueue may still be mid-flight (see the preceding comment)
			while (next == NULL && &list->next != tail) {
				trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
						    TPS("WaitQueue"));
				schedule_timeout_interruptible(1);
				trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
						    TPS("WokeQueue"));
				next = list->next;
			}
			debug_rcu_head_unqueue(list);
			local_bh_disable();
			if (__rcu_reclaim(rdp->rsp->name, list)) //invoke the callback (or free via kfree_rcu)
				cl++;
			c++;
			local_bh_enable();
			cond_resched_tasks_rcu_qs();
			list = next;
		}
		trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
		smp_mb__before_atomic();  /* _add after CB invocation. */
		atomic_long_add(-c, &rdp->nocb_q_count); //subtract the invoked callbacks from the counts
		atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
	}
	return 0;
}

代码中的while循环,会按序遍历nocb核的rcu_head的链表,逐个调用call back函数。
而代码中for循环最前面的wait函数,则是在等待宽限期的完成。内核初始化的时候,会把一部分nocb核配置为follower,一部分nocb核配置为leader(具体谁是leader谁是follower本文没有仔细研究),这么做的目的主要还是为了降低开销:只有leader核的nocb线程在每次宽限期过去后需要被唤醒,由它来检查包括它自身以及它的follower核上是否有nocb的回调需要被调用,有的话再唤醒相应的follower核。在函数nocb_leader_wait中,会通过相关判断,最后把可以被调用的rcu_head挪到rdp->nocb_follower_head上。
那么,在宽限期完成后,是谁去唤醒的nocb后台线程呢?
我们知道,在一个宽限期结束后,rcu后台线程(注意与nocb后台线程区别),会调用rcu_gp_cleanup函数去完成本宽限期的清理工作,rcu_gp_cleanup中会调用如下代码:

rcu_gp_cleanup()
    -> rcu_for_each_node_breadth_first(rsp, rnp) {
           ...
           rcu_nocb_gp_cleanup(sq);
           ...
       }

rcu_nocb_gp_cleanup会唤醒nocb后台线程去完成call back函数的调用。奇怪的是为啥这个唤醒是放在rcu_for_each_node_breadth_first这个for循环里的?记个TODO后续继续研究吧。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值