pci设备驱动加载及workqueue基础知识

一、背景

主机启动后,出现systemd-udevd的hung_task。

而我们刚刚调整过某个virtio网卡从legacy模式到modern模式,所以问题大概率就是出现在udev的virtio网卡设备驱动加载过程。

二、原因

最终分析到的原因是,virtio-modern模式的设备在某些特殊场景下出现了异常,导致驱动的set status 0始终无法回读到正确的0值,所以会一直hung住查询。

而驱动probe并不是直接在udev的执行路径的,而是通过work_on_cpu()调度到相应cpu的工作线程处理。但udev的执行路径会等待probe的工作任务完成,从系统的call_trace调用栈可以看到,是hung在pci_call_probe->work_on_cpu->....的路径。

可惜当时的 call_trace 调用栈没有保存下来。

三、work_on_cpu基础知识

分析一下pci_call_probe接口,udev路径、手动insmod pci驱动都会执行到。

在这里会找到pcie设备默认的numa node,然后找到一个可用的cpu。

/*
 * pci_call_probe - invoke a PCI driver's ->probe() on a cpu that belongs
 * to the device's NUMA node.  Reached both from the udev path and from a
 * manual insmod of a PCI driver.  work_on_cpu() below waits for the probe
 * work item to finish, so a probe that never returns hangs the caller.
 */
static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
			  const struct pci_device_id *id)
{
	int error, node, cpu;
	int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
	struct drv_dev_and_id ddi = { drv, dev, id };

	/*
	 * Execute driver initialization on node where the device is
	 * attached.  This way the driver likely allocates its local memory
	 * on the right node.
	 */
	node = dev_to_node(&dev->dev);
	dev->is_probed = 1;

	cpu_hotplug_disable();

	/*
	 * Prevent nesting work_on_cpu() for the case where a Virtual Function
	 * device is probed from work_on_cpu() of the Physical device.
	 */
	if (node < 0 || node >= MAX_NUMNODES || !node_online(node) ||
	    pci_physfn_is_probed(dev))
		cpu = nr_cpu_ids;	/* out-of-range cpu forces the synchronous path below */
	else
		cpu = cpumask_any_and(cpumask_of_node(node),
				      housekeeping_cpumask(hk_flags));

	if (cpu < nr_cpu_ids)
		error = work_on_cpu(cpu, local_pci_probe, &ddi);	/* run probe on @cpu's kworker and wait */
	else
		error = local_pci_probe(&ddi);	/* no usable node-local cpu: probe in this context */

	dev->is_probed = 0;
	cpu_hotplug_enable();
	return error;
}

然后 work_on_cpu() 内部会直接调用 work_on_cpu_key() 接口。

/*
 * work_on_cpu_key - run fn(arg) on a kworker of @cpu and wait for the
 * result.  The work item lives on the caller's stack, hence the
 * ONSTACK init / destroy_work_on_stack() pairing.
 */
long work_on_cpu_key(int cpu, long (*fn)(void *),
		     void *arg, struct lock_class_key *key)
{
	struct work_for_cpu wfc = { .fn = fn, .arg = arg };

	INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
	schedule_work_on(cpu, &wfc.work);    /* queue the work on @cpu */
	flush_work(&wfc.work);    /* block until the work finishes -- this is where the hung task sits */
	destroy_work_on_stack(&wfc.work);
	return wfc.ret;
}

所以最终udevd 或insmod驱动时提示hung的位置就是在flush_work里。

四、workqueue_struct结构

系统默认的工作队列记录在全局变量 system_wq 中,每个 cpu 的异步工作队列都由 system_wq 来维护。system_wq 就是一个 workqueue_struct 结构。所以 workqueue_struct 比想象中要复杂,不是一个简单的链表挂上所有的 work_struct 就可以。

/* Queue @work on @cpu's per-cpu pwq of the default system workqueue. */
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
	return queue_work_on(cpu, system_wq, work);    /* system_wq: global default workqueue */
}

workqueue_struct的结构体如下,其实pool_workqueue才是work_struct直接挂载的结构。在创建workqueue_struct时:

1)如果WQ_UNBOUND没有置位,则会为每个cpu创建一个pool_workqueue和工作者线程,记录在cpu_pwqs结构;

2)如果WQ_UNBOUND标识置位,则不会绑定cpu,而是为每个NUMA node创建一个pool_workqueue,记录在numa_pwq_tbl结构。

/*
 * The externally visible workqueue.  It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
	struct list_head	pwqs;		/* WR: all pwqs of this wq */
	struct list_head	list;		/* PR: list of all workqueues */

	struct mutex		mutex;		/* protects this wq */
	int			work_color;	/* WQ: current work color */
	int			flush_color;	/* WQ: current flush color */
	atomic_t		nr_pwqs_to_flush; /* flush in progress */
	struct wq_flusher	*first_flusher;	/* WQ: first flusher */
	struct list_head	flusher_queue;	/* WQ: flush waiters */
	struct list_head	flusher_overflow; /* WQ: flush overflow list */

	struct list_head	maydays;	/* MD: pwqs requesting rescue */
	struct worker		*rescuer;	/* MD: rescue worker */

	int			nr_drainers;	/* WQ: drain in progress */
	int			saved_max_active; /* WQ: saved pwq max_active */

	struct workqueue_attrs	*unbound_attrs;	/* PW: only for unbound wqs */
	struct pool_workqueue	*dfl_pwq;	/* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
	struct wq_device	*wq_dev;	/* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
	char			*lock_name;
	struct lock_class_key	key;
	struct lockdep_map	lockdep_map;
#endif
	char			name[WQ_NAME_LEN]; /* I: workqueue name */

	/*
	 * Destruction of workqueue_struct is RCU protected to allow walking
	 * the workqueues list without grabbing wq_pool_mutex.
	 * This is used to dump all workqueues from sysrq.
	 */
	struct rcu_head		rcu;

	/* hot fields used during command issue, aligned to cacheline */
	unsigned int		flags ____cacheline_aligned; /* WQ: WQ_* flags */
	struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs (used when !WQ_UNBOUND) */
	struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node (WQ_UNBOUND) */
};

如果是schedule_work_on接口,会传入工作在哪个cpu上执行work_struct;

如果是schedule_work接口,会传入WORK_CPU_UNBOUND,此时就会选择调用 schedule_work 时所在的当前 cpu。

/*
 * queue_work - queue @work on @wq without pinning it to a cpu.
 * WORK_CPU_UNBOUND makes __queue_work() pick the current cpu (or, for
 * WQ_UNBOUND queues, an unbound per-node pwq).
 */
static inline bool queue_work(struct workqueue_struct *wq,
			      struct work_struct *work)
{
	return queue_work_on(WORK_CPU_UNBOUND, wq, work);    /* WORK_CPU_UNBOUND: no fixed cpu */
}

选择哪个 pool_workqueue 的代码实现(节选):

/*
 * __queue_work (abridged excerpt) - pwq selection logic only.
 * NOTE(review): req_cpu and pwq are declared earlier in the full kernel
 * function; this quote shows just how the pool_workqueue is chosen.
 */
static void __queue_work(int cpu, struct workqueue_struct *wq,
			 struct work_struct *work)
{

	/* pwq which will be used unless @work is executing elsewhere */
	if (wq->flags & WQ_UNBOUND) {
		if (req_cpu == WORK_CPU_UNBOUND)    /* caller did not pin a cpu */
			cpu = wq_select_unbound_cpu(raw_smp_processor_id());
		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));	/* per-NUMA-node pwq */
	} else {
		if (req_cpu == WORK_CPU_UNBOUND)
			cpu = raw_smp_processor_id();    /* current cpu */
		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);	/* per-cpu pwq */
	}

}

五、workqueue_struct的初始化

申请workqueue的接口有封装好几个,统一的入口就是alloc_workqueue。

可以看到不同的接口对应不同的workqueue类型,其中就有WQ_UNBOUND的参数。

/*
 * Convenience wrappers around alloc_workqueue() selecting different WQ_*
 * flag combinations.  An "ordered" workqueue is unbound and allows only
 * one in-flight work item (max_active == 1).
 */
#define alloc_ordered_workqueue(fmt, flags, args...)			\
	alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED |		\
			__WQ_ORDERED_EXPLICIT | (flags), 1, ##args)

/* Legacy create_*workqueue() interfaces, kept for older callers. */
#define create_workqueue(name)						\
	alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))
#define create_freezable_workqueue(name)				\
	alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND |	\
			WQ_MEM_RECLAIM, 1, (name))
#define create_singlethread_workqueue(name)				\
	alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)

很多地方也会直接调用alloc_workqueue接口,比如system_wq。

	system_wq = alloc_workqueue("events", 0, 0);
	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
	system_long_wq = alloc_workqueue("events_long", 0, 0);
	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
					    WQ_UNBOUND_MAX_ACTIVE);
	system_freezable_wq = alloc_workqueue("events_freezable",
					      WQ_FREEZABLE, 0);
	system_power_efficient_wq = alloc_workqueue("events_power_efficient",
					      WQ_POWER_EFFICIENT, 0);
	system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
					      WQ_FREEZABLE | WQ_POWER_EFFICIENT,

​​​​​​​Linux中workqueues链表及work处理详解 - 知乎

  • 25
    点赞
  • 20
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值