1. Background
After the host booted, systemd-udevd hit a hung_task.
We had just switched a virtio NIC from legacy mode to modern mode, so the problem most likely lay in udev's driver-loading path for that virtio NIC device.
2. Root cause
The root cause we finally identified: in certain corner cases the virtio-modern device misbehaved, so after the driver performed the reset (set status 0) it could never read back the expected 0, and the status query hung forever.
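To make the failure mode concrete, here is a simplified sketch of the modern-device reset loop, modeled on the kernel's vp_reset() in drivers/virtio/virtio_pci_modern.c (exact code varies across kernel versions): the driver writes status 0 and must poll until the device reads back 0, so a broken device that never returns 0 stalls the probe indefinitely.

static void vp_reset_sketch(struct virtio_pci_modern_device *mdev)
{
    /* Per the virtio spec, writing 0 to device_status resets the device. */
    vp_modern_set_status(mdev, 0);

    /*
     * The driver must then wait until a read of device_status returns 0
     * before reinitializing the device. If the device never returns 0,
     * this loop spins forever -- which is exactly where our probe hung.
     */
    while (vp_modern_get_status(mdev))
        msleep(1);
}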
The driver probe does not run directly on udev's execution path; instead it is dispatched via work_on_cpu() to the worker thread of an appropriate CPU. The udev path then waits for that probe work item to complete, and the system's call trace showed the hang on the pci_call_probe->work_on_cpu->... path.
Unfortunately the call trace itself was not saved...
3. work_on_cpu() basics
Let's analyze the pci_call_probe() interface, which is reached both from the udev path and from a manual insmod of a PCI driver.
It looks up the PCIe device's default NUMA node and then picks an available CPU on that node.
static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
                          const struct pci_device_id *id)
{
    int error, node, cpu;
    int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
    struct drv_dev_and_id ddi = { drv, dev, id };

    /*
     * Execute driver initialization on node where the device is
     * attached. This way the driver likely allocates its local memory
     * on the right node.
     */
    node = dev_to_node(&dev->dev);
    dev->is_probed = 1;

    cpu_hotplug_disable();

    /*
     * Prevent nesting work_on_cpu() for the case where a Virtual Function
     * device is probed from work_on_cpu() of the Physical device.
     */
    if (node < 0 || node >= MAX_NUMNODES || !node_online(node) ||
        pci_physfn_is_probed(dev))
        cpu = nr_cpu_ids;
    else
        cpu = cpumask_any_and(cpumask_of_node(node),
                              housekeeping_cpumask(hk_flags));

    if (cpu < nr_cpu_ids)
        error = work_on_cpu(cpu, local_pci_probe, &ddi);
    else
        error = local_pci_probe(&ddi);

    dev->is_probed = 0;
    cpu_hotplug_enable();
    return error;
}
The work_on_cpu() call then passes straight through to the work_on_cpu_key() interface.
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key)
{
    struct work_for_cpu wfc = { .fn = fn, .arg = arg };

    INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
    schedule_work_on(cpu, &wfc.work);   /* queue the work on the target CPU */
    flush_work(&wfc.work);              /* wait for the work to complete */
    destroy_work_on_stack(&wfc.work);
    return wfc.ret;
}
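As a minimal hypothetical illustration of the same pattern (the module and function names are invented for the demo), a caller can run a function on a chosen CPU and block until it returns, just as pci_call_probe() does:

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/smp.h>

/* Runs on the target CPU's worker thread. */
static long hello_on_cpu(void *arg)
{
    pr_info("hello from CPU %d\n", raw_smp_processor_id());
    return 0;
}

static int __init demo_init(void)
{
    /*
     * Blocks in flush_work() until hello_on_cpu() completes on CPU 0.
     * If the function never returns (as in our virtio reset hang), the
     * caller is stuck here and the hung_task watchdog eventually fires.
     */
    return (int)work_on_cpu(0, hello_on_cpu, NULL);
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");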
So the spot where udevd or insmod reports the hang is ultimately inside flush_work().
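Why does flush_work() block forever? In essence it queues a barrier work item behind the target work and sleeps on a completion that only fires once the barrier runs. A simplified sketch of the idea from kernel/workqueue.c (internal helpers like start_flush_work are static there, and details vary by kernel version):

static bool flush_work_sketch(struct work_struct *work)
{
    struct wq_barrier barr;

    if (start_flush_work(work, &barr, false)) {
        /*
         * Sleeps until the barrier work runs, i.e. until the target
         * work has finished. A work item that never finishes keeps
         * the caller asleep here, and hung_task fires.
         */
        wait_for_completion(&barr.done);
        destroy_work_on_stack(&barr.work);
        return true;
    }
    return false;
}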
4. The workqueue_struct structure
The system's default work items live on the global system_wq; that one workqueue backs the per-CPU asynchronous queues. system_wq is a workqueue_struct, and the structure is more complex than you might expect: it is not just a single linked list with every work_struct hanging off it.
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
    return queue_work_on(cpu, system_wq, work);   /* system_wq is the global default wq */
}
The workqueue_struct definition is shown below; a pool_workqueue is actually what work_structs are attached to directly. When a workqueue_struct is created:
1) if WQ_UNBOUND is not set, a pool_workqueue and worker threads are created for each CPU, recorded in cpu_pwqs;
2) if WQ_UNBOUND is set, the queue is not bound to CPUs; instead a pool_workqueue is created for each NUMA node, recorded in numa_pwq_tbl.
/*
 * The externally visible workqueue. It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
    struct list_head pwqs;              /* WR: all pwqs of this wq */
    struct list_head list;              /* PR: list of all workqueues */

    struct mutex mutex;                 /* protects this wq */
    int work_color;                     /* WQ: current work color */
    int flush_color;                    /* WQ: current flush color */
    atomic_t nr_pwqs_to_flush;          /* flush in progress */
    struct wq_flusher *first_flusher;   /* WQ: first flusher */
    struct list_head flusher_queue;     /* WQ: flush waiters */
    struct list_head flusher_overflow;  /* WQ: flush overflow list */

    struct list_head maydays;           /* MD: pwqs requesting rescue */
    struct worker *rescuer;             /* MD: rescue worker */

    int nr_drainers;                    /* WQ: drain in progress */
    int saved_max_active;               /* WQ: saved pwq max_active */

    struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
    struct pool_workqueue *dfl_pwq;     /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
    struct wq_device *wq_dev;           /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
    char *lock_name;
    struct lock_class_key key;
    struct lockdep_map lockdep_map;
#endif
    char name[WQ_NAME_LEN];             /* I: workqueue name */

    /*
     * Destruction of workqueue_struct is RCU protected to allow walking
     * the workqueues list without grabbing wq_pool_mutex.
     * This is used to dump all workqueues from sysrq.
     */
    struct rcu_head rcu;

    /* hot fields used during command issue, aligned to cacheline */
    unsigned int flags ____cacheline_aligned;    /* WQ: WQ_* flags */
    struct pool_workqueue __percpu *cpu_pwqs;    /* I: per-cpu pwqs */
    struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};
With the schedule_work_on() interface, the caller passes in the CPU on which the work_struct should execute;
with schedule_work(), WORK_CPU_UNBOUND is passed, and in that case the CPU currently running the schedule_work() call path is chosen.
static inline bool queue_work(struct workqueue_struct *wq,
                              struct work_struct *work)
{
    return queue_work_on(WORK_CPU_UNBOUND, wq, work);   /* note WORK_CPU_UNBOUND */
}
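A small hypothetical sketch of the two queueing styles (my_work_fn and my_work are invented names for the demo; note that a work item still pending is not queued a second time):

#include <linux/workqueue.h>
#include <linux/smp.h>

static void my_work_fn(struct work_struct *work)
{
    pr_info("running on CPU %d\n", raw_smp_processor_id());
}
static DECLARE_WORK(my_work, my_work_fn);

static void demo_queue(void)
{
    schedule_work_on(1, &my_work);  /* pinned: CPU 1's per-cpu pwq of system_wq */
    schedule_work(&my_work);        /* WORK_CPU_UNBOUND: runs on the current CPU */
}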
The code that selects which pool_workqueue to use:
static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
{
    /* ...excerpted; req_cpu holds the cpu argument as originally passed in... */

    /* pwq which will be used unless @work is executing elsewhere */
    if (wq->flags & WQ_UNBOUND) {
        if (req_cpu == WORK_CPU_UNBOUND)    /* use the current CPU */
            cpu = wq_select_unbound_cpu(raw_smp_processor_id());
        pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
    } else {
        if (req_cpu == WORK_CPU_UNBOUND)    /* use the current CPU */
            cpu = raw_smp_processor_id();
        pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
    }

    /* ... */
}
5. Initialization of workqueue_struct
There are several wrapper interfaces for allocating a workqueue; they all converge on alloc_workqueue().
Note how the different wrappers map to different workqueue types, including ones that set the WQ_UNBOUND flag.
#define alloc_ordered_workqueue(fmt, flags, args...)              \
    alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED |              \
                    __WQ_ORDERED_EXPLICIT | (flags), 1, ##args)

#define create_workqueue(name)                                    \
    alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))

#define create_freezable_workqueue(name)                          \
    alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \
                    WQ_MEM_RECLAIM, 1, (name))

#define create_singlethread_workqueue(name)                       \
    alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
Many call sites also invoke alloc_workqueue() directly; system_wq itself is one example.
system_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                    WQ_UNBOUND_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
                                      WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                                            WQ_POWER_EFFICIENT, 0);
system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
                                                      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                                                      0);
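To close the loop, here is a hypothetical driver-style sketch (all names invented) of allocating a private workqueue and queueing work on it, using the same machinery described above:

#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;  /* hypothetical private wq */

static void my_work_fn(struct work_struct *work)
{
    /* deferred processing goes here */
}
static DECLARE_WORK(my_work, my_work_fn);

static int my_driver_setup(void)
{
    /*
     * With flags == 0 this creates a bound wq: one pool_workqueue per
     * CPU (cpu_pwqs). Pass WQ_UNBOUND to get per-NUMA-node
     * pool_workqueues (numa_pwq_tbl) instead.
     */
    my_wq = alloc_workqueue("my_wq", 0, 0);
    if (!my_wq)
        return -ENOMEM;

    queue_work(my_wq, &my_work);    /* current CPU's pwq, like schedule_work() */
    return 0;
}

static void my_driver_teardown(void)
{
    destroy_workqueue(my_wq);   /* drains pending work, then frees the wq */
}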