1. Background
After the host booted, systemd-udevd hit a hung_task.
We had just switched a virtio NIC from legacy mode to modern mode, so the problem most likely lay in udev's driver-loading path for that virtio NIC device.
2. Root cause
The root cause we finally identified: in certain corner cases the virtio-modern device misbehaved, so after the driver performed the reset (set status 0) it could never read back the expected 0, and the status query hung forever.
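To make the failure mode concrete, here is a simplified sketch of the modern-device reset loop, modeled on the kernel's vp_reset() in drivers/virtio/virtio_pci_modern.c (exact code varies across kernel versions): the driver writes status 0 and must poll until the device reads back 0, so a broken device that never returns 0 stalls the probe indefinitely.

static void vp_reset_sketch(struct virtio_pci_modern_device *mdev)
{
    /* Per the virtio spec, writing 0 to device_status resets the device. */
    vp_modern_set_status(mdev, 0);

    /*
     * The driver must then wait until a read of device_status returns 0
     * before reinitializing the device. If the device never returns 0,
     * this loop spins forever -- which is exactly where our probe hung.
     */
    while (vp_modern_get_status(mdev))
        msleep(1);
}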
The driver probe does not run directly on udev's execution path; instead it is dispatched via work_on_cpu() to the worker thread of an appropriate CPU. The udev path then waits for that probe work item to complete, and the system's call trace showed the hang on the pci_call_probe->work_on_cpu->... path.
Unfortunately the call trace itself was not saved...
3. work_on_cpu() basics
Let's analyze the pci_call_probe() interface, which is reached both from the udev path and from a manual insmod of a PCI driver.
It looks up the PCIe device's default NUMA node and then picks an available CPU on that node.
static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev,
                          const struct pci_device_id *id)
{
    int error, node, cpu;
    int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
    struct drv_dev_and_id ddi = { drv, dev, id };

    /*
     * Execute driver initialization on node where the device is
     * attached. This way the driver likely allocates its local memory
     * on the right node.
     */
    node = dev_to_node(&dev->dev);
    dev->is_probed = 1;

    cpu_hotplug_disable();

    /*
     * Prevent nesting work_on_cpu() for the case where a Virtual Function
     * device is probed from work_on_cpu() of the Physical device.
     */
    if (node < 0 || node >= MAX_NUMNODES || !node_online(node) ||
        pci_physfn_is_probed(dev))
        cpu = nr_cpu_ids;
    else
        cpu = cpumask_any_and(cpumask_of_node(node),
                              housekeeping_cpumask(hk_flags));

    if (cpu < nr_cpu_ids)
        error = work_on_cpu(cpu, local_pci_probe, &ddi);
    else
        error = local_pci_probe(&ddi);

    dev->is_probed = 0;
    cpu_hotplug_enable();
    return error;
}
The work_on_cpu() call then passes straight through to the work_on_cpu_key() interface.
long work_on_cpu_key(int cpu, long (*fn)(void *),
                     void *arg, struct lock_class_key *key)
{
    struct work_for_cpu wfc = { .fn = fn, .arg = arg };

    INIT_WORK_ONSTACK_KEY(&wfc.work, work_for_cpu_fn, key);
    schedule_work_on(cpu, &wfc.work);   /* queue the work on the target CPU */
    flush_work(&wfc.work);              /* wait for the work to complete */
    destroy_work_on_stack(&wfc.work);
    return wfc.ret;
}
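As a minimal hypothetical illustration of the same pattern (the module and function names are invented for the demo), a caller can run a function on a chosen CPU and block until it returns, just as pci_call_probe() does:

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/smp.h>

/* Runs on the target CPU's worker thread. */
static long hello_on_cpu(void *arg)
{
    pr_info("hello from CPU %d\n", raw_smp_processor_id());
    return 0;
}

static int __init demo_init(void)
{
    /*
     * Blocks in flush_work() until hello_on_cpu() completes on CPU 0.
     * If the function never returns (as in our virtio reset hang), the
     * caller is stuck here and the hung_task watchdog eventually fires.
     */
    return (int)work_on_cpu(0, hello_on_cpu, NULL);
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");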
So the spot where udevd or insmod reports the hang is ultimately inside flush_work().
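Why does flush_work() block forever? In essence it queues a barrier work item behind the target work and sleeps on a completion that only fires once the barrier runs. A simplified sketch of the idea from kernel/workqueue.c (internal helpers like start_flush_work are static there, and details vary by kernel version):

static bool flush_work_sketch(struct work_struct *work)
{
    struct wq_barrier barr;

    if (start_flush_work(work, &barr, false)) {
        /*
         * Sleeps until the barrier work runs, i.e. until the target
         * work has finished. A work item that never finishes keeps
         * the caller asleep here, and hung_task fires.
         */
        wait_for_completion(&barr.done);
        destroy_work_on_stack(&barr.work);
        return true;
    }
    return false;
}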
4. The workqueue_struct structure
The system's default work items live on the global system_wq; that one workqueue backs the per-CPU asynchronous queues. system_wq is a workqueue_struct, and the structure is more complex than you might expect: it is not just a single linked list with every work_struct hanging off it.
static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
    return queue_work_on(cpu, system_wq, work);   /* system_wq is the global default wq */
}
The workqueue_struct definition is shown below; a pool_workqueue is actually what work_structs are attached to directly. When a workqueue_struct is created:
1) if WQ_UNBOUND is not set, a pool_workqueue and worker threads are created for each CPU, recorded in cpu_pwqs;
2) if WQ_UNBOUND is set, the queue is not bound to CPUs; instead a pool_workqueue is created for each NUMA node, recorded in numa_pwq_tbl.
/*
 * The externally visible workqueue. It relays the issued work items to
 * the appropriate worker_pool through its pool_workqueues.
 */
struct workqueue_struct {
    struct list_head pwqs;              /* WR: all pwqs of this wq */
    struct list_head list;              /* PR: list of all workqueues */

    struct mutex mutex;                 /* protects this wq */
    int work_color;                     /* WQ: current work color */
    int flush_color;                    /* WQ: current flush color */
    atomic_t nr_pwqs_to_flush;          /* flush in progress */
    struct wq_flusher *first_flusher;   /* WQ: first flusher */
    struct list_head flusher_queue;     /* WQ: flush waiters */
    struct list_head flusher_overflow;  /* WQ: flush overflow list */

    struct list_head maydays;           /* MD: pwqs requesting rescue */
    struct worker *rescuer;             /* MD: rescue worker */

    int nr_drainers;                    /* WQ: drain in progress */
    int saved_max_active;               /* WQ: saved pwq max_active */

    struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */
    struct pool_workqueue *dfl_pwq;     /* PW: only for unbound wqs */

#ifdef CONFIG_SYSFS
    struct wq_device *wq_dev;           /* I: for sysfs interface */
#endif
#ifdef CONFIG_LOCKDEP
    char *lock_name;
    struct lock_class_key key;
    struct lockdep_map lockdep_map;
#endif
    char name[WQ_NAME_LEN];             /* I: workqueue name */

    /*
     * Destruction of workqueue_struct is RCU protected to allow walking
     * the workqueues list without grabbing wq_pool_mutex.
     * This is used to dump all workqueues from sysrq.
     */
    struct rcu_head rcu;

    /* hot fields used during command issue, aligned to cacheline */
    unsigned int flags ____cacheline_aligned;    /* WQ: WQ_* flags */
    struct pool_workqueue __percpu *cpu_pwqs;    /* I: per-cpu pwqs */
    struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
};
With the schedule_work_on() interface, the caller passes in the CPU on which the work_struct should execute;
with schedule_work(), WORK_CPU_UNBOUND is passed, and in that case the CPU currently running the schedule_work() call path is chosen.
static inline bool queue_work(struct workqueue_struct *wq,
                              struct work_struct *work)
{
    return queue_work_on(WORK_CPU_UNBOUND, wq, work);   /* note WORK_CPU_UNBOUND */
}
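A small hypothetical sketch of the two queueing styles (my_work_fn and my_work are invented names for the demo; note that a work item still pending is not queued a second time):

#include <linux/workqueue.h>
#include <linux/smp.h>

static void my_work_fn(struct work_struct *work)
{
    pr_info("running on CPU %d\n", raw_smp_processor_id());
}
static DECLARE_WORK(my_work, my_work_fn);

static void demo_queue(void)
{
    schedule_work_on(1, &my_work);  /* pinned: CPU 1's per-cpu pwq of system_wq */
    schedule_work(&my_work);        /* WORK_CPU_UNBOUND: runs on the current CPU */
}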
The code that selects which pool_workqueue to use:
static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
{
    /* ...excerpted; req_cpu holds the cpu argument as originally passed in... */

    /* pwq which will be used unless @work is executing elsewhere */
    if (wq->flags & WQ_UNBOUND) {
        if (req_cpu == WORK_CPU_UNBOUND)    /* use the current CPU */
            cpu = wq_select_unbound_cpu(raw_smp_processor_id());
        pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
    } else {
        if (req_cpu == WORK_CPU_UNBOUND)    /* use the current CPU */
            cpu = raw_smp_processor_id();
        pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
    }

    /* ... */
}
5. Initialization of workqueue_struct
There are several wrapper interfaces for allocating a workqueue; they all converge on alloc_workqueue().
Note how the different wrappers map to different workqueue types, including ones that set the WQ_UNBOUND flag.
#define alloc_ordered_workqueue(fmt, flags, args...)              \
    alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED |              \
                    __WQ_ORDERED_EXPLICIT | (flags), 1, ##args)

#define create_workqueue(name)                                    \
    alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name))

#define create_freezable_workqueue(name)                          \
    alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \
                    WQ_MEM_RECLAIM, 1, (name))

#define create_singlethread_workqueue(name)                       \
    alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name)
Many call sites also invoke alloc_workqueue() directly; system_wq itself is one example.
system_wq = alloc_workqueue("events", 0, 0);
system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
system_long_wq = alloc_workqueue("events_long", 0, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                    WQ_UNBOUND_MAX_ACTIVE);
system_freezable_wq = alloc_workqueue("events_freezable",
                                      WQ_FREEZABLE, 0);
system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                                            WQ_POWER_EFFICIENT, 0);
system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
                                                      WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                                                      0);
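To close the loop, here is a hypothetical driver-style sketch (all names invented) of allocating a private workqueue and queueing work on it, using the same machinery described above:

#include <linux/workqueue.h>

static struct workqueue_struct *my_wq;  /* hypothetical private wq */

static void my_work_fn(struct work_struct *work)
{
    /* deferred processing goes here */
}
static DECLARE_WORK(my_work, my_work_fn);

static int my_driver_setup(void)
{
    /*
     * With flags == 0 this creates a bound wq: one pool_workqueue per
     * CPU (cpu_pwqs). Pass WQ_UNBOUND to get per-NUMA-node
     * pool_workqueues (numa_pwq_tbl) instead.
     */
    my_wq = alloc_workqueue("my_wq", 0, 0);
    if (!my_wq)
        return -ENOMEM;

    queue_work(my_wq, &my_work);    /* current CPU's pwq, like schedule_work() */
    return 0;
}

static void my_driver_teardown(void)
{
    destroy_workqueue(my_wq);   /* drains pending work, then frees the wq */
}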