Linux Softirqs: An In-Depth Analysis

1. Why the softirq mechanism exists

      To keep interrupt handling both fast and able to do a lot of work, the interrupt "sect" trained two experts: the top half and the bottom half.

      The top half is what the kernel today calls the hard-IRQ handler; see my other article for that part.

      The bottom half is softirq, tasklet and workqueue. Tasklets and workqueues are covered later in this article.

2. Comparison with hard interrupts

         Among the bottom-half mechanisms, softirqs are the most efficient, and at the same time the most troublesome for developers. The trouble comes from two factors:

        - The same softirq can run concurrently on different CPUs, so softirq code has to be written with SMP concurrency in mind, which makes the implementation more complex.
        - Softirqs cannot be defined dynamically; they must be compiled statically into the kernel image and cannot be added from a loadable module.

Their efficiency also shows in two points:

        - Because the same softirq can run concurrently on multiple CPUs, it normally does not have to wait (a given tasklet cannot run concurrently and has other restrictions), although a hard interrupt can still preempt it.
        - Softirqs normally run in interrupt context: as soon as the hard-IRQ handler finishes, pending softirqs are executed. Why "normally" rather than "always"? Because in special cases softirqs are executed by a kernel thread, ksoftirqd.

3. Softirq source code analysis

Source file: kernel/softirq.c

Registering a softirq: open_softirq

void open_softirq(int nr, void (*action)(struct softirq_action *))
{
        softirq_vec[nr].action = action;
}

 softirq_vec[] is a static array declared in the kernel:

static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;

 NR_SOFTIRQS is the number of softirq types; the enum lives in include/linux/interrupt.h.

The smaller the value, the higher the priority:

enum
{
        HI_SOFTIRQ=0,
        TIMER_SOFTIRQ,
        NET_TX_SOFTIRQ,
        NET_RX_SOFTIRQ,
        BLOCK_SOFTIRQ,
        IRQ_POLL_SOFTIRQ,
        TASKLET_SOFTIRQ,
        SCHED_SOFTIRQ,
        HRTIMER_SOFTIRQ,
        RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */

        NR_SOFTIRQS
};
 

 Examples of softirq registration in the kernel, all reached from start_kernel():

start_kernel->

                      init_timers();
                      hrtimers_init();
                      softirq_init();

Each of these functions registers softirqs; let's take softirq_init() as the concrete example:

void __init softirq_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                per_cpu(tasklet_vec, cpu).tail =
                        &per_cpu(tasklet_vec, cpu).head;
                per_cpu(tasklet_hi_vec, cpu).tail =
                        &per_cpu(tasklet_hi_vec, cpu).head;
        }

        open_softirq(TASKLET_SOFTIRQ, tasklet_action);
        open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
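
Other subsystems register their softirqs the same way. As a further illustration, the networking core wires up its two vectors roughly as follows in net/core/dev.c (net_dev_init()); treat this as a simplified sketch rather than a verbatim quote, since the surrounding code differs between kernel versions:

/* Sketch: how the networking core registers its softirqs
 * (net/core/dev.c, net_dev_init()); surrounding code elided.
 */
static int __init net_dev_init(void)
{
        /* ... per-cpu softnet queue setup elided ... */

        open_softirq(NET_TX_SOFTIRQ, net_tx_action);   /* transmit completion path */
        open_softirq(NET_RX_SOFTIRQ, net_rx_action);   /* receive path (NAPI polling) */

        return 0;
}
subsys_initcall(net_dev_init);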
 

 Once softirqs are registered, the next question is: when are they triggered?

Softirqs are checked and run in the following situations (see the excerpt after this list for the most common path):

  1. When do_IRQ() finishes an I/O interrupt and calls irq_exit().
  2. When the system uses the I/O APIC and has just finished handling the local timer interrupt.
  3. When local_bh_enable() re-enables local bottom halves.
  4. On SMP systems, when a CPU finishes the function triggered by a CALL_FUNCTION_VECTOR inter-processor interrupt.
  5. When a ksoftirqd/n thread is woken up.
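
Case 1 is by far the most common path. The check lives in irq_exit(); stripped of version-specific details, its core has looked roughly like this for a long time (simplified excerpt, not a verbatim quote):

        /* tail of irq_exit() / __irq_exit_rcu(), kernel/softirq.c (simplified) */
        preempt_count_sub(HARDIRQ_OFFSET);
        if (!in_interrupt() && local_softirq_pending())
                invoke_softirq();       /* run softirqs right here, on hard-IRQ exit */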

How a softirq gets raised and executed:

Method 1: raise_softirq()

void raise_softirq(unsigned int nr)
{
        unsigned long flags;

        local_irq_save(flags);
        raise_softirq_irqoff(nr);
        local_irq_restore(flags);
}

inline void raise_softirq_irqoff(unsigned int nr)
{
        __raise_softirq_irqoff(nr);

        /*
         * If we're in an interrupt or softirq, we're done
         * (this also catches softirq-disabled code). We will
         * actually run the softirq once we return from
         * the irq or softirq.
         *
         * Otherwise we wake up ksoftirqd to make sure we
         * schedule the softirq soon.
         */
        if (!in_interrupt() && should_wake_ksoftirqd())
                wakeup_softirqd();      /* not in hard/soft IRQ context: wake up ksoftirqd/n */
}
 

void __raise_softirq_irqoff(unsigned int nr)
{
        lockdep_assert_irqs_disabled();
        trace_softirq_raise(nr);
        or_softirq_pending(1UL << nr);
}

In effect this sets bit nr (the softirq type) in the per-cpu variable irq_stat[cpu_id].__softirq_pending. Because every CPU has its own pending mask, the same softirq type can run on several CPUs in parallel; this is the fundamental reason for that property.

So irq_stat records pending softirqs as a bitmap.

__softirq_pending inside the per-cpu irq_cpustat_t is the bitmap of softirqs waiting to be handled; setting bits in it is how the kernel is told which softirqs to run.

#ifndef __ARCH_IRQ_STAT
DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat);
EXPORT_PER_CPU_SYMBOL(irq_stat);
#endif
Because irq_stat is a per-cpu variable, every CPU can run softirqs concurrently.

 typedef struct {
        unsigned int __softirq_pending;   /* 32-bit pending mask, one bit per softirq */
#ifdef ARCH_WANTS_NMI_IRQSTAT
        unsigned int __nmi_count;   
#endif
} ____cacheline_aligned irq_cpustat_t;
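
To make the bitmap mechanics concrete, here is a tiny stand-alone user-space sketch (my own illustration, not kernel code) of what raising and scanning the pending mask amounts to: raise_softirq() sets one bit, and __do_softirq() scans the mask with ffs() from the lowest bit (highest priority) upward. The real kernel clears the whole mask up front and shifts it while scanning, but the effect is the same:

/* Stand-alone illustration of the per-cpu pending bitmap idea (not kernel code). */
#include <stdio.h>
#include <strings.h>                    /* ffs() */

#define TIMER_SOFTIRQ  1
#define NET_RX_SOFTIRQ 3

static unsigned int pending;            /* plays the role of __softirq_pending */

static void raise(int nr)
{
        pending |= 1U << nr;            /* what or_softirq_pending(1UL << nr) does */
}

int main(void)
{
        raise(NET_RX_SOFTIRQ);
        raise(TIMER_SOFTIRQ);

        int bit;
        while ((bit = ffs(pending))) {  /* same scan order as __do_softirq() */
                int nr = bit - 1;
                printf("handle softirq %d\n", nr);   /* kernel: softirq_vec[nr].action() */
                pending &= ~(1U << nr);
        }
        return 0;
}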

 The bitmap tells us which softirqs need handling. Next, look at wakeup_softirqd():

static void wakeup_softirqd(void)
{
        /* Interrupts are disabled: no need to stop preemption */
        struct task_struct *tsk = __this_cpu_read(ksoftirqd);

        if (tsk && tsk->state != TASK_RUNNING)
                wake_up_process(tsk);   /* ksoftirqd is just a kernel thread and is scheduled like any other */
}

DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
ksoftirqd is likewise a per-cpu variable: each CPU gets its own ksoftirqd/n thread.

1. How is the ksoftirqd thread created and registered?

static struct smp_hotplug_thread softirq_threads = {
        .store                  = &ksoftirqd,
        .thread_should_run      = ksoftirqd_should_run,
        .thread_fn              = run_ksoftirqd,
        .thread_comm            = "ksoftirqd/%u",
};
 

static __init int spawn_ksoftirqd(void)
{
        cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
                                  takeover_tasklets);
        BUG_ON(smpboot_register_percpu_thread(&softirq_threads));

        return 0;
}
early_initcall(spawn_ksoftirqd);
 

 smpboot_register_percpu_thread() lives in kernel/smpboot.c:

int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
        unsigned int cpu;
        int ret = 0;

        get_online_cpus();
        mutex_lock(&smpboot_threads_lock);
        for_each_online_cpu(cpu) {
                ret = __smpboot_create_thread(plug_thread, cpu);   /* creates the per-cpu thread, see below */
                if (ret) {
                        smpboot_destroy_threads(plug_thread);
                        goto out;
                }
                smpboot_unpark_thread(plug_thread, cpu);
        }
        list_add(&plug_thread->list, &hotplug_threads);
out:
        mutex_unlock(&smpboot_threads_lock);
        put_online_cpus();
        return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
 

static int
__smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
{
        struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);  /* ht->store is &ksoftirqd */
        struct smpboot_thread_data *td;

        if (tsk)
                return 0;

        td = kzalloc_node(sizeof(*td), GFP_KERNEL, cpu_to_node(cpu));
        if (!td)
                return -ENOMEM;
        td->cpu = cpu;
        td->ht = ht;                       /* td now carries the softirq_threads descriptor */

        tsk = kthread_create_on_cpu(smpboot_thread_fn, td, cpu,
                                    ht->thread_comm); 

        if (IS_ERR(tsk)) {
                kfree(td);
                return PTR_ERR(tsk);
        }
        kthread_set_per_cpu(tsk, cpu);
        /*
         * Park the thread so that it could start right on the CPU
         * when it is available.
         */
        kthread_park(tsk);
        get_task_struct(tsk);
        *per_cpu_ptr(ht->store, cpu) = tsk;
        if (ht->create) {
                /*
                 * Make sure that the task has actually scheduled out
                 * into park position, before calling the create
                 * callback. At least the migration thread callback
                 * requires that the task is off the runqueue.
                 */
                if (!wait_task_inactive(tsk, TASK_PARKED))
                        WARN_ON(1);
                else
                        ht->create(cpu);
        }
        return 0;
}
 

static int smpboot_thread_fn(void *data)
{
        struct smpboot_thread_data *td = data;
        struct smp_hotplug_thread *ht = td->ht;

        while (1) {
                set_current_state(TASK_INTERRUPTIBLE);
                preempt_disable();
                if (kthread_should_stop()) {
                        __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        /* cleanup must mirror setup */
                        if (ht->cleanup && td->status != HP_THREAD_NONE)
                                ht->cleanup(td->cpu, cpu_online(td->cpu));
                        kfree(td);
                        return 0;
                }

                if (kthread_should_park()) {
                        __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        if (ht->park && td->status == HP_THREAD_ACTIVE) {
                                BUG_ON(td->cpu != smp_processor_id());
                                ht->park(td->cpu);
                                td->status = HP_THREAD_PARKED;
                        }
                        kthread_parkme();
                        /* We might have been woken for stop */
                        continue;
                }

                BUG_ON(td->cpu != smp_processor_id());

                /* Check for state change setup */
                switch (td->status) {
                case HP_THREAD_NONE:
                        __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        if (ht->setup)
                                ht->setup(td->cpu);
                        td->status = HP_THREAD_ACTIVE;
                        continue;

                case HP_THREAD_PARKED:
                        __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        if (ht->unpark)
                                ht->unpark(td->cpu);
                        td->status = HP_THREAD_ACTIVE;
                        continue;
                }

                if (!ht->thread_should_run(td->cpu)) {   
                        preempt_enable_no_resched();
                        schedule();
                } else {
                        __set_current_state(TASK_RUNNING);
                        preempt_enable();
                        ht->thread_fn(td->cpu);
                }
        }
}
 

 Analysis: smpboot_thread_fn() is the core loop. The ht->thread_should_run() check near the bottom (ksoftirqd_should_run() in our case) decides whether there is pending softirq work; if there is, ht->thread_fn(td->cpu) is called. Looking back at the softirq_threads descriptor, that means run_ksoftirqd() is what actually gets called, shown below.
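
For reference, the thread_should_run hook used by the softirq_threads descriptor is tiny; in kernel/softirq.c it essentially just checks the per-cpu pending mask:

static int ksoftirqd_should_run(unsigned int cpu)
{
        return local_softirq_pending();
}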

static void run_ksoftirqd(unsigned int cpu)
{
        ksoftirqd_run_begin();
        if (local_softirq_pending()) {
                /*
                 * We can safely run softirq on inline stack, as we are not deep
                 * in the task stack here.
                 */
               
                __do_softirq();
                ksoftirqd_run_end();
                cond_resched();
                return;
        }
        ksoftirqd_run_end();
}

 

 This brings us to the core softirq function, __do_softirq().

Method 2: reaching __do_softirq() from the normal code flow

 For example, on hard-IRQ exit:

do_IRQ() -> irq_exit() -> invoke_softirq() -> do_softirq() -> __do_softirq()

 Now let's look at __do_softirq() itself:

asmlinkage __visible void __softirq_entry __do_softirq(void)
{
        unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
        unsigned long old_flags = current->flags;
        int max_restart = MAX_SOFTIRQ_RESTART;
        struct softirq_action *h;
        bool in_hardirq;
        __u32 pending;
        int softirq_bit;

        /*
         * Mask out PF_MEMALLOC as the current task context is borrowed for the
         * softirq. A softirq handled, such as network RX, might set PF_MEMALLOC
         * again if the socket is related to swapping.
         */
        current->flags &= ~PF_MEMALLOC;

        pending = local_softirq_pending();

        softirq_handle_begin();
        in_hardirq = lockdep_softirq_start();
        account_softirq_enter(current);

restart:
        /* Reset the pending bitmask before enabling irqs */
        set_softirq_pending(0);

        local_irq_enable();

        h = softirq_vec;

        while ((softirq_bit = ffs(pending))) {
                unsigned int vec_nr;
                int prev_count;

                h += softirq_bit - 1;

                vec_nr = h - softirq_vec;
                prev_count = preempt_count();

                kstat_incr_softirqs_this_cpu(vec_nr);

                trace_softirq_entry(vec_nr);
                h->action(h);     /* this invokes the actual softirq handler for this vector */
                trace_softirq_exit(vec_nr);
                if (unlikely(prev_count != preempt_count())) {
                        pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
                               vec_nr, softirq_to_name[vec_nr], h->action,
                               prev_count, preempt_count());
                        preempt_count_set(prev_count);
                }
                h++;
                pending >>= softirq_bit;
        }

        if (!IS_ENABLED(CONFIG_PREEMPT_RT) &&
            __this_cpu_read(ksoftirqd) == current)
                rcu_softirq_qs();

        local_irq_disable();

        pending = local_softirq_pending();
        if (pending) {
                if (time_before(jiffies, end) && !need_resched() &&
                    --max_restart)
                        goto restart;

                wakeup_softirqd();  /* still pending after the restart budget: defer to ksoftirqd */
        }

        account_softirq_exit(current);
        lockdep_softirq_end(in_hardirq);
        softirq_handle_end();
        current_restore_flags(old_flags, PF_MEMALLOC);
}
 

From the analysis above: softirqs normally run in interrupt context, immediately after the hard IRQ finishes. Why only "normally" rather than "always"? Because in special cases they are handed over to the kernel thread ksoftirqd.

Summary: after looping for a bounded amount of time and iterations, if __do_softirq() still sees pending bits it wakes up ksoftirqd/n and lets the thread take over. The point is that softirq processing must not monopolize the CPU; once it is moved into a thread it becomes schedulable, so everyone gets a fair share of CPU (for the same reason, real-time kernels also thread hard IRQs so they can participate in scheduling). Looking at ksoftirqd/n again, there are two loops: the outer loop, smpboot_thread_fn(), checks whether any softirq bit is set and only then calls ht->thread_fn(td->cpu), which enters the core function __do_softirq(); the inner loop is __do_softirq() itself.
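
The "bounded amount" above comes from two constants near the top of kernel/softirq.c; in 5.10 they look like this (the exact values have occasionally changed between versions):

#define MAX_SOFTIRQ_TIME  msecs_to_jiffies(2)   /* at most ~2ms per __do_softirq() invocation */
#define MAX_SOFTIRQ_RESTART 10                  /* at most 10 passes over the pending mask */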

4. tasklets

Softirq handlers must be reentrant, which raises the design complexity and puts a real burden on device-driver authors. And if a particular use case does not need to run in parallel on multiple CPUs, a raw softirq is overkill. Tasklets were created to address both points. They have the following properties:
a) A given tasklet can only run on one CPU at a time: instances of the same tasklet are serialized, never run in parallel.
b) Different tasklets can run in parallel on different CPUs.
c) Softirqs are statically allocated and cannot be changed after the kernel is built, whereas tasklets are much more flexible and can be created at run time (for example when a module is loaded).
Tasklets are implemented on top of two softirq types, so if you do not need the parallelism of softirqs, a tasklet is the better choice. In other words, a tasklet is a special use of softirqs: deferred work executed serially.

 Source analysis:

init/main.c is where the tasklet lists get initialized:

asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
{

      ....

        softirq_init();   /* initializes the tasklet lists and registers the tasklet softirqs */

      .....
}

 kernel/softirq.c

void __init softirq_init(void)
{
        int cpu;

        for_each_possible_cpu(cpu) {
                per_cpu(tasklet_vec, cpu).tail =
                        &per_cpu(tasklet_vec, cpu).head;    
                per_cpu(tasklet_hi_vec, cpu).tail =
                        &per_cpu(tasklet_hi_vec, cpu).head;
        } /* again per-cpu lists: every CPU has its own tasklet queues */

        open_softirq(TASKLET_SOFTIRQ, tasklet_action); /* two priorities, two handlers */
        open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}
 

 Next, look at the two softirq handlers tasklet_action and tasklet_hi_action. This is why a tasklet is a special case of a softirq: these two functions are registered in softirq_vec[TASKLET_SOFTIRQ] and softirq_vec[HI_SOFTIRQ], and the tasklets that have been queued are only processed when the softirq machinery calls these two handlers.

static __latent_entropy void tasklet_action(struct softirq_action *a)
{
        tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ);
}

static __latent_entropy void tasklet_hi_action(struct softirq_action *a)
{
        tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ);
}

Both end up in the same function, tasklet_action_common():

static void tasklet_action_common(struct softirq_action *a,
                                  struct tasklet_head *tl_head,
                                  unsigned int softirq_nr)
{
        struct tasklet_struct *list;

        local_irq_disable();
        list = tl_head->head; 
        tl_head->head = NULL;
        tl_head->tail = &tl_head->head;
        local_irq_enable();

        while (list) {
                struct tasklet_struct *t = list;  /* take the first tasklet_struct off the list */

                list = list->next;

                if (tasklet_trylock(t)) {
                        if (!atomic_read(&t->count)) {
                                if (!tasklet_clear_sched(t))  /* clear the SCHED bit; it must have been set (serialization on one CPU comes from tasklet_trylock()/STATE_RUN above) */
                                        BUG();
                                if (t->use_callback)
                                        t->callback(t);
                                else
                                        t->func(t->data);
                                tasklet_unlock(t);
                                continue;
                        }
                        tasklet_unlock(t);
                }

                local_irq_disable();
                t->next = NULL;
                *tl_head->tail = t;
                tl_head->tail = &t->next;
                __raise_softirq_irqoff(softirq_nr);   /* could not run it now (locked/disabled): requeue it and re-raise the softirq */
                local_irq_enable();
        }
}
 

 Here is the key tasklet data structure:

struct tasklet_struct
{
        struct tasklet_struct *next;
        unsigned long state;
        atomic_t count;
        bool use_callback;
        union {
                void (*func)(unsigned long data);         /* the handler set up at init time; this is what the tasklet runs */
                void (*callback)(struct tasklet_struct *t);
        };
        unsigned long data;
};
 

 Registering (initializing and scheduling) a tasklet:

void tasklet_init(struct tasklet_struct *t,
                  void (*func)(unsigned long), unsigned long data)
{
        t->next = NULL;
        t->state = 0;
        atomic_set(&t->count, 0);
        t->func = func;    
        t->use_callback = false;
        t->data = data;
}
EXPORT_SYMBOL(tasklet_init);  /* initializes a tasklet_struct */

void __tasklet_schedule(struct tasklet_struct *t)
{
        __tasklet_schedule_common(t, &tasklet_vec,
                                  TASKLET_SOFTIRQ);
}
EXPORT_SYMBOL(__tasklet_schedule);

void __tasklet_hi_schedule(struct tasklet_struct *t)
{
        __tasklet_schedule_common(t, &tasklet_hi_vec,
                                  HI_SOFTIRQ);
}
EXPORT_SYMBOL(__tasklet_hi_schedule);   
/* these two functions queue the tasklet onto the per-cpu list */

static void __tasklet_schedule_common(struct tasklet_struct *t,
                                      struct tasklet_head __percpu *headp,
                                      unsigned int softirq_nr)
{
        struct tasklet_head *head;
        unsigned long flags;

        local_irq_save(flags);
        head = this_cpu_ptr(headp);
        t->next = NULL;
        *head->tail = t;
        head->tail = &(t->next);
        raise_softirq_irqoff(softirq_nr);
        local_irq_restore(flags);
}
 

 Summary: a tasklet is a special case of a softirq; a tasklet only runs when its backing softirq runs.
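
To tie the API together, here is a minimal driver-style sketch of how a tasklet is typically used; mydev_irq() and mydev_do_work() are hypothetical names, and a real driver would pass real device data instead of 0. (Recent kernels prefer the tasklet_setup()/callback form, but the classic tasklet_init() form below matches the struct shown above.)

#include <linux/module.h>
#include <linux/interrupt.h>

static struct tasklet_struct mydev_tasklet;

/* Bottom half: runs in softirq context, so it must not sleep. */
static void mydev_do_work(unsigned long data)
{
        /* ... process whatever the hard-IRQ handler captured ... */
}

/* Top half: acknowledge the hardware quickly, then defer the rest. */
static irqreturn_t mydev_irq(int irq, void *dev_id)
{
        tasklet_schedule(&mydev_tasklet);   /* sets the SCHED state bit and raises TASKLET_SOFTIRQ */
        return IRQ_HANDLED;
}

static int __init mydev_example_init(void)
{
        tasklet_init(&mydev_tasklet, mydev_do_work, 0);
        /* a real driver would also request_irq(irq, mydev_irq, ...) here */
        return 0;
}
module_init(mydev_example_init);
MODULE_LICENSE("GPL");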

5. workqueues

As shown above, softirqs may run in interrupt context, so they must not block or sleep; tasklets are built on softirqs, so the same restriction applies. But what if a deferred handler needs to sleep or block? That is exactly what workqueues are for.
The deferred unit of work is called a work item, described by struct work_struct. Work items are organized into workqueues (struct workqueue_struct), and worker threads execute the work items on the queues. The default worker threads are the "events" threads.
A workqueue is thus another way of deferring work: the work is handed to a kernel thread and this bottom half always executes in process context. Since the worker is a kernel thread it cannot access user space, but the key feature is that workqueue handlers are allowed to be rescheduled and even to sleep.
Choosing between a workqueue and a softirq/tasklet is usually easy; the following rules apply (a usage sketch follows this list):
- If the deferred work needs to sleep, a workqueue is the only option.
- If the deferred work should run after a specified delay, use a (delayed) workqueue, which can use a kernel timer for the delay.
- If the deferred work must run within a tick, use a softirq or tasklet: they can preempt normal processes and kernel threads, and they cannot sleep.
- If the latency of the deferred work does not matter, use a workqueue; this is the usual case for non-critical work.
In essence, a workqueue hands work to a kernel thread, so it could always be replaced by a kernel thread of your own. But creating and destroying kernel threads correctly is demanding, while the workqueue layer wraps that up for you and is much harder to get wrong, so workqueues are the recommended mechanism.
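
As a quick illustration of the API (my own minimal sketch; mywork_fn and the 2-second delay are arbitrary), deferring work to the default system workqueue looks like this:

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/delay.h>

/* Runs in process context in a kworker thread, so sleeping is allowed. */
static void mywork_fn(struct work_struct *work)
{
        msleep(100);    /* legal here; it would be a bug in a softirq or tasklet */
}

static DECLARE_WORK(mywork, mywork_fn);
static DECLARE_DELAYED_WORK(mydwork, mywork_fn);

static int __init mywq_example_init(void)
{
        schedule_work(&mywork);                   /* queue on the system workqueue (system_wq) */
        schedule_delayed_work(&mydwork, 2 * HZ);  /* run roughly 2 seconds later, via a timer */
        return 0;
}
module_init(mywq_example_init);
MODULE_LICENSE("GPL");
/* A real module would cancel_work_sync()/cancel_delayed_work_sync() in its exit path. */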

5.1 Data structures and concepts in workqueue.c

struct work_struct: a work item.

struct workqueue_struct: a collection of work items, i.e. a workqueue; a workqueue relates to work_struct as one-to-many.

struct worker: a worker; in the code each worker corresponds to one worker_thread() kernel thread.

struct worker_pool: a collection of workers, i.e. a worker pool; a worker_pool relates to worker as one-to-many.

struct pool_workqueue: the middleman that links a workqueue to a worker_pool. workqueue to pool_workqueue is one-to-many, and pool_workqueue to worker_pool is one-to-one.

The ultimate goal is simply to hand work items over to workers for execution; the data structures and relationships in between just organize that hand-off more clearly and efficiently. (The original post reproduced a relationship diagram from another article here.)
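
In place of that diagram, the relationships can be sketched roughly like this (my own rendering of the relationships just described, one per-cpu workqueue shown):

/*
 *  workqueue_struct --+-- pool_workqueue (one per CPU or per node) --- worker_pool --+-- worker 0 (kworker/N:0)
 *                     |           (one-to-many)                        (one-to-one)  +-- worker 1 (kworker/N:1)
 *                     +-- pool_workqueue ...                                         +-- ...
 *
 *  work_struct items are queued through a pool_workqueue onto worker_pool->worklist,
 *  and any worker of that pool may pick them up and execute them.
 */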

5.1.1 worker_pool

Each thread that executes work is called a worker, and a set of workers is a worker_pool.

worker_pools come in two kinds:

       1. normal worker_pool, used by ordinary workqueues;

        2. unbound worker_pool, used by WQ_UNBOUND workqueues.

For example, in ps output the kworker threads whose names contain a 'u' (such as kworker/u24:*) belong to the WQ_UNBOUND class, while the rest belong to

normal worker_pools.

5.1.2 normal worker_pool: two pools per CPU

By default, work is processed in a normal worker_pool. The kernel creates two normal worker_pools per CPU: one with normal priority (nice=0) and one with high priority (nice=HIGHPRI_NICE_LEVEL, which is -20), so the workers created from them get different nice values.

Each worker corresponds to one worker_thread() kernel thread, and a worker_pool contains one or more workers. The number of workers in a worker_pool grows and shrinks dynamically with the pool's work load; this is covered in the source analysis later.

You can list all worker threads with ps | grep kworker. The naming rule for normal worker_pool threads (worker_thread()) comes from create_worker():

static struct worker *create_worker(struct worker_pool *pool)
{
        struct worker *worker = NULL;
        int id = -1;
        char id_buf[16];

        /* ID is needed to determine kthread name */
        id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL);
        if (id < 0)
                goto fail;

        worker = alloc_worker(pool->node);
        if (!worker)
                goto fail;

        worker->id = id;

        if (pool->cpu >= 0)   /* build the thread name */
                snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
                         pool->attrs->nice < 0  ? "H" : "");   /* normal worker_pool thread name; "H" = high priority */
        else
                snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); /* WQ_UNBOUND (unbound pool) thread name */

        worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
                                              "kworker/%s", id_buf);

        if (IS_ERR(worker->task))
                goto fail;

        set_user_nice(worker->task, pool->attrs->nice);
        kthread_bind_mask(worker->task, pool->attrs->cpumask);

        /* successful, attach the worker to the pool */
        worker_attach_to_pool(worker, pool);

        /* start the newly created worker */
        raw_spin_lock_irq(&pool->lock);
        worker->pool->nr_workers++;
        worker_enter_idle(worker);
        wake_up_process(worker->task);
        raw_spin_unlock_irq(&pool->lock);

        return worker;

fail:
        if (id >= 0)
                ida_simple_remove(&pool->worker_ida, id);
        kfree(worker);
        return NULL;
}
 

So names of this shape belong to normal worker_pools:

shell@PRO5:/ $ ps | grep "kworker"

root 14 2 0 0 worker_thr 0000000000 S kworker/1:0H   // worker 0 of CPU1's high-priority worker_pool

root 17 2 0 0 worker_thr 0000000000 S kworker/2:0    // worker 0 of CPU2's normal-priority worker_pool

root 18 2 0 0 worker_thr 0000000000 S kworker/2:0H   // worker 0 of CPU2's high-priority worker_pool

root 23699 2 0 0 worker_thr 0000000000 S kworker/0:1 // worker 1 of CPU0's normal-priority worker_pool

The normal worker_pool layout is the same for cpu0 through cpuN: each CPU has one normal-priority and one high-priority pool. (The original post reproduced a topology diagram from another article here.)

 

5.1.3 unbound worker_pool

        Most work goes through normal worker_pools (for example, work queued onto the system workqueue system_wq via schedule_work()/schedule_work_on()) and is ultimately executed by workers of a normal worker_pool. Those workers are bound to a particular CPU, and once a worker starts executing a work item it keeps running on that CPU and does not migrate.

        An unbound worker_pool is the opposite: its workers can be scheduled across multiple CPUs. Strictly speaking they are still bound, just not to a CPU but to a NUMA node. On a NUMA (Non-Uniform Memory Access) system there may be several nodes, each containing one or more CPUs.

The naming rule for unbound worker_pool kernel threads (worker_thread()) looks like this:

shell@PRO5:/ $ ps | grep "kworker"

root 23906 2 0 0 worker_thr 0000000000 S kworker/u20:2 // worker 2 of unbound pool 20

root 24564 2 0 0 worker_thr 0000000000 S kworker/u20:0 // worker 0 of unbound pool 20

root 24622 2 0 0 worker_thr 0000000000 S kworker/u21:1 // worker 1 of unbound pool 21

Unbound worker_pools themselves come in two kinds (a small usage sketch follows):

        1. unbound_std_wq: one worker_pool per NUMA node, so multiple nodes mean multiple worker_pools;

        2. ordered_wq: a single default worker_pool shared by all nodes.
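
Drivers normally reach these pools through the allocation flags rather than touching them directly; a minimal sketch (the queue names here are made up):

#include <linux/workqueue.h>

static struct workqueue_struct *my_unbound_wq;
static struct workqueue_struct *my_ordered_wq;

static int __init my_wq_setup(void)
{
        /* unbound: its workers may run on any CPU of their NUMA node */
        my_unbound_wq = alloc_workqueue("my_unbound", WQ_UNBOUND, 0);

        /* ordered: at most one work item executes at a time, in queueing order;
         * internally this is WQ_UNBOUND | __WQ_ORDERED with max_active = 1
         * (see alloc_workqueue() below)
         */
        my_ordered_wq = alloc_ordered_workqueue("my_ordered", 0);

        if (!my_unbound_wq || !my_ordered_wq)
                return -ENOMEM;
        return 0;
}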

5.1.3.1 unbound_std_wq data-structure relationships

Each NUMA node gets its own worker_pool; multiple nodes mean multiple worker_pools. (The original post showed a topology diagram here.)

5.1.3.2 ordered_wq data-structure relationships

All nodes share a single default worker_pool. (The original post showed a topology diagram here.)

The data-structure relationships above are the foundation for reading the source; keep them in mind during the analysis below.

Source analysis, kernel version: 5.10.120.

During boot (init/main.c), workqueue_init_early() runs first and workqueue_init() runs later:

1. start_kernel -> workqueue_init_early()

2. start_kernel -> arch_call_rest_init -> rest_init -> kernel_init -> kernel_init_freeable -> workqueue_init

Now look at workqueue_init_early() in kernel/workqueue.c:

void __init workqueue_init_early(void)
{
        int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };

// the two worker_pool nice levels: 0 (normal) and -20 (high priority)
        int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
        int i, cpu;

        BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

        BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
        cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));

        pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

// static struct kmem_cache *pwq_cache; create the slab cache for pool_workqueue objects

        /* initialize CPU pools */
        for_each_possible_cpu(cpu) {
                struct worker_pool *pool;

                i = 0;

// walk each CPU's cpu_worker_pools[2] and initialize every worker_pool
                for_each_cpu_worker_pool(pool, cpu) {
                        BUG_ON(init_worker_pool(pool)); // basic init of the worker_pool; no workers yet
                        pool->cpu = cpu;          // bind the pool to this CPU
                        cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
                        pool->attrs->nice = std_nice[i++];  // set the pool's nice value
                        pool->node = cpu_to_node(cpu);  // set the pool's NUMA node

                        /* alloc pool ID */
                        mutex_lock(&wq_pool_mutex);
                        BUG_ON(worker_pool_assign_id(pool)); // allocate a pool ID
                        mutex_unlock(&wq_pool_mutex);
                }
        }

// below: create the attrs for unbound worker_pools, again with normal and high priority

        /* create default unbound and ordered wq attrs */
        for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
                struct workqueue_attrs *attrs;

                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                unbound_std_wq_attrs[i] = attrs;

                /*
                 * An ordered wq should have only one pwq as ordering is
                 * guaranteed by max_active which is enforced by pwqs.
                 * Turn off NUMA so that dfl_pwq is used for all nodes.
                 */
                BUG_ON(!(attrs = alloc_workqueue_attrs()));
                attrs->nice = std_nice[i];
                attrs->no_numa = true;
                ordered_wq_attrs[i] = attrs;
        }

        system_wq = alloc_workqueue("events", 0, 0);
        system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
        system_long_wq = alloc_workqueue("events_long", 0, 0);
        system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
                                            WQ_UNBOUND_MAX_ACTIVE);
        system_freezable_wq = alloc_workqueue("events_freezable",
                                              WQ_FREEZABLE, 0);
        system_power_efficient_wq = alloc_workqueue("events_power_efficient",
                                              WQ_POWER_EFFICIENT, 0);
        system_freezable_power_efficient_wq = alloc_workqueue("events_freezable_power_efficient",
                                              WQ_FREEZABLE | WQ_POWER_EFFICIENT,
                                              0);
        BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
               !system_unbound_wq || !system_freezable_wq ||
               !system_power_efficient_wq ||
               !system_freezable_power_efficient_wq);
}
 

 The function above creates the following system-wide workqueues:

struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_highpri_wq);
struct workqueue_struct *system_long_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_long_wq);
struct workqueue_struct *system_unbound_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
struct workqueue_struct *system_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_power_efficient_wq);
struct workqueue_struct *system_freezable_power_efficient_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq);
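
These queues back the convenience helpers used all over the kernel; for example schedule_work() is just a thin wrapper that targets system_wq (from include/linux/workqueue.h, shown here slightly abridged):

static inline bool schedule_work(struct work_struct *work)
{
        return queue_work(system_wq, work);
}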
 

 workqueue_init_early() calls init_worker_pool() to initialize each worker_pool:

static int init_worker_pool(struct worker_pool *pool)
{
        raw_spin_lock_init(&pool->lock);
        pool->id = -1;
        pool->cpu = -1;
        pool->node = NUMA_NO_NODE;
        pool->flags |= POOL_DISASSOCIATED;
        pool->watchdog_ts = jiffies; // timestamp used by the workqueue watchdog for stall detection


        INIT_LIST_HEAD(&pool->worklist);

// the worker_pool's work list: workqueues hang their work items here for the pool's workers to execute


        INIT_LIST_HEAD(&pool->idle_list);

// the worker_pool's idle worker list: a worker with nothing to do is not destroyed immediately but parked here as a standby
        hash_init(pool->busy_hash);

// the worker_pool's busy workers are hashed here while they are executing work

        timer_setup(&pool->idle_timer, idle_worker_timeout, TIMER_DEFERRABLE);

// timer that checks whether idle workers should be destroyed: a worker idle for 300s (IDLE_WORKER_TIMEOUT) can be reclaimed

        timer_setup(&pool->mayday_timer, pool_mayday_timeout, 0);

// timer that checks for timeouts while the worker_pool is creating a new worker (mayday/rescuer path)

        INIT_LIST_HEAD(&pool->workers);

        ida_init(&pool->worker_ida);
        INIT_HLIST_NODE(&pool->hash_node);
        pool->refcnt = 1;

        /* shouldn't fail above this point */
        pool->attrs = alloc_workqueue_attrs();
        if (!pool->attrs)
                return -ENOMEM;
        return 0;
}

 

The functions above initialize the per-cpu (normal) worker_pools and the attrs used for unbound worker_pools.

Next comes workqueue_init(), the second stage of initialization:

void __init workqueue_init(void)
{
        struct workqueue_struct *wq;
        struct worker_pool *pool;
        int cpu, bkt;

        /*
         * It'd be simpler to initialize NUMA in workqueue_init_early() but
         * CPU to node mapping may not be available that early on some
         * archs such as power and arm64.  As per-cpu pools created
         * previously could be missing node hint and unbound pools NUMA
         * affinity, fix them up.
         *
         * Also, while iterating workqueues, create rescuers if requested.
         */
        wq_numa_init();

        mutex_lock(&wq_pool_mutex);

        for_each_possible_cpu(cpu) {
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->node = cpu_to_node(cpu);  // fix up the worker_pool's node now that the CPU-to-node mapping is known
                }
        }

// update unbound NUMA affinity for every workqueue that already exists in the system

        list_for_each_entry(wq, &workqueues, list) {
                wq_update_unbound_numa(wq, smp_processor_id(), true);
                WARN(init_rescuer(wq),
                     "workqueue: failed to create early rescuer for %s",
                     wq->name);
        }

        mutex_unlock(&wq_pool_mutex);

// create the first worker thread for each online CPU's worker_pools

        /* create the initial workers */
        for_each_online_cpu(cpu) {
                for_each_cpu_worker_pool(pool, cpu) {
                        pool->flags &= ~POOL_DISASSOCIATED;
                        BUG_ON(!create_worker(pool));
                }
        }

// create the first worker for each unbound pool found in the unbound_pool_hash hash table

        hash_for_each(unbound_pool_hash, bkt, pool, hash_node)
                BUG_ON(!create_worker(pool));

        wq_online = true;
        wq_watchdog_init();
}
 

 Next, look at create_worker(pool) to see how that first worker is created:

static struct worker *create_worker(struct worker_pool *pool)
{
        struct worker *worker = NULL;
        int id = -1;
        char id_buf[16];

        /* ID is needed to determine kthread name */
        id = ida_simple_get(&pool->worker_ida, 0, 0, GFP_KERNEL); // allocate the worker ID
        if (id < 0)
                goto fail;

        worker = alloc_worker(pool->node); // allocate the worker on the pool's NUMA node
        if (!worker)
                goto fail;

        worker->id = id;

        if (pool->cpu >= 0)
                snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
                         pool->attrs->nice < 0  ? "H" : ""); // normal worker_pool thread name
        else
                snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); // unbound worker_pool thread name

// create the kernel thread: worker_thread() is the thread function, "kworker/%s" the name; created here but not yet running

        worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
                                              "kworker/%s", id_buf);
        if (IS_ERR(worker->task))
                goto fail;

// set the thread's nice value

        set_user_nice(worker->task, pool->attrs->nice);

// set the thread's CPU affinity
        kthread_bind_mask(worker->task, pool->attrs->cpumask);

        /* successful, attach the worker to the pool */

// attach the worker to the worker_pool
        worker_attach_to_pool(worker, pool);

        /* start the newly created worker */
        raw_spin_lock_irq(&pool->lock);
        worker->pool->nr_workers++;

// put the worker into idle state initially
        worker_enter_idle(worker);

// after wake_up_process() the worker leaves the idle state on its own
        wake_up_process(worker->task);
        raw_spin_unlock_irq(&pool->lock);

        return worker;

fail:
        if (id >= 0)
                ida_simple_remove(&pool->worker_ida, id);
        kfree(worker);
        return NULL;
}

 

 Now go back to workqueue_init_early(). As quoted above, it builds the unbound and ordered attrs and then calls alloc_workqueue() for the system workqueues; alloc_workqueue() with WQ_UNBOUND is where the unbound worker_pools actually get created.

Next, analyze alloc_workqueue(): allocating a workqueue is what creates the unbound worker_pools, so this is the place we were looking for. The rough flow is: alloc_workqueue() -> alloc_and_link_pwqs() -> apply_workqueue_attrs() -> alloc_unbound_pwq() / numa_pwq_tbl_install().

__printf(1, 4)
struct workqueue_struct *alloc_workqueue(const char *fmt,
                                         unsigned int flags,
                                         int max_active, ...)
{
        size_t tbl_size = 0;
        va_list args; // for parsing the name format arguments
        struct workqueue_struct *wq;
        struct pool_workqueue *pwq; 

        /*
         * Unbound && max_active == 1 used to imply ordered, which is no
         * longer the case on NUMA machines due to per-node pools.  While
         * alloc_ordered_workqueue() is the right way to create an ordered
         * workqueue, keep the previous behavior to avoid subtle breakages
         * on NUMA.
         */

// WQ_UNBOUND with max_active == 1 implies an ordered workqueue
        if ((flags & WQ_UNBOUND) && max_active == 1)
                flags |= __WQ_ORDERED; 

        /* see the comment above the definition of WQ_POWER_EFFICIENT */
        if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
                flags |= WQ_UNBOUND;

        /* allocate wq and format name */
        if (flags & WQ_UNBOUND)
                tbl_size = nr_node_ids * sizeof(wq->numa_pwq_tbl[0]); // numa_pwq_tbl indexes the unbound pwqs by node

// allocate memory for the workqueue_struct object

        wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
        if (!wq)
                return NULL;

        if (flags & WQ_UNBOUND) {
                wq->unbound_attrs = alloc_workqueue_attrs(); // attrs later filled from unbound_std_wq_attrs or ordered_wq_attrs
                if (!wq->unbound_attrs)
                        goto err_free_wq;
        }

        va_start(args, max_active);
        vsnprintf(wq->name, sizeof(wq->name), fmt, args);
        va_end(args);

// max_active limits how many works of this pwq may be active in the worker_pool; 0 means the default of 256 (WQ_DFL_ACTIVE)

        max_active = max_active ?: WQ_DFL_ACTIVE;
        max_active = wq_clamp_max_active(max_active, flags, wq->name);

        /* init wq */
        wq->flags = flags;
        wq->saved_max_active = max_active;
        mutex_init(&wq->mutex);
        atomic_set(&wq->nr_pwqs_to_flush, 0);
        INIT_LIST_HEAD(&wq->pwqs);
        INIT_LIST_HEAD(&wq->flusher_queue);
        INIT_LIST_HEAD(&wq->flusher_overflow);
        INIT_LIST_HEAD(&wq->maydays);

        wq_init_lockdep(wq);
        INIT_LIST_HEAD(&wq->list);

// allocate the pool_workqueues for this workqueue; the pool_workqueue links the workqueue to a worker_pool

        if (alloc_and_link_pwqs(wq) < 0)
                goto err_unreg_lockdep;

// for a WQ_MEM_RECLAIM workqueue, create the corresponding rescuer_thread() kernel thread

        if (wq_online && init_rescuer(wq) < 0)
                goto err_destroy;

// for WQ_SYSFS, create the workqueue's sysfs files

        if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
                goto err_destroy;

        /*
         * wq_pool_mutex protects global freeze state and workqueues list.
         * Grab it, adjust max_active and add the new @wq to workqueues
         * list.
         */
        mutex_lock(&wq_pool_mutex);

        mutex_lock(&wq->mutex);
        for_each_pwq(pwq, wq)
                pwq_adjust_max_active(pwq);
        mutex_unlock(&wq->mutex);

// add the new workqueue to the global workqueues list

        list_add_tail_rcu(&wq->list, &workqueues);

        mutex_unlock(&wq_pool_mutex);

        return wq;

err_unreg_lockdep:
        wq_unregister_lockdep(wq);
        wq_free_lockdep(wq);
err_free_wq:
        free_workqueue_attrs(wq->unbound_attrs);
        kfree(wq);
        return NULL;
err_destroy:
        destroy_workqueue(wq);
        return NULL;
}
EXPORT_SYMBOL_GPL(alloc_workqueue);

 Next, alloc_and_link_pwqs():

static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
        bool highpri = wq->flags & WQ_HIGHPRI; // is this a high-priority workqueue?
        int cpu, ret;

// normal (per-cpu) workqueue: link workqueue and worker_pool via pool_workqueue

        if (!(wq->flags & WQ_UNBOUND)) {

// allocate one pool_workqueue per CPU for this workqueue
                wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
                if (!wq->cpu_pwqs)
                        return -ENOMEM;

                for_each_possible_cpu(cpu) {
                        struct pool_workqueue *pwq =
                                per_cpu_ptr(wq->cpu_pwqs, cpu);
                        struct worker_pool *cpu_pools =
                                per_cpu(cpu_worker_pools, cpu);

// hand the already-created normal worker_pool to the pool_workqueue

                        init_pwq(pwq, wq, &cpu_pools[highpri]);

                        mutex_lock(&wq->mutex);

// link the pool_workqueue into the workqueue
                        link_pwq(pwq);
                        mutex_unlock(&wq->mutex);
                }
                return 0;
        }

        get_online_cpus(); // pin CPU hotplug

// ordered workqueue: apply ordered_wq_attrs
        if (wq->flags & __WQ_ORDERED) {
                ret = apply_workqueue_attrs(wq, ordered_wq_attrs[highpri]);
                /* there should only be single pwq for ordering guarantee */
                WARN(!ret && (wq->pwqs.next != &wq->dfl_pwq->pwqs_node ||
                              wq->pwqs.prev != &wq->dfl_pwq->pwqs_node),
                     "ordering guarantee broken for workqueue %s\n", wq->name);
        } else {
                ret = apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
        }
        put_online_cpus(); // release CPU hotplug

        return ret;
}

 

 Next, apply_workqueue_attrs():

int apply_workqueue_attrs(struct workqueue_struct *wq,
                          const struct workqueue_attrs *attrs)
{
        int ret;

        lockdep_assert_cpus_held();

        mutex_lock(&wq_pool_mutex);
        ret = apply_workqueue_attrs_locked(wq, attrs);
        mutex_unlock(&wq_pool_mutex);

        return ret;
}

 apply_workqueue_attrs_locked()

static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct apply_wqattrs_ctx *ctx;

        /* only unbound workqueues can change attributes */
        if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
                return -EINVAL;

        /* creating multiple pwqs breaks ordering guarantee */
        if (!list_empty(&wq->pwqs)) {
                if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
                        return -EINVAL;

                wq->flags &= ~__WQ_ORDERED;
        }

        ctx = apply_wqattrs_prepare(wq, attrs);
        if (!ctx)
                return -ENOMEM;

        /* the ctx has been prepared successfully, let's commit it */
        apply_wqattrs_commit(ctx);
        apply_wqattrs_cleanup(ctx);

        return 0;
}

 

 apply_wqattrs_prepare()

/* allocate the attrs and pwqs for later installation */
static struct apply_wqattrs_ctx *
apply_wqattrs_prepare(struct workqueue_struct *wq,
                      const struct workqueue_attrs *attrs)
{

// from the unbound attrs (ordered_wq_attrs / unbound_std_wq_attrs) create the matching pool_workqueue
// and worker_pool. Unlike the per-cpu pools, these worker_pools are not pre-created: they, and their
// worker kernel threads, are created dynamically here; the resulting pool_workqueue goes into pwq_tbl[node]
        struct apply_wqattrs_ctx *ctx;
        struct workqueue_attrs *new_attrs, *tmp_attrs;
        int node;

        lockdep_assert_held(&wq_pool_mutex);

        ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL);

        new_attrs = alloc_workqueue_attrs();
        tmp_attrs = alloc_workqueue_attrs();
        if (!ctx || !new_attrs || !tmp_attrs)
                goto out_free;

        /*
         * Calculate the attrs of the default pwq.
         * If the user configured cpumask doesn't overlap with the
         * wq_unbound_cpumask, we fallback to the wq_unbound_cpumask.
         */
        copy_workqueue_attrs(new_attrs, attrs);
        cpumask_and(new_attrs->cpumask, new_attrs->cpumask, wq_unbound_cpumask);
        if (unlikely(cpumask_empty(new_attrs->cpumask)))
                cpumask_copy(new_attrs->cpumask, wq_unbound_cpumask);

        /*
         * We may create multiple pwqs with differing cpumasks.  Make a
         * copy of @new_attrs which will be modified and used to obtain
         * pools.
         */
        copy_workqueue_attrs(tmp_attrs, new_attrs);

        /*
         * If something goes wrong during CPU up/down, we'll fall back to
         * the default pwq covering whole @attrs->cpumask.  Always create
         * it even if we don't use it immediately.
         */
        ctx->dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
        if (!ctx->dfl_pwq)
                goto out_free;

        for_each_node(node) {
                if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
                        ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
                        if (!ctx->pwq_tbl[node])
                                goto out_free;
                } else {
                        ctx->dfl_pwq->refcnt++;
                        ctx->pwq_tbl[node] = ctx->dfl_pwq;
                }
        }

        /* save the user configured attrs and sanitize it. */
        copy_workqueue_attrs(new_attrs, attrs);
        cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
        ctx->attrs = new_attrs;

        ctx->wq = wq;
        free_workqueue_attrs(tmp_attrs);
        return ctx;

out_free:
        free_workqueue_attrs(tmp_attrs);
        free_workqueue_attrs(new_attrs);
        apply_wqattrs_cleanup(ctx);
        return NULL;
}
 

Next: alloc_unbound_pwq()

static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
                                        const struct workqueue_attrs *attrs)
{
        struct worker_pool *pool;
        struct pool_workqueue *pwq;

        lockdep_assert_held(&wq_pool_mutex);

        pool = get_unbound_pool(attrs);
        if (!pool)
                return NULL;

// pwq_cache is the slab cache created back in workqueue_init_early()

        pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
        if (!pwq) {
                put_unbound_pool(pool);
                return NULL;
        }

        init_pwq(pwq, wq, pool);
        return pwq;
}
 

 Next, init_pwq():

/* initialize newly alloced @pwq which is associated with @wq and @pool */
static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
                     struct worker_pool *pool)
{
        BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);

        memset(pwq, 0, sizeof(*pwq));

// the pwq is the middleman: it links the workqueue side to the worker_pool side

        pwq->pool = pool;  // the worker_pool this pwq feeds
        pwq->wq = wq; // the workqueue this pwq belongs to
        pwq->flush_color = -1; // not currently being flushed
        pwq->refcnt = 1;
        INIT_LIST_HEAD(&pwq->delayed_works);
        INIT_LIST_HEAD(&pwq->pwqs_node);
        INIT_LIST_HEAD(&pwq->mayday_node);
        INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
}

 

 5.2 worker

Each worker corresponds to one worker_thread() kernel thread, and a worker_pool owns one or more workers. Multiple workers pull work from the same list, worker_pool->worklist, and process it.

So there are two key questions: how does a worker process work,

and how does a worker_pool dynamically manage the number of workers?

5.2.1 How a worker processes work

Work is processed mainly in worker_thread() -> process_one_work(); let's walk through the implementation in kernel/workqueue.c: worker_thread() -> process_one_work().

static int worker_thread(void *__worker)
{
        struct worker *worker = __worker;
        struct worker_pool *pool = worker->pool;

        /* tell the scheduler that this is a workqueue worker */
        set_pf_worker(true);
woke_up:
        raw_spin_lock_irq(&pool->lock);

        /* am I supposed to die? */

// has this worker been flagged WORKER_DIE?
        if (unlikely(worker->flags & WORKER_DIE)) {
                raw_spin_unlock_irq(&pool->lock);
                WARN_ON_ONCE(!list_empty(&worker->entry));
                set_pf_worker(false);

                set_task_comm(worker->task, "kworker/dying");
                ida_simple_remove(&pool->worker_ida, worker->id);
                worker_detach_from_pool(worker);
                kfree(worker);
                return 0;
        }

// leave the idle state; the worker was idle until it was woken up

        worker_leave_idle(worker);
recheck:

// keep running only if this worker is still needed, otherwise go idle.
// need_more_worker() means: (pool->worklist is not empty) && (pool->nr_running == 0),
// i.e. there is work queued and no worker is currently running
        /* no more worker necessary? */
        if (!need_more_worker(pool))
                goto sleep;

        /* do we need to manage? */

// if pool->nr_idle == 0 there are no standby workers left on the idle list, so create some more before continuing
        if (unlikely(!may_start_working(pool)) && manage_workers(worker))
                goto recheck;

        /*
         * ->scheduled list can only be filled while a worker is
         * preparing to process a work or actually processing it.
         * Make sure nobody diddled with it while I was sleeping.
         */
        WARN_ON_ONCE(!list_empty(&worker->scheduled));

        /*
         * Finish PREP stage.  We're guaranteed to have at least one idle
         * worker or that someone else has already assumed the manager
         * role.  This is where @worker starts participating in concurrency
         * management if applicable and concurrency management is restored
         * after being rebound.  See rebind_workers() for details.
         */
        worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

        do {
                struct work_struct *work =
                        list_first_entry(&pool->worklist,
                                         struct work_struct, entry);

                pool->watchdog_ts = jiffies;

                if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
                        /* optimization path, not strictly necessary */
                        process_one_work(worker, work);
                        if (unlikely(!list_empty(&worker->scheduled)))
                                process_scheduled_works(worker);
                } else {

// execute work that was deliberately scheduled to this particular worker. Normal work sits on the pool's
// shared list, pool->worklist; only special work is pushed onto a specific worker->scheduled list,
// including: 1) barrier work inserted by flush_work(); 2) work moved here from another worker on collision
                        move_linked_works(work, &worker->scheduled, NULL);
                        process_scheduled_works(worker);
                }

// keep_working() condition: pool->worklist is not empty && pool->nr_running <= 1
        } while (keep_working(pool));

        worker_set_flags(worker, WORKER_PREP);
sleep:
        /*
         * pool->lock is held and there's no work to process and no need to
         * manage, sleep.  Workers are woken up only while holding
         * pool->lock or from local cpu, so setting the current state
         * before releasing pool->lock is enough to prevent losing any
         * event.
         */
        worker_enter_idle(worker);
        __set_current_state(TASK_IDLE);
        raw_spin_unlock_irq(&pool->lock);
        schedule();
        goto woke_up;
}
 

 Next, process_one_work():

static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct worker_pool *pool = worker->pool;
        bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
        int work_color;
        struct worker *collision;
#ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the struct work_struct from
         * inside the function that is called from it, this we need to
         * take into account for lockdep too.  To avoid bogus "held
         * lock freed" warnings as well as problems when looking into
         * work->lockdep_map, make a copy and use that here.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
        /* ensure we're on the correct CPU */
        WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
                     raw_smp_processor_id() != pool->cpu);

        /*
         * A single work shouldn't be executed concurrently by
         * multiple workers on a single cpu.  Check whether anyone is
         * already processing the work.  If so, defer the work to the
         * currently executing one.
         */

// if the work is already being executed by another worker of this worker_pool, move it onto that worker's scheduled list and defer it
        collision = find_worker_executing_work(pool, work);
        if (unlikely(collision)) {
                move_linked_works(work, &collision->scheduled, NULL);
                return;
        }

        /* claim and dequeue */
        debug_work_deactivate(work);

// add this worker to the pool's busy hash, pool->busy_hash
        hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
        worker->current_work = work;
        worker->current_func = work->func;
        worker->current_pwq = pwq;
        work_color = get_work_color(work);

        /*
         * Record wq name for cmdline and debug reporting, may get
         * overridden through set_worker_desc().
         */
        strscpy(worker->desc, pwq->wq->name, WORKER_DESC_LEN);

        list_del_init(&work->entry);

        /*
         * CPU intensive works don't participate in concurrency management.
         * They're the scheduler's responsibility.  This takes @worker out
         * of concurrency management and the next code block will chain
         * execution of the pending work items.
         */

// if the work's wq is WQ_CPU_INTENSIVE, its execution leaves the worker_pool's concurrency management and behaves like an independent thread
        if (unlikely(cpu_intensive))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE);

        /*
         * Wake up another worker if necessary.  The condition is always
         * false for normal per-cpu workers since nr_running would always
         * be >= 1 at this point.  This is used to chain execution of the
         * pending work items for WORKER_NOT_RUNNING workers such as the
         * UNBOUND and CPU_INTENSIVE ones.
         */
        if (need_more_worker(pool))
                wake_up_worker(pool);

        /*
         * Record the last pool and clear PENDING which should be the last
         * update to @work.  Also, do this inside @pool->lock so that
         * PENDING and queued state changes happen together while IRQ is
         * disabled.
         */
        set_work_pool_and_clear_pending(work, pool->id);

        raw_spin_unlock_irq(&pool->lock);

        lock_map_acquire(&pwq->wq->lockdep_map);
        lock_map_acquire(&lockdep_map);

        /*
         * Strictly speaking we should mark the invariant state without holding
         * any locks, that is, before these two lock_map_acquire()'s.
         *
         * However, that would result in:
         *
         *   A(W1)
         *   WFC(C)
         *              A(W1)
         *              C(C)
         *
         * Which would create W1->C->W1 dependencies, even though there is no
         * actual deadlock possible. There are two solutions, using a
         * read-recursive acquire on the work(queue) 'locks', but this will then
         * hit the lockdep limitation on recursive locks, or simply discard
         * these locks.
         *
         * AFAICT there is no possible deadlock scenario between the
         * flush_work() and complete() primitives (except for single-threaded
         * workqueues), so hiding them isn't a problem.
         */
        lockdep_invariant_state(true);
        trace_workqueue_execute_start(work);

// run the work function: worker->current_func(work)
        worker->current_func(work);
        /*
         * While we must be careful to not use "work" after this, the trace
         * point will only record its address.
         */
        trace_workqueue_execute_end(work, worker->current_func);
        lock_map_release(&lockdep_map);
        lock_map_release(&pwq->wq->lockdep_map);

        if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
                pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
                       "     last function: %ps\n",
                       current->comm, preempt_count(), task_pid_nr(current),
                       worker->current_func);
                debug_show_held_locks(current);
                dump_stack();
        }

        /*
         * The following prevents a kworker from hogging CPU on !PREEMPTION
         * kernels, where a requeueing work item waiting for something to
         * happen could deadlock with stop_machine as such work item could
         * indefinitely requeue itself while all other CPUs are trapped in
         * stop_machine. At the same time, report a quiescent RCU state so
         * the same condition doesn't freeze RCU.
         */
        cond_resched();

        raw_spin_lock_irq(&pool->lock);

        /* clear cpu intensive status */
        if (unlikely(cpu_intensive))
                worker_clr_flags(worker, WORKER_CPU_INTENSIVE);

        /* tag the worker for identification in schedule() */
        worker->last_func = worker->current_func;

        /* we're done with it, release */
        hash_del(&worker->hentry);
        worker->current_work = NULL;
        worker->current_func = NULL;
        worker->current_pwq = NULL;
        pwq_dec_nr_in_flight(pwq, work_color);
}
 

5.2.2 How worker_pool dynamically manages workers

How does a worker_pool grow and shrink its set of workers? This is the core of the algorithm. The idea is as follows. A worker in a worker_pool is in one of three states: idle, running, or suspended (blocked).

- If the worker_pool has work to process, it keeps at least one running worker to handle it.

- If a running worker blocks (suspends) while processing work, a new idle worker is woken up so the remaining work keeps making progress.

- If there is work to execute and more than one worker is running, the surplus running workers are put back into the idle state.

- If there is no work to execute, all workers go idle.

- If too many workers were created, an idle worker that has not run again within 300s (IDLE_WORKER_TIMEOUT) is destroyed.

For the detailed code, refer to the worker_thread() -> process_one_work() analysis in the previous section.

To track workers moving between running and suspended, so that the worker count can be adjusted dynamically, the 5.10 kernel hooks worker state checks into the main scheduler, schedule().

Here is the main scheduler function, schedule():

asmlinkage __visible void __sched schedule(void)
{
        struct task_struct *tsk = current;

        sched_submit_work(tsk);  // the first added hook
        do {
                preempt_disable();
                __schedule(false, false);
                sched_preempt_enable_no_resched();
        } while (need_resched());
        sched_update_worker(tsk); // the second added hook
}
EXPORT_SYMBOL(schedule);
 

 The first hook, sched_submit_work(tsk):

static inline void sched_submit_work(struct task_struct *tsk)
{
        unsigned int task_flags;

        if (!tsk->state)
                return;

        task_flags = tsk->flags;
        /*
         * If a worker went to sleep, notify and ask workqueue whether
         * it wants to wake up a task to maintain concurrency.
         * As this function is called inside the schedule() context,
         * we disable preemption to avoid it calling schedule() again
         * in the possible wakeup of a kworker and because wq_worker_sleeping()
         * requires it.
         */
        if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
                preempt_disable();
                if (task_flags & PF_WQ_WORKER)
                        wq_worker_sleeping(tsk);     // tell the workqueue code this worker is going to sleep
                else
                        io_wq_worker_sleeping(tsk);
                preempt_enable_no_resched();
        }

        /*
         * If we are going to sleep and we have plugged IO queued,
         * make sure to submit it to avoid deadlocks.
         */
        if (blk_needs_flush_plug(tsk))
                blk_schedule_flush_plug(tsk);
}
 

Next, analyze wq_worker_sleeping(), which tracks a worker transitioning from running to sleeping:

void wq_worker_sleeping(struct task_struct *task)
{
        struct worker *next, *worker = kthread_data(task);
        struct worker_pool *pool;

        /*
         * Rescuers, which may not have all the fields set up like normal
         * workers, also reach here, let's not access anything before
         * checking NOT_RUNNING.
         */
        if (worker->flags & WORKER_NOT_RUNNING)
                return;

        pool = worker->pool;

        /* Return if preempted before wq_worker_running() was reached */
        if (worker->sleeping)
                return;

        worker->sleeping = 1;
        raw_spin_lock_irq(&pool->lock);

        /*
         * The counterpart of the following dec_and_test, implied mb,
         * worklist not empty test sequence is in insert_work().
         * Please read comment there.
         *
         * NOT_RUNNING is clear.  This means that we're bound to and
         * running on the local cpu w/ rq lock held and preemption
         * disabled, which in turn means that none else could be
         * manipulating idle_list, so dereferencing idle_list without pool
         * lock is safe.
         */
        if (atomic_dec_and_test(&pool->nr_running) &&
            !list_empty(&pool->worklist)) {
                next = first_idle_worker(pool);
                if (next)
                        wake_up_process(next->task);
        }
        raw_spin_unlock_irq(&pool->lock);
}
 

The second hook, sched_update_worker(tsk):

static void sched_update_worker(struct task_struct *tsk)
{
        if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
                if (tsk->flags & PF_WQ_WORKER)
                        wq_worker_running(tsk);
                else
                        io_wq_worker_running(tsk);
        }
}

 

wq_worker_running(): a worker is running again

void wq_worker_running(struct task_struct *task)
{
        struct worker *worker = kthread_data(task);

        if (!worker->sleeping)
                return;

        /*
         * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
         * and the nr_running increment below, we may ruin the nr_running reset
         * and leave with an unexpected pool->nr_running == 1 on the newly unbound
         * pool. Protect against such race.
         */
        preempt_disable();
        if (!(worker->flags & WORKER_NOT_RUNNING))
                atomic_inc(&worker->pool->nr_running);
        preempt_enable();
        worker->sleeping = 0;
}

The scheduling idea of the worker_pool is therefore: if there is work to process, keep exactly one worker in the running state to handle it, no more and no fewer.

There is a problem, however: if a work item is CPU intensive, it never enters the suspended state, yet it can occupy the CPU for a long time and block the following work items for far too long.

To solve this, WQ_CPU_INTENSIVE was introduced. If a workqueue declares itself CPU_INTENSIVE, its current worker is taken out of concurrency management, as if it had entered the suspended state, so a new worker can be created and the following work items still get executed.
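As a usage sketch (the workqueue name and the function below are hypothetical, not from the original text), a driver whose work items are known to burn CPU for long stretches can request this behaviour when allocating its workqueue:

#include <linux/workqueue.h>

static struct workqueue_struct *crunch_wq;

static int __init crunch_init(void)
{
        /*
         * WQ_CPU_INTENSIVE: work items on this wq are excluded from
         * concurrency management, so a long-running item will not starve
         * other work queued on the same per-cpu worker_pool.
         */
        crunch_wq = alloc_workqueue("crunch_wq", WQ_CPU_INTENSIVE, 0);
        if (!crunch_wq)
                return -ENOMEM;
        return 0;
}

Back in kernel/workqueue.c, the relevant path inside process_one_work() looks like this: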

static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
        struct pool_workqueue *pwq = get_work_pwq(work);
        struct worker_pool *pool = worker->pool;
        bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
        int work_color;
        struct worker *collision;

...

 /*
         * CPU intensive works don't participate in concurrency management.
         * They're the scheduler's responsibility.  This takes @worker out
         * of concurrency management and the next code block will chain
         * execution of the pending work items.
         */
/*
 * Set the WORKER_CPU_INTENSIVE flag on the current worker. nr_running is decremented
 * by 1, so from the worker_pool's point of view this worker has effectively entered
 * the suspended state.
 */

if (unlikely(cpu_intensive))
                worker_set_flags(worker, WORKER_CPU_INTENSIVE);

        /* Following the step above, check whether a new worker must be woken up to handle pending work */
        /*
         * Wake up another worker if necessary.  The condition is always
         * false for normal per-cpu workers since nr_running would always
         * be >= 1 at this point.  This is used to chain execution of the
         * pending work items for WORKER_NOT_RUNNING workers such as the
         * UNBOUND and CPU_INTENSIVE ones.
         */
        if (need_more_worker(pool))
                wake_up_worker(pool);

 

...

        lockdep_invariant_state(true);
        trace_workqueue_execute_start(work);
        worker->current_func(work); // execute the work item
...

        /*
         * After the work has run, clear the current worker's WORKER_CPU_INTENSIVE
         * flag so that it re-enters the running state.
         */
        if (unlikely(cpu_intensive))
                worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
...
}

worker_set_flags() and worker_clr_flags() change the worker's state through its flags; note how they call atomic_dec(&pool->nr_running) and atomic_inc(&pool->nr_running) below.

/**
 * worker_set_flags - set worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to set
 *
 * Set @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;

        WARN_ON_ONCE(worker->task != current);

        /* If transitioning into NOT_RUNNING, adjust nr_running. */
        if ((flags & WORKER_NOT_RUNNING) &&
            !(worker->flags & WORKER_NOT_RUNNING)) {
                atomic_dec(&pool->nr_running);
        }

        worker->flags |= flags;
}

/**
 * worker_clr_flags - clear worker flags and adjust nr_running accordingly
 * @worker: self
 * @flags: flags to clear
 *
 * Clear @flags in @worker->flags and adjust nr_running accordingly.
 *
 * CONTEXT:
 * raw_spin_lock_irq(pool->lock)
 */
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
        struct worker_pool *pool = worker->pool;
        unsigned int oflags = worker->flags;

        WARN_ON_ONCE(worker->task != current);

        worker->flags &= ~flags;

        /*
         * If transitioning out of NOT_RUNNING, increment nr_running.  Note
         * that the nested NOT_RUNNING is not a noop.  NOT_RUNNING is mask
         * of multiple flags, not a single flag.
         */
        if ((flags & WORKER_NOT_RUNNING) && (oflags & WORKER_NOT_RUNNING))
                if (!(worker->flags & WORKER_NOT_RUNNING))
                        atomic_inc(&pool->nr_running);
}
 

4、CPU hotplug handling

As the previous sections show, the system creates per-CPU (normal) worker_pools bound to CPUs as well as unbound worker_pools not tied to any CPU, and each worker_pool dynamically creates workers.

So how are worker_pools and workers handled dynamically during CPU hotplug? Let's look at the code in kernel/workqueue.c:

workqueue_online_cpu()/workqueue_offline_cpu()

Let's start from the CPU hotplug state table (the boot processor state steps); look at the following definition:

//  kernel/cpu.c

/* Boot processor state steps */
static struct cpuhp_step cpuhp_hp_states[] = {
        [CPUHP_OFFLINE] = {
                .name                   = "offline",
                .startup.single         = NULL,
                .teardown.single        = NULL,
        },
#ifdef CONFIG_SMP
        [CPUHP_CREATE_THREADS]= {
                .name                   = "threads:prepare",
                .startup.single         = smpboot_create_threads,
                .teardown.single        = NULL,
                .cant_stop              = true,
        },
        [CPUHP_PERF_PREPARE] = {
                .name                   = "perf:prepare",
                .startup.single         = perf_event_init_cpu,
                .teardown.single        = perf_event_exit_cpu,
        },
        [CPUHP_RANDOM_PREPARE] = {
                .name                   = "random:prepare",
                .startup.single         = random_prepare_cpu,
                .teardown.single        = NULL,
        },
        [CPUHP_WORKQUEUE_PREP] = {
                .name                   = "workqueue:prepare",
                .startup.single         = workqueue_prepare_cpu,
                .teardown.single        = NULL,

        },
....

        [CPUHP_AP_WORKQUEUE_ONLINE] = {
                .name                   = "workqueue:online",
                .startup.single         = workqueue_online_cpu,
                .teardown.single        = workqueue_offline_cpu,

        },
...

};

The table above defines the callbacks that are invoked while a CPU is being brought up or torn down; each hotplug state distinguishes them via its startup.single and teardown.single entries.
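The same mechanism is open to any subsystem or driver. As a minimal sketch, assuming hypothetical callback names (cpuhp_setup_state() and CPUHP_AP_ONLINE_DYN themselves are real kernel APIs), a module can register its own online/offline callbacks on a dynamically allocated hotplug state:

#include <linux/module.h>
#include <linux/cpuhotplug.h>

static int my_cpu_online(unsigned int cpu)
{
        pr_info("cpu%u is coming online\n", cpu);
        return 0;
}

static int my_cpu_offline(unsigned int cpu)
{
        pr_info("cpu%u is going offline\n", cpu);
        return 0;
}

static int __init my_hotplug_init(void)
{
        /* Returns the allocated state number (>= 0) on success, a negative errno on failure */
        return cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mydrv:online",
                                 my_cpu_online, my_cpu_offline);
}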

Now let's look at workqueue_online_cpu() and workqueue_offline_cpu():

int workqueue_online_cpu(unsigned int cpu)
{
        struct worker_pool *pool;
        struct workqueue_struct *wq;
        int pi;

        mutex_lock(&wq_pool_mutex);

        for_each_pool(pool, pi) {
                mutex_lock(&wq_pool_attach_mutex);

                if (pool->cpu == cpu)
                        /* this pool belongs to the CPU coming online: rebind its workers to the pool and wake them up */
                        rebind_workers(pool);
                else if (pool->cpu < 0)
                        restore_unbound_workers_cpumask(pool, cpu);

                mutex_unlock(&wq_pool_attach_mutex);
        }

        /* update NUMA affinity of unbound workqueues */
        list_for_each_entry(wq, &workqueues, list)
                wq_update_unbound_numa(wq, cpu, true);

        mutex_unlock(&wq_pool_mutex);
        return 0;
}

int workqueue_offline_cpu(unsigned int cpu)
{
        struct workqueue_struct *wq;

        /* unbinding per-cpu workers should happen on the local CPU */
        if (WARN_ON(cpu != smp_processor_id()))
                return -1;

        /*
         * unbind_workers() marks the workers of this CPU's pools WORKER_UNBOUND and
         * resets their CPU affinity so they can be scheduled onto other CPUs.
         */
        unbind_workers(cpu);

        /* update NUMA affinity of unbound workqueues */
        mutex_lock(&wq_pool_mutex);
        list_for_each_entry(wq, &workqueues, list)
                wq_update_unbound_numa(wq, cpu, false);
        mutex_unlock(&wq_pool_mutex);

        return 0;
}
 

I have summarized above what rebind_workers() and unbind_workers() do; their implementations are pasted below without a line-by-line analysis.

rebind_workers() and unbind_workers()

/*
 * CPU hotplug.
 *
 * There are two challenges in supporting CPU hotplug.  Firstly, there
 * are a lot of assumptions on strong associations among work, pwq and
 * pool which make migrating pending and scheduled works very
 * difficult to implement without impacting hot paths.  Secondly,
 * worker pools serve mix of short, long and very long running works making
 * blocked draining impractical.
 *
 * This is solved by allowing the pools to be disassociated from the CPU
 * running as an unbound one and allowing it to be reattached later if the
 * cpu comes back online.
 */

static void unbind_workers(int cpu)
{
        struct worker_pool *pool;
        struct worker *worker;

        for_each_cpu_worker_pool(pool, cpu) {
                mutex_lock(&wq_pool_attach_mutex);
                raw_spin_lock_irq(&pool->lock);

                /*
                 * We've blocked all attach/detach operations. Make all workers
                 * unbound and set DISASSOCIATED.  Before this, all workers
                 * except for the ones which are still executing works from
                 * before the last CPU down must be on the cpu.  After
                 * this, they may become diasporas.
                 */
                for_each_pool_worker(worker, pool)
                        worker->flags |= WORKER_UNBOUND;

                pool->flags |= POOL_DISASSOCIATED;

                raw_spin_unlock_irq(&pool->lock);

                for_each_pool_worker(worker, pool)
                        WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);

                mutex_unlock(&wq_pool_attach_mutex);

                /*
                 * Call schedule() so that we cross rq->lock and thus can
                 * guarantee sched callbacks see the %WORKER_UNBOUND flag.
                 * This is necessary as scheduler callbacks may be invoked
                 * from other cpus.
                 */
                schedule();

                /*
                 * Sched callbacks are disabled now.  Zap nr_running.
                 * After this, nr_running stays zero and need_more_worker()
                 * and keep_working() are always true as long as the
                 * worklist is not empty.  This pool now behaves as an
                 * unbound (in terms of concurrency management) pool which
                 * are served by workers tied to the pool.
                 */
                atomic_set(&pool->nr_running, 0);

                /*
                 * With concurrency management just turned off, a busy
                 * worker blocking could lead to lengthy stalls.  Kick off
                 * unbound chain execution of currently pending work items.
                 */
                raw_spin_lock_irq(&pool->lock);
                wake_up_worker(pool);
                raw_spin_unlock_irq(&pool->lock);
        }
}

/**
 * rebind_workers - rebind all workers of a pool to the associated CPU
 * @pool: pool of interest
 *
 * @pool->cpu is coming online.  Rebind all workers to the CPU.
 */

static void rebind_workers(struct worker_pool *pool)
{
        struct worker *worker;

        lockdep_assert_held(&wq_pool_attach_mutex);

        /*
         * Restore CPU affinity of all workers.  As all idle workers should
         * be on the run-queue of the associated CPU before any local
         * wake-ups for concurrency management happen, restore CPU affinity
         * of all workers first and then clear UNBOUND.  As we're called
         * from CPU_ONLINE, the following shouldn't fail.
         */
        for_each_pool_worker(worker, pool)
                WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
                                                  pool->attrs->cpumask) < 0);

        raw_spin_lock_irq(&pool->lock);

        pool->flags &= ~POOL_DISASSOCIATED;

        for_each_pool_worker(worker, pool) {
                unsigned int worker_flags = worker->flags;

                /*
                 * A bound idle worker should actually be on the runqueue
                 * of the associated CPU for local wake-ups targeting it to
                 * work.  Kick all idle workers so that they migrate to the
                 * associated CPU.  Doing this in the same loop as
                 * replacing UNBOUND with REBOUND is safe as no worker will
                 * be bound before @pool->lock is released.
                 */
                if (worker_flags & WORKER_IDLE)
                        wake_up_process(worker->task);

                /*
                 * We want to clear UNBOUND but can't directly call
                 * worker_clr_flags() or adjust nr_running.  Atomically
                 * replace UNBOUND with another NOT_RUNNING flag REBOUND.
                 * @worker will clear REBOUND using worker_clr_flags() when
                 * it initiates the next execution cycle thus restoring
                 * concurrency management.  Note that when or whether
                 * @worker clears REBOUND doesn't affect correctness.
                 *
                 * WRITE_ONCE() is necessary because @worker->flags may be
                 * tested without holding any lock in
                 * wq_worker_running().  Without it, NOT_RUNNING test may
                 * fail incorrectly leading to premature concurrency
                 * management operations.
                 */
                WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
                worker_flags |= WORKER_REBOUND;
                worker_flags &= ~WORKER_UNBOUND;
                WRITE_ONCE(worker->flags, worker_flags);
        }

        raw_spin_unlock_irq(&pool->lock);
}
 

 5、 flush_workqueue()

The logic in this part, with wq->work_color and wq->flush_color being swapped back and forth, is genuinely head-spinning. I have not fully worked through it yet, so I will leave it for a later read; if you have figured it out, feel free to explain it to me.
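Even without following the colour bookkeeping, the guarantee and the usage are simple: flush_workqueue() sleeps until every work item that was queued on that workqueue before the call has finished executing. A minimal sketch, with hypothetical names:

        /* make sure everything queued on my_wq so far has completed ... */
        queue_work(my_wq, &my_work);
        flush_workqueue(my_wq);
        /* ... now it is safe to, e.g., free the resources those work items used */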

 6、queue_work()

//include/linux/workqueue.h

static inline bool queue_work(struct workqueue_struct *wq,
                              struct work_struct *work)
{
        return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
 

queue_work() pushes a work item onto a workqueue.

kernel/workqueue.c:

queue_work() is just a thin wrapper around queue_work_on().

queue_work_node() is rarely used; it is only called from kernel/async.c.
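Before diving into the implementation, a minimal usage sketch (the names below are hypothetical): queue_work() lets the per-cpu pool of the current CPU handle the work, while queue_work_on() pins it to a specific CPU:

#include <linux/workqueue.h>

static void stats_fn(struct work_struct *work)
{
        pr_info("collecting stats in process context\n");
}
static DECLARE_WORK(stats_work, stats_fn);

static void kick_stats(bool pin_to_cpu2)
{
        if (pin_to_cpu2)
                /* force the work onto CPU 2's per-cpu worker_pool */
                queue_work_on(2, system_wq, &stats_work);
        else
                /* let the workqueue pick the CPU (normally the current one) */
                queue_work(system_wq, &stats_work);
}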

/**
 * queue_work_on - queue work on specific cpu
 * @cpu: CPU number to execute work on
 * @wq: workqueue to use
 * @work: work to queue
 *
 * We queue the work to a specific CPU, the caller must ensure it
 * can't go away.
 *
 * Return: %false if @work was already on a queue, %true otherwise.
 */
bool queue_work_on(int cpu, struct workqueue_struct *wq,
                   struct work_struct *work)
{
        bool ret = false;
        unsigned long flags;

        local_irq_save(flags);

        if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
                __queue_work(cpu, wq, work);
                ret = true;
        }

        local_irq_restore(flags);
        return ret;
}
EXPORT_SYMBOL(queue_work_on);
 

Next, look at __queue_work():

static void __queue_work(int cpu, struct workqueue_struct *wq,
                         struct work_struct *work)
{
        struct pool_workqueue *pwq;
        struct worker_pool *last_pool;
        struct list_head *worklist;
        unsigned int work_flags;
        unsigned int req_cpu = cpu;

        /*
         * While a work item is PENDING && off queue, a task trying to
         * steal the PENDING will busy-loop waiting for it to either get
         * queued or lose PENDING.  Grabbing PENDING and queueing should
         * happen with IRQ disabled.
         */
        lockdep_assert_irqs_disabled();


        /* if draining, only works from the same workqueue are allowed */
        if (unlikely(wq->flags & __WQ_DRAINING) &&
            WARN_ON_ONCE(!is_chained_work(wq)))
                return;
        rcu_read_lock();
retry:

        /* If no CPU was specified, the current CPU (raw_smp_processor_id()) is used */
        /* pwq which will be used unless @work is executing elsewhere */

        /* For an unbound wq, use the worker_pool of the node the selected CPU belongs to */
        if (wq->flags & WQ_UNBOUND) {
                if (req_cpu == WORK_CPU_UNBOUND)
                        cpu = wq_select_unbound_cpu(raw_smp_processor_id());
                pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
        } else {

                /* For a normal (per-cpu) wq, use the normal worker_pool of the selected CPU */
                if (req_cpu == WORK_CPU_UNBOUND)
                        cpu = raw_smp_processor_id();
                pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
        }

        /*
         * If @work was previously on a different pool, it might still be
         * running there, in which case the work needs to be queued on that
         * pool to guarantee non-reentrancy.
         */

        /*
         * If the work is still being executed by a worker in another pool, queue it
         * onto that pool instead, to avoid re-entrancy of the same work item.
         */
        last_pool = get_work_pool(work);
        if (last_pool && last_pool != pwq->pool) {
                struct worker *worker;

                raw_spin_lock(&last_pool->lock);

                worker = find_worker_executing_work(last_pool, work);

                if (worker && worker->current_pwq->wq == wq) {
                        pwq = worker->current_pwq;
                } else {
                        /* meh... not running there, queue here */
                        raw_spin_unlock(&last_pool->lock);
                        raw_spin_lock(&pwq->pool->lock);
                }
        } else {
                raw_spin_lock(&pwq->pool->lock);
        }

        /*
         * pwq is determined and locked.  For unbound pools, we could have
         * raced with pwq release and it could already be dead.  If its
         * refcnt is zero, repeat pwq selection.  Note that pwqs never die
         * without another pwq replacing it in the numa_pwq_tbl or while
         * work items are executing on it, so the retrying is guaranteed to
         * make forward-progress.
         */
        if (unlikely(!pwq->refcnt)) {
                if (wq->flags & WQ_UNBOUND) {
                        raw_spin_unlock(&pwq->pool->lock);
                        cpu_relax();
                        goto retry;
                }
                /* oops */
                WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
                          wq->name, cpu);
        }

        /* pwq determined, queue */
        trace_workqueue_queue_work(req_cpu, pwq, work);

        if (WARN_ON(!list_empty(&work->entry)))
                goto out;

        pwq->nr_in_flight[pwq->work_color]++;
        work_flags = work_color_to_flags(pwq->work_color);

        /* If nr_active has not reached max_active yet, put the work on pool->worklist */

        if (likely(pwq->nr_active < pwq->max_active)) {
                trace_workqueue_activate_work(work);
                pwq->nr_active++;
                worklist = &pwq->pool->worklist;
                if (list_empty(worklist))
                        pwq->pool->watchdog_ts = jiffies;
        } else {

                /* Otherwise, park the work on the holding queue pwq->delayed_works */
                work_flags |= WORK_STRUCT_DELAYED;
                worklist = &pwq->delayed_works;
        }

        debug_work_activate(work);

        /* Insert the work into the chosen worklist */
        insert_work(pwq, work, worklist, work_flags);

out:
        raw_spin_unlock(&pwq->pool->lock);
        rcu_read_unlock();
}
 

7、 flush_work()

flush_work() flushes a given work item and makes sure it has finished executing.
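Typical usage, with hypothetical names: queue a work item, then call flush_work() on it before tearing down whatever the work touches:

        queue_work(my_wq, &my_work);
        /* ... */
        /* blocks until my_work has finished, whether it was still queued or already running */
        flush_work(&my_work);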

How do we know that an asynchronous work item has finished? A trick is used here: a new work item, a wq_barrier, is inserted right behind the target work; once the wq_barrier has executed, the target work must have completed. kernel/workqueue.c:

bool flush_work(struct work_struct *work)
{
        return __flush_work(work, false);
}
EXPORT_SYMBOL_GPL(flush_work);
 


static bool __flush_work(struct work_struct *work, bool from_cancel)
{
        struct wq_barrier barr;

        if (WARN_ON(!wq_online))
                return false;

        if (WARN_ON(!work->func))
                return false;

        if (!from_cancel) {
                lock_map_acquire(&work->lockdep_map);
                lock_map_release(&work->lockdep_map);
        }

        if (start_flush_work(work, &barr, from_cancel)) {

                /* wait for the completion signal from the barrier work */
                wait_for_completion(&barr.done);
                destroy_work_on_stack(&barr.work);
                return true;
        } else {
                return false;
        }
}


Next, look at start_flush_work():

static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
                             bool from_cancel)
{
        struct worker *worker = NULL;
        struct worker_pool *pool;
        struct pool_workqueue *pwq;

        might_sleep();

        rcu_read_lock();
        pool = get_work_pool(work);

        /* If the work's worker_pool is NULL, the work has already finished */
        if (!pool) {
                rcu_read_unlock();
                return false;
        }

        raw_spin_lock_irq(&pool->lock);
        /* see the comment in try_to_grab_pending() with the same code */
        pwq = get_work_pwq(work);
        if (pwq) {

                /*
                 * If the worker_pool that the work's pwq points to differs from the
                 * pool obtained above, the work has already finished.
                 */
                if (unlikely(pwq->pool != pool))
                        goto already_gone;
        } else {

                /* The work has no pwq; if no worker is currently executing it either, it has already finished */
                worker = find_worker_executing_work(pool, work);
                if (!worker)
                        goto already_gone;
                pwq = worker->current_pwq;
        }

        check_flush_dependency(pwq->wq, work);

        /* The work has not finished yet: insert the barrier work right after it */
        insert_wq_barrier(pwq, barr, work, worker);
        raw_spin_unlock_irq(&pool->lock);

        /*
         * Force a lock recursion deadlock when using flush_work() inside a
         * single-threaded or rescuer equipped workqueue.
         *
         * For single threaded workqueues the deadlock happens when the work
         * is after the work issuing the flush_work(). For rescuer equipped
         * workqueues the deadlock happens when the rescuer stalls, blocking
         * forward progress.
         */
        if (!from_cancel &&
            (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)) {
                lock_map_acquire(&pwq->wq->lockdep_map);
                lock_map_release(&pwq->wq->lockdep_map);
        }
        rcu_read_unlock();
        return true;
already_gone:
        raw_spin_unlock_irq(&pool->lock);
        rcu_read_unlock();
        return false;
}
 

 insert_wq_barrier

static void insert_wq_barrier(struct pool_workqueue *pwq,
                              struct wq_barrier *barr,
                              struct work_struct *target, struct worker *worker)
{
        struct list_head *head;
        unsigned int linked = 0;

        /*
         * debugobject calls are safe here even with pool->lock locked
         * as we know for sure that this will not trigger any of the
         * checks and call back into the fixup functions where we
         * might deadlock.
         */

        /* The barrier work's callback is wq_barrier_func() */
        INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
        __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));

        init_completion_map(&barr->done, &target->lockdep_map);

        barr->task = current;

        /*
         * If @target is currently being executed, schedule the
         * barrier to the worker; otherwise, put it after @target.
         */

        /* If the target work is currently being executed by a worker, put the barrier on that worker's scheduled list */
        if (worker)
                head = worker->scheduled.next;
        else {

                /*
                 * Otherwise, insert the barrier into the normal worklist right after
                 * the target work, and set WORK_STRUCT_LINKED on the target.
                 */
                unsigned long *bits = work_data_bits(target);

                head = target->entry.next;
                /* there can already be other linked works, inherit and set */
                linked = *bits & WORK_STRUCT_LINKED;
                __set_bit(WORK_STRUCT_LINKED_BIT, bits);
        }

        debug_work_activate(&barr->work);
        insert_work(pwq, &barr->work, head,
                    work_color_to_flags(WORK_NO_COLOR) | linked);
}
 

 wq_barrier_func()

 static void wq_barrier_func(struct work_struct *work)
{
        struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
        complete(&barr->done); // signal completion
}

9、Workqueue external API functions

9.1 schedule_work()

schedule_work() pushes the work onto the default system workqueue system_wq; WORK_CPU_UNBOUND means the work will be handled by a worker created from the normal worker_pool bound to the current CPU.

kernel/workqueue.c:

static inline bool schedule_work(struct work_struct *work)
{
        return queue_work(system_wq, work);
}
 

 9.2 schedule_work_on()

On top of schedule_work(), this variant lets you specify the CPU the work should run on:

static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
        return queue_work_on(cpu, system_wq, work);
}
 

9.3 schedule_delayed_work

It starts a timer; when the timer expires, delayed_work_timer_fn() pushes the work onto the default system workqueue system_wq. kernel/workqueue.c:

schedule_delayed_work->queue_delayed_work->queue_delayed_work_on
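A minimal usage sketch of the delayed variant (the names here are hypothetical): a self-re-arming poll that runs roughly once a second and is cancelled on module exit:

#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void poll_fn(struct work_struct *work);
static DECLARE_DELAYED_WORK(poll_work, poll_fn);

static void poll_fn(struct work_struct *work)
{
        pr_info("periodic poll\n");
        /* re-arm: run again in about one second */
        schedule_delayed_work(&poll_work, msecs_to_jiffies(1000));
}

static int __init poll_init(void)
{
        schedule_delayed_work(&poll_work, msecs_to_jiffies(1000));
        return 0;
}

static void __exit poll_exit(void)
{
        /* cancel and wait, so poll_fn() cannot re-arm itself after unload */
        cancel_delayed_work_sync(&poll_work);
}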

10、A demo driver that uses a workqueue as the interrupt bottom half

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_workqueue;
static struct work_struct my_work;
static int my_irq = 1; /* replace with your actual interrupt number */

static void my_work_function(struct work_struct *work) {
    printk(KERN_INFO "Workqueue function is executed in the bottom half\n");
}

static irqreturn_t my_interrupt_handler(int irq, void *dev_id) {
    // Schedule the work to be executed in the bottom half
    queue_work(my_workqueue, &my_work); //中断下半部

    return IRQ_HANDLED;
}

static int __init my_init(void) {
    printk(KERN_INFO "Initializing workqueue example for interrupt bottom half\n");

    // Create a workqueue
    my_workqueue = alloc_workqueue("my_workqueue", 0, 0); /* name, flags, max_active (0 = default) */
    if (!my_workqueue) {
        printk(KERN_ERR "Failed to create workqueue\n");
        return -ENOMEM;
    }

    // Initialize work
    INIT_WORK(&my_work, my_work_function);

    // Register the interrupt handler (replace my_irq with your actual interrupt setup).
    // A shared IRQ requires a non-NULL dev_id, so pass a module-local pointer here.
    if (request_irq(my_irq, my_interrupt_handler, IRQF_SHARED, "my_interrupt",
                    &my_workqueue)) {
        printk(KERN_ERR "Failed to register interrupt handler\n");
        destroy_workqueue(my_workqueue);
        return -EINVAL;
    }

    return 0;
}

static void __exit my_exit(void) {
    // Release the IRQ first so the handler can no longer queue new work
    free_irq(my_irq, &my_workqueue);

    // Wait for any pending work, then destroy the workqueue
    flush_workqueue(my_workqueue);
    destroy_workqueue(my_workqueue);

    printk(KERN_INFO "Exiting workqueue example for interrupt bottom half\n");
}

module_init(my_init);
module_exit(my_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Your Name");
MODULE_DESCRIPTION("A simple example of using workqueue in the bottom half of an interrupt handler");
 

 
