Linux workqueue

Workqueue is an important mechanism in the kernel. Kernel drivers in particular usually do not spawn a dedicated thread for small tasks (work items); instead they throw them onto a workqueue. The main job of the workqueue mechanism is to execute the kernel's many small tasks in process context.

The workqueue design therefore has two main goals: concurrency, so that multiple work items do not block each other; and resource economy, so that multiple work items share resources (processes, scheduling, memory) as much as possible and do not waste system resources.

To realize these goals, the workqueue implementation has gone through many revisions. The latest implementation is called CMWQ (Concurrency Managed Workqueue), which uses smarter algorithms to achieve both concurrency and economy. In the new implementation the creation function is alloc_workqueue(); the old create_workqueue() is gradually being deprecated.
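
For orientation, alloc_workqueue() takes a printf-style name, a set of flags, and a max_active limit. A minimal usage sketch (the flag combination here is just an illustration, not from the original text):

struct workqueue_struct *wq;

/* name, flags, max_active (0 selects the default, WQ_DFL_ACTIVE) */
wq = alloc_workqueue("my_wq", WQ_UNBOUND | WQ_HIGHPRI, 0);
if (!wq)
	return -ENOMEM;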

Before diving into the internals, let's first look at how a workqueue is used, to get a rough picture of it.

1 Using a workqueue

Here is a simple example:

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/workqueue.h>

struct workqueue_struct *workqueue_test;

struct work_struct work_test;

void work_test_func(struct work_struct *work)
{
    printk("%s()\n", __func__);

    //mdelay(1000);
    //queue_work(workqueue_test, &work_test);
}


static int test_init(void)
{
    printk("Hello,world!\n");

    /* 1. Create our own workqueue; flags = 0 means the default configuration */
    workqueue_test = alloc_workqueue("workqueue_test", 0, 0);

    /* 2. Initialize a work item and attach our handler function */
    INIT_WORK(&work_test, work_test_func);

    /* 3. Queue the work item on our workqueue; this also wakes a worker thread to handle it */
    queue_work(workqueue_test, &work_test);
    
    return 0;
}

static void test_exit(void)
{
    printk("Goodbye,cruel world!\n");
    destroy_workqueue(workqueue_test);
}

module_init(test_init);
module_exit(test_exit);

MODULE_AUTHOR("Vedic <FZKmxcz@163.com>");
MODULE_LICENSE("Dual BSD/GPL");

Of course, the kernel already creates several workqueues for us, and we can simply hang our work items on one of them.
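
For reference, these system workqueues are declared in include/linux/workqueue.h (they are created by init_workqueues(), shown in section 3):

extern struct workqueue_struct *system_wq;           /* the default "events" wq */
extern struct workqueue_struct *system_highpri_wq;   /* "events_highpri" */
extern struct workqueue_struct *system_long_wq;      /* "events_long" */
extern struct workqueue_struct *system_unbound_wq;   /* "events_unbound" */
extern struct workqueue_struct *system_freezable_wq; /* "events_freezable" */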

So the code can be rewritten as:

static int test_init(void)
{
    printk("Hello,world!\n");

    /* 2. Initialize a work item and attach our handler function */
    INIT_WORK(&work_test, work_test_func);

    /* 3. Queue the work item on the kernel's system_wq; this also wakes a worker thread */
    queue_work(system_wq, &work_test);
    
    return 0;
}

If the target workqueue is system_wq, you can use the wrapper schedule_work(&work_test):
static inline bool schedule_work(struct work_struct *work)
{
	  return queue_work(system_wq, work);
}

2 Basic workqueue concepts

The work-related data structures in the workqueue mechanism are easy to confuse; roughly, they can be understood as follows (a simplified sketch of the key structures appears right after this list):

  • work: a piece of work, a task to execute.
  • workqueue: a collection of work items. A workqueue has a one-to-many relationship with work.
  • worker: a worker. In the code, each worker corresponds to a worker_thread() kernel thread.
  • worker_pool: a collection of workers. A worker_pool has a one-to-many relationship with worker.
  • pwq (pool_workqueue): the middleman that establishes the link between a workqueue and a worker_pool. A workqueue has a one-to-many relationship with pwq, and each pwq maps one-to-one to a worker_pool.
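
A heavily trimmed sketch of the two structures that matter most here, work_struct and worker_pool (field lists shortened; based on include/linux/workqueue.h and kernel/workqueue.c of this kernel generation):

struct work_struct {
	atomic_long_t data;         /* flags plus a pointer back to the pwq/pool */
	struct list_head entry;     /* node on pool->worklist or a delayed list */
	work_func_t func;           /* the function a worker will call */
};

struct worker_pool {
	spinlock_t lock;            /* protects the pool */
	struct list_head worklist;  /* pending work items */
	struct list_head idle_list; /* idle workers, LIFO */
	/* ... busy_hash, idle_timer, mayday_timer, attrs, ... */
};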

    

[Figure: normal wq topology]

2.1 worker_pool

Each thread that executes work is called a worker, and a group of workers is a worker_pool. The essence of CMWQ lies in how the number of workers in a worker_pool is dynamically grown and shrunk; see manage_workers().

CMWQ divides worker_pools into two classes:

  • normal worker_pool, used by ordinary workqueues;
  • unbound worker_pool, used by WQ_UNBOUND workqueues.

2.1.1 normal worker_pool

By default, work is processed in a normal worker_pool. The system creates two normal worker_pools per CPU: one with normal priority (nice = 0) and one with high priority (nice = HIGHPRI_NICE_LEVEL); the workers created from them run with the corresponding process nice values.
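
In kernel/workqueue.c this is literally a per-CPU array of NR_STD_WORKER_POOLS (= 2) worker_pools; the std_nice[] array in init_workqueues() below supplies the two nice values:

static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
				     cpu_worker_pools);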

Each worker corresponds to a worker_thread() kernel thread. A worker_pool contains one or more workers, and the number of workers in a worker_pool grows and shrinks dynamically with the pool's work load.

                               

[Figure: normal worker_pool]

The corresponding topology diagram:

     

[Figure: normal worker_pool topology]

2.1.2 unbound worker_pool

Most work items are executed through a normal worker_pool (for example, work queued onto the system workqueue (system_wq) via schedule_work() or schedule_work_on()); in the end they are all executed by workers in a normal worker_pool. These workers are bound to a specific CPU: once a worker starts executing a work item, it keeps running on that CPU and does not migrate.

An unbound worker_pool, by contrast, means its workers can be scheduled across multiple CPUs. They are actually still bound, but the unit of binding is not a CPU but a node. A node is a concept from NUMA (Non-Uniform Memory Access) systems: a NUMA system may have multiple nodes, and each node may contain one or more CPUs.

Unbound worker_pools also come in two kinds:

(1) unbound_std_wq: one worker_pool per node, so multiple nodes mean multiple worker_pools.

                    

[Figure: unbound worker_pool: unbound_std_wq]

The corresponding topology diagram:

                          

  

[Figure: unbound_std_wq topology]

(2) ordered_wq: all nodes share a single default worker_pool.

                           

[Figure: unbound worker_pool: ordered_wq]

The corresponding topology diagram:

     

[Figure: ordered_wq topology]

3 Initialization

During Linux boot, the worker_pools and workqueues are initialized up front.

kernel_init
    -----------> kernel_init_freeable
        -------------> do_pre_smp_initcalls

In do_pre_smp_initcalls(), the functions registered via the following macro are invoked:

#define early_initcall(fn)        __define_initcall(fn, early)

For the workqueue subsystem, the registration looks like this:

early_initcall(init_workqueues);
static int __init init_workqueues(void)
{
	int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
	int i, cpu;
	/* make sure we have enough bits for OFFQ pool ID */
	BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
		     WORK_CPU_END * NR_STD_WORKER_POOLS);

	WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));

	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

	cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
	hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);

	wq_numa_init();
    // (1) create the per-CPU worker_pools
	/* initialize CPU pools */
	for_each_possible_cpu(cpu) {
		struct worker_pool *pool;

		i = 0;
		for_each_cpu_worker_pool(pool, cpu) {
			BUG_ON(init_worker_pool(pool));
			pool->cpu = cpu; // bind the pool to this CPU
			cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
			// set the nice value used by this pool's workers
			pool->attrs->nice = std_nice[i++];
			pool->node = cpu_to_node(cpu);

			/* alloc pool ID */
			mutex_lock(&wq_pool_mutex);
			BUG_ON(worker_pool_assign_id(pool));
			mutex_unlock(&wq_pool_mutex);
		}
	}
    // (2) create the first worker for each worker_pool
	/* create the initial worker */
	for_each_online_cpu(cpu) {
		struct worker_pool *pool;

		for_each_cpu_worker_pool(pool, cpu) {
			pool->flags &= ~POOL_DISASSOCIATED;
			BUG_ON(create_and_start_worker(pool) < 0);
		}
	}
    // (3) initialize the unbound attrs for the normal and high-priority nice levels
	/* create default unbound wq attrs */
	for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
		struct workqueue_attrs *attrs;

		BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
		attrs->nice = std_nice[i];
		unbound_std_wq_attrs[i] = attrs;
	}
    // (4) create the batch of default system workqueues
	system_wq = alloc_workqueue("events", 0, 0);
	system_highpri_wq = alloc_workqueue("events_highpri", WQ_HIGHPRI, 0);
	system_long_wq = alloc_workqueue("events_long", 0, 0);
	system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
					    WQ_UNBOUND_MAX_ACTIVE);
	system_freezable_wq = alloc_workqueue("events_freezable",
					      WQ_FREEZABLE, 0);
	BUG_ON(!system_wq || !system_highpri_wq || !system_long_wq ||
	       !system_unbound_wq || !system_freezable_wq);
	return 0;
}

3.1 worker_pool initialization

First, let's see how a worker_pool is initialized:

init_workqueues
    ----------> init_worker_pool

static int init_worker_pool(struct worker_pool *pool)
{
	spin_lock_init(&pool->lock);
	pool->id = -1;
	pool->cpu = -1;
	pool->node = NUMA_NO_NODE;
	pool->flags |= POOL_DISASSOCIATED;
	// (1.1) the worker_pool's work list: workqueues hang their work items here,
	// and the pool's workers execute them
	INIT_LIST_HEAD(&pool->worklist);
	// (1.2) the worker_pool's idle worker list:
	// a worker with nothing to do is not destroyed immediately; it first goes idle as a standby
	INIT_LIST_HEAD(&pool->idle_list);
	// (1.3) the worker_pool's busy worker hash:
	// workers that are currently executing work
	hash_init(pool->busy_hash);

	// (1.4) timer that checks whether idle workers should be destroyed
	init_timer_deferrable(&pool->idle_timer);
	pool->idle_timer.function = idle_worker_timeout;
	pool->idle_timer.data = (unsigned long)pool;
	// (1.5) timer that checks for timeout while the worker_pool is creating a new worker
	setup_timer(&pool->mayday_timer, pool_mayday_timeout,
		    (unsigned long)pool);

	mutex_init(&pool->manager_arb);
	mutex_init(&pool->manager_mutex);
	idr_init(&pool->worker_idr);

	INIT_HLIST_NODE(&pool->hash_node);
	pool->refcnt = 1;

	/* shouldn't fail above this point */
	pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
	if (!pool->attrs)
		return -ENOMEM;
	return 0;
}

Creating workers for the worker_pool:

init_workqueues
    --------------> create_and_start_worker

static int create_and_start_worker(struct worker_pool *pool)
{
	struct worker *worker;

	mutex_lock(&pool->manager_mutex);

	worker = create_worker(pool); // create the worker
	if (worker) {
		spin_lock_irq(&pool->lock);
		start_worker(worker); // start the worker
		spin_unlock_irq(&pool->lock);
	}

	mutex_unlock(&pool->manager_mutex);

	return worker ? 0 : -ENOMEM;
}

init_workqueues
    --------------> create_and_start_worker
        ------------------> create_worker

static struct worker *create_worker(struct worker_pool *pool)
{
	struct worker *worker = NULL;
	int id = -1;
	char id_buf[16];

	lockdep_assert_held(&pool->manager_mutex);

	/*
	 * ID is needed to determine kthread name.  Allocate ID first
	 * without installing the pointer.
	 */
	idr_preload(GFP_KERNEL);
	spin_lock_irq(&pool->lock);

	id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);

	spin_unlock_irq(&pool->lock);
	idr_preload_end();
	if (id < 0)
		goto fail;

	worker = alloc_worker();
	if (!worker)
		goto fail;

	worker->pool = pool;
	worker->id = id;

	if (pool->cpu >= 0)		// (2.1) build the thread name for a normal worker_pool's worker
		snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
			 pool->attrs->nice < 0  ? "H" : "");
	else		// (2.2) build the thread name for an unbound worker_pool's worker
		snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
	// (2.3) create the worker's kernel thread; it runs worker_thread(), and the task descriptor is returned
	worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
					      "kworker/%s", id_buf);
	if (IS_ERR(worker->task))
		goto fail;

	/*
	 * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
	 * online CPUs.  It'll be re-applied when any of the CPUs come up.
	 */
        // (2.4) set the kernel thread's nice value
	set_user_nice(worker->task, pool->attrs->nice);
	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);

	/* prevent userland from meddling with cpumask of workqueue workers */
	worker->task->flags |= PF_NO_SETAFFINITY;

	/*
	 * The caller is responsible for ensuring %POOL_DISASSOCIATED
	 * remains stable across this function.  See the comments above the
	 * flag definition for details.
	 */
	if (pool->flags & POOL_DISASSOCIATED)
		worker->flags |= WORKER_UNBOUND;

	/* successful, commit the pointer to idr */
	spin_lock_irq(&pool->lock);
	idr_replace(&pool->worker_idr, worker, worker->id);
	spin_unlock_irq(&pool->lock);

	return worker;

fail:
	if (id >= 0) {
		spin_lock_irq(&pool->lock);
		idr_remove(&pool->worker_idr, id);
		spin_unlock_irq(&pool->lock);
	}
	kfree(worker);
	return NULL;
}

After the worker is created, it is put on the worker_pool's idle list and its thread is woken up:

create_and_start_worker
    ---------> start_worker

static void start_worker(struct worker *worker)
{
	worker->flags |= WORKER_STARTED;
	worker->pool->nr_workers++;
	worker_enter_idle(worker); // set the worker's initial state to idle
	wake_up_process(worker->task);	// after wake_up_process(), the worker leaves idle on its own
}
static void worker_enter_idle(struct worker *worker)
{
	struct worker_pool *pool = worker->pool;

	if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
	    WARN_ON_ONCE(!list_empty(&worker->entry) &&
			 (worker->hentry.next || worker->hentry.pprev)))
		return;

	/* can't use worker_set_flags(), also called from start_worker() */
	worker->flags |= WORKER_IDLE;
	pool->nr_idle++;
	worker->last_active = jiffies;

	/* idle_list is LIFO */
	list_add(&worker->entry, &pool->idle_list); // add to the idle list

	if (too_many_workers(pool) && !timer_pending(&pool->idle_timer))
		mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT);

	/*
	 * Sanity check nr_running.  Because wq_unbind_fn() releases
	 * pool->lock between setting %WORKER_UNBOUND and zapping
	 * nr_running, the warning may trigger spuriously.  Check iff
	 * unbind is not in progress.
	 */
	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
		     pool->nr_workers == pool->nr_idle &&
		     atomic_read(&pool->nr_running));
}

3.2 workqueue initialization

At the end of init_workqueues() we saw the system pre-create some workqueues; normally, the system-provided workqueues are enough. The allocation function is alloc_workqueue():

alloc_workqueue
    ------------> __alloc_workqueue_key

struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
					       unsigned int flags,
					       int max_active,
					       struct lock_class_key *key,
					       const char *lock_name, ...)
{
	size_t tbl_size = 0;
	va_list args;
	struct workqueue_struct *wq;
	struct pool_workqueue *pwq;

	/* allocate wq and format name */
	if (flags & WQ_UNBOUND)
		tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
	// (1) allocate the workqueue_struct
	wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
	if (!wq)
		return NULL;

	if (flags & WQ_UNBOUND) {
		wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
		if (!wq->unbound_attrs)
			goto err_free_wq;
	}

	va_start(args, lock_name);
	vsnprintf(wq->name, sizeof(wq->name), fmt, args);
	va_end(args);
	// (2) pick max_active: 0 means the default WQ_DFL_ACTIVE; then clamp it
	max_active = max_active ?: WQ_DFL_ACTIVE;
	max_active = wq_clamp_max_active(max_active, flags, wq->name);

	/* init wq */
	wq->flags = flags;
	wq->saved_max_active = max_active;
	mutex_init(&wq->mutex);
	atomic_set(&wq->nr_pwqs_to_flush, 0);
	INIT_LIST_HEAD(&wq->pwqs);
	INIT_LIST_HEAD(&wq->flusher_queue);
	INIT_LIST_HEAD(&wq->flusher_overflow);
	INIT_LIST_HEAD(&wq->maydays);

	lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
	INIT_LIST_HEAD(&wq->list);
	// (3) allocate the workqueue's pool_workqueues;
	// the pool_workqueue links the workqueue to a worker_pool
	if (alloc_and_link_pwqs(wq) < 0)
		goto err_free_wq;

	/*
	 * Workqueues which may be used during memory reclaim should
	 * have a rescuer to guarantee forward progress.
	 */
	// (4) for a WQ_MEM_RECLAIM workqueue, create its rescuer_thread()
	// kernel thread; we won't discuss this case here
	if (flags & WQ_MEM_RECLAIM) {
		struct worker *rescuer;

		rescuer = alloc_worker();
		if (!rescuer)
			goto err_destroy;

		rescuer->rescue_wq = wq;
		rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
					       wq->name);
		if (IS_ERR(rescuer->task)) {
			kfree(rescuer);
			goto err_destroy;
		}

		wq->rescuer = rescuer;
		rescuer->task->flags |= PF_NO_SETAFFINITY;
		wake_up_process(rescuer->task);
	}
	// (5) if requested, create the workqueue's sysfs files
	if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
		goto err_destroy;

	/*
	 * wq_pool_mutex protects global freeze state and workqueues list.
	 * Grab it, adjust max_active and add the new @wq to workqueues
	 * list.
	 */
	mutex_lock(&wq_pool_mutex);

	mutex_lock(&wq->mutex);
	for_each_pwq(pwq, wq)
		pwq_adjust_max_active(pwq);
	mutex_unlock(&wq->mutex);
	// (6) add the new workqueue to the global list 'workqueues'
	list_add(&wq->list, &workqueues);

	mutex_unlock(&wq_pool_mutex);

	return wq;

err_free_wq:
	free_workqueue_attrs(wq->unbound_attrs);
	kfree(wq);
	return NULL;
err_destroy:
	destroy_workqueue(wq);
	return NULL;
}

The function above allocates a workqueue and, via the pool_workqueue structure, associates the workqueue_struct with a worker_pool. The association is done in alloc_and_link_pwqs():

alloc_workqueue
    ------------> __alloc_workqueue_key
        ----------------> alloc_and_link_pwqs

static int alloc_and_link_pwqs(struct workqueue_struct *wq)
{
	bool highpri = wq->flags & WQ_HIGHPRI;
	int cpu;
	// (3.1) normal workqueue:
	// how pool_workqueue links the workqueue to the worker_pool
	if (!(wq->flags & WQ_UNBOUND)) {
		wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
		if (!wq->cpu_pwqs)
			return -ENOMEM;
		// allocate a per-CPU pool_workqueue for this workqueue, stored in wq->cpu_pwqs
		for_each_possible_cpu(cpu) {
			struct pool_workqueue *pwq =
				per_cpu_ptr(wq->cpu_pwqs, cpu);
			struct worker_pool *cpu_pools =
				per_cpu(cpu_worker_pools, cpu);
			// point the pool_workqueue at the normal worker_pool created during init
			init_pwq(pwq, wq, &cpu_pools[highpri]);

			mutex_lock(&wq->mutex);
			// link the pool_workqueue into the workqueue
			link_pwq(pwq);
			mutex_unlock(&wq->mutex);
		}
		return 0;
	} else {
		return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
	}
}

static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
		     struct worker_pool *pool)
{
	BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);

	memset(pwq, 0, sizeof(*pwq));

	pwq->pool = pool; // point the pool_workqueue at the normal worker_pool
	pwq->wq = wq;
	pwq->flush_color = -1;
	pwq->refcnt = 1;
	INIT_LIST_HEAD(&pwq->delayed_works);
	INIT_LIST_HEAD(&pwq->pwqs_node);
	INIT_LIST_HEAD(&pwq->mayday_node);
	INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
}

static void link_pwq(struct pool_workqueue *pwq)
{
	struct workqueue_struct *wq = pwq->wq;

	lockdep_assert_held(&wq->mutex);

	/* may be called multiple times, ignore if already linked */
	if (!list_empty(&pwq->pwqs_node))
		return;

	/*
	 * Set the matching work_color.  This is synchronized with
	 * wq->mutex to avoid confusing flush_workqueue().
	 */
	pwq->work_color = wq->work_color;

	/* sync max_active to the current setting */
	pwq_adjust_max_active(pwq);

	/* link in @pwq */
	list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
}

Once the worker_pool and workqueue_struct are initialized, we can use them to run actual work.

3.3 work_struct initialization

A work item is normally initialized with the INIT_WORK(_work, _func) macro, where _func is the function to execute:

#define INIT_WORK(_work, _func)						\
do {								\
	__INIT_WORK((_work), (_func), 0);			\
} while (0)

#define __INIT_WORK(_work, _func, _onstack)				\
do {								\
	__init_work((_work), _onstack);				\
	(_work)->data = (atomic_long_t) WORK_DATA_INIT();	\
	INIT_LIST_HEAD(&(_work)->entry);			\
	PREPARE_WORK((_work), (_func));				\
} while (0)

#define PREPARE_WORK(_work, _func)					\
do {								\
	(_work)->func = (_func);				\
} while (0)

The macro's effect is to set the work_struct's func member to the work function.
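
Since the handler only receives the work_struct pointer, the usual pattern is to embed the work_struct in a larger private structure and recover it with container_of(). A hypothetical driver-side sketch:

struct my_dev {                        /* hypothetical driver context */
	int irq_count;
	struct work_struct work;
};

static void my_work_func(struct work_struct *work)
{
	/* recover the containing structure from the embedded work_struct */
	struct my_dev *dev = container_of(work, struct my_dev, work);

	pr_info("irq_count = %d\n", dev->irq_count);
}

/* in probe(): INIT_WORK(&dev->work, my_work_func); */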

4 How a work item is executed

queue_work() adds a work_struct to a workqueue. Its prototype is shown below; the parameters are the workqueue_struct to queue on and the work to add:

static inline bool queue_work(struct workqueue_struct *wq,
			      struct work_struct *work)
{
	return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
bool queue_work_on(int cpu, struct workqueue_struct *wq,
		   struct work_struct *work)
{
	bool ret = false;
	unsigned long flags;

	local_irq_save(flags);

	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
		__queue_work(cpu, wq, work);
		ret = true;
	}

	local_irq_restore(flags);
	return ret;
}

The core function is __queue_work():

static void __queue_work(int cpu, struct workqueue_struct *wq,
			 struct work_struct *work)
{
	struct pool_workqueue *pwq;
	struct worker_pool *last_pool;
	struct list_head *worklist;
	unsigned int work_flags;
	unsigned int req_cpu = cpu;

	/*
	 * While a work item is PENDING && off queue, a task trying to
	 * steal the PENDING will busy-loop waiting for it to either get
	 * queued or lose PENDING.  Grabbing PENDING and queueing should
	 * happen with IRQ disabled.
	 */
	WARN_ON_ONCE(!irqs_disabled());

	debug_work_activate(work);

	/* if dying, only works from the same workqueue are allowed */
	if (unlikely(wq->flags & __WQ_DRAINING) &&
	    WARN_ON_ONCE(!is_chained_work(wq)))
		return;
retry:	// (1) if no CPU was specified, use the current CPU
	if (req_cpu == WORK_CPU_UNBOUND)
		cpu = raw_smp_processor_id();

	/* pwq which will be used unless @work is executing elsewhere */
	if (!(wq->flags & WQ_UNBOUND))
		// (2) for a normal wq, use the current CPU's normal worker_pool
		pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
	else
               // (3) for an unbound wq, use the worker_pool of the current CPU's node
		pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));

	/*
	 * If @work was previously on a different pool, it might still be
	 * running there, in which case the work needs to be queued on that
	 * pool to guarantee non-reentrancy.
	 */
	// (4) if the work is currently being executed on another pool's worker,
	// queue it to that worker to avoid reentrancy
	last_pool = get_work_pool(work);
	if (last_pool && last_pool != pwq->pool) {
		struct worker *worker;

		spin_lock(&last_pool->lock);

		worker = find_worker_executing_work(last_pool, work);

		if (worker && worker->current_pwq->wq == wq) {
			pwq = worker->current_pwq;
		} else {
			/* meh... not running there, queue here */
			spin_unlock(&last_pool->lock);
			spin_lock(&pwq->pool->lock);
		}
	} else {
		spin_lock(&pwq->pool->lock);
	}

	/*
	 * pwq is determined and locked.  For unbound pools, we could have
	 * raced with pwq release and it could already be dead.  If its
	 * refcnt is zero, repeat pwq selection.  Note that pwqs never die
	 * without another pwq replacing it in the numa_pwq_tbl or while
	 * work items are executing on it, so the retrying is guaranteed to
	 * make forward-progress.
	 */
	if (unlikely(!pwq->refcnt)) {
		if (wq->flags & WQ_UNBOUND) {
			spin_unlock(&pwq->pool->lock);
			cpu_relax();
			goto retry;
		}
		/* oops */
		WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
			  wq->name, cpu);
	}

	/* pwq determined, queue */
	trace_workqueue_queue_work(req_cpu, pwq, work);

	if (WARN_ON(!list_empty(&work->entry))) {
		spin_unlock(&pwq->pool->lock);
		return;
	}

	pwq->nr_in_flight[pwq->work_color]++;
	work_flags = work_color_to_flags(pwq->work_color);
	// (5) if max_active has not been reached, hang the work on pool->worklist
	if (likely(pwq->nr_active < pwq->max_active)) {
		trace_workqueue_activate_work(work);
		pwq->nr_active++;
		worklist = &pwq->pool->worklist;
	} else {
		work_flags |= WORK_STRUCT_DELAYED;
		worklist = &pwq->delayed_works;	// otherwise, hang the work on the holding queue pwq->delayed_works
	}
	// (6) insert the work into the chosen worklist
	insert_work(pwq, work, worklist, work_flags);

	spin_unlock(&pwq->pool->lock);
}

static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
			struct list_head *head, unsigned int extra_flags)
{
	struct worker_pool *pool = pwq->pool;

	/* we own @work, set data and link */
	set_work_pwq(work, pwq, extra_flags);
	list_add_tail(&work->entry, head);
	get_pwq(pwq);

	/*
	 * Ensure either wq_worker_sleeping() sees the above
	 * list_add_tail() or we see zero nr_running to avoid workers lying
	 * around lazily while there are works to be processed.
	 */
	smp_mb();

	if (__need_more_worker(pool)) // if no worker is currently running, wake one up
		wake_up_worker(pool); // take a worker from the pool's idle list and wake it
}

As we can see, __queue_work()'s job is to add the work to the worker_pool's worklist and then check whether any worker in the pool is running; if not, it wakes up a worker thread to process the pending work.
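
For reference, waking a worker is cheap: wake_up_worker() just picks the first worker off the pool's idle list and wakes its task (a sketch following kernel/workqueue.c of this era):

static struct worker *first_worker(struct worker_pool *pool)
{
	if (unlikely(list_empty(&pool->idle_list)))
		return NULL;

	/* idle_list is LIFO, so this is the most recently idled worker */
	return list_first_entry(&pool->idle_list, struct worker, entry);
}

static void wake_up_worker(struct worker_pool *pool)
{
	struct worker *worker = first_worker(pool);

	if (likely(worker))
		wake_up_process(worker->task);
}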

The worker thread's handler function is worker_thread():

static int worker_thread(void *__worker)
{
	struct worker *worker = __worker;
	struct worker_pool *pool = worker->pool;

	/* tell the scheduler that this is a workqueue worker */
	worker->task->flags |= PF_WQ_WORKER;
woke_up:
	spin_lock_irq(&pool->lock);

	/* am I supposed to die? */
	if (unlikely(worker->flags & WORKER_DIE)) {
		spin_unlock_irq(&pool->lock);
		WARN_ON_ONCE(!list_empty(&worker->entry));
		worker->task->flags &= ~PF_WQ_WORKER;
		return 0;
	}

	// as seen during worker_pool init, a freshly created worker sits on the
	// idle list; once woken, it first removes itself from the idle list
	worker_leave_idle(worker);
recheck:
	/* no more worker necessary? */
	// keep running if this worker is still needed, otherwise go idle.
	// "need more worker" means: pool->worklist is not empty && pool->nr_running == 0,
	// i.e. there is pending work and nothing is currently running
	if (!need_more_worker(pool))
		goto sleep;
	// if pool->nr_idle == 0, the idle list has no standby workers left,
	// so create some more workers first
	/* do we need to manage? */
	if (unlikely(!may_start_working(pool)) && manage_workers(worker))
		goto recheck;

	/*
	 * ->scheduled list can only be filled while a worker is
	 * preparing to process a work or actually processing it.
	 * Make sure nobody diddled with it while I was sleeping.
	 */
	WARN_ON_ONCE(!list_empty(&worker->scheduled));

	/*
	 * Finish PREP stage.  We're guaranteed to have at least one idle
	 * worker or that someone else has already assumed the manager
	 * role.  This is where @worker starts participating in concurrency
	 * management if applicable and concurrency management is restored
	 * after being rebound.  See rebind_workers() for details.
	 */
	worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);

	do {
		// if pool->worklist is not empty, take one work item from it to process
		struct work_struct *work =
			list_first_entry(&pool->worklist,
					 struct work_struct, entry);

		if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) {
			/* optimization path, not strictly necessary */
			process_one_work(worker, work); // execute an ordinary work item
			if (unlikely(!list_empty(&worker->scheduled)))
				process_scheduled_works(worker);
		} else {
			// execute work that was deliberately scheduled to this particular worker.
			// Ordinary work sits on the pool's shared list pool->worklist;
			// only special work is dispatched to a specific worker's worker->scheduled,
			// namely: 1. barrier work inserted by flush_work();
			// 2. work pushed to this worker from another worker on a collision
			move_linked_works(work, &worker->scheduled, NULL);
			process_scheduled_works(worker);
		}
	} while (keep_working(pool));
	// keep_working() condition:
	// pool->worklist is not empty && (pool->nr_running <= 1)
	worker_set_flags(worker, WORKER_PREP, false);
sleep:
	if (unlikely(need_to_manage_workers(pool)) && manage_workers(worker))
		goto recheck;

	/*
	 * pool->lock is held and there's no work to process and no need to
	 * manage, sleep.  Workers are woken up only while holding
	 * pool->lock or from local cpu, so setting the current state
	 * before releasing pool->lock is enough to prevent losing any
	 * event.
	 */
	worker_enter_idle(worker); // go back on the pool's idle list
	__set_current_state(TASK_INTERRUPTIBLE); // prepare the worker thread to sleep
	spin_unlock_irq(&pool->lock);
	schedule(); // yield the CPU
	goto woke_up;
}

The function in which a worker actually processes a work item is:

worker_thread
    -------------> process_one_work:

static void process_one_work(struct worker *worker, struct work_struct *work)
__releases(&pool->lock)
__acquires(&pool->lock)
{
	struct pool_workqueue *pwq = get_work_pwq(work);
	struct worker_pool *pool = worker->pool;
	bool cpu_intensive = pwq->wq->flags & WQ_CPU_INTENSIVE;
	int work_color;
	struct worker *collision;
#ifdef CONFIG_LOCKDEP
	/*
	 * It is permissible to free the struct work_struct from
	 * inside the function that is called from it, this we need to
	 * take into account for lockdep too.  To avoid bogus "held
	 * lock freed" warnings as well as problems when looking into
	 * work->lockdep_map, make a copy and use that here.
	 */
	struct lockdep_map lockdep_map;
 
	lockdep_copy_map(&lockdep_map, &work->lockdep_map);
#endif
	/* ensure we're on the correct CPU */
	WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) &&
		     raw_smp_processor_id() != pool->cpu);
 
	// (8.1) if the work is already executing on another worker of this worker_pool,
	// move it to that worker's scheduled queue to run later
	/*
	 * A single work shouldn't be executed concurrently by
	 * multiple workers on a single cpu.  Check whether anyone is
	 * already processing the work.  If so, defer the work to the
	 * currently executing one.
	 */
	collision = find_worker_executing_work(pool, work);
	if (unlikely(collision)) {
		move_linked_works(work, &collision->scheduled, NULL);
		return;
	}
 
	// (8.2) add the worker to the busy hash pool->busy_hash
	/* claim and dequeue */
	debug_work_deactivate(work);
	hash_add(pool->busy_hash, &worker->hentry, (unsigned long)work);
	worker->current_work = work;
	worker->current_func = work->func;
	worker->current_pwq = pwq;
	work_color = get_work_color(work);
 
	list_del_init(&work->entry);
 
	// (8.3) if the work's wq is CPU-intensive (WQ_CPU_INTENSIVE), its execution
	// leaves the worker_pool's concurrency management and is left to the scheduler
	/*
	 * CPU intensive works don't participate in concurrency management.
	 * They're the scheduler's responsibility.  This takes @worker out
	 * of concurrency management and the next code block will chain
	 * execution of the pending work items.
	 */
	if (unlikely(cpu_intensive))
		worker_set_flags(worker, WORKER_CPU_INTENSIVE);
 
	// (8.4) for UNBOUND or CPU_INTENSIVE work, check whether an idle worker needs waking;
	// ordinary work never triggers this
	/*
	 * Wake up another worker if necessary.  The condition is always
	 * false for normal per-cpu workers since nr_running would always
	 * be >= 1 at this point.  This is used to chain execution of the
	 * pending work items for WORKER_NOT_RUNNING workers such as the
	 * UNBOUND and CPU_INTENSIVE ones.
	 */
	if (need_more_worker(pool))
		wake_up_worker(pool);
 
	/*
	 * Record the last pool and clear PENDING which should be the last
	 * update to @work.  Also, do this inside @pool->lock so that
	 * PENDING and queued state changes happen together while IRQ is
	 * disabled.
	 */
	set_work_pool_and_clear_pending(work, pool->id);
 
	spin_unlock_irq(&pool->lock);
 
	lock_map_acquire_read(&pwq->wq->lockdep_map);
	lock_map_acquire(&lockdep_map);
	trace_workqueue_execute_start(work);
	// (8.5) call the work function
	worker->current_func(work);
	/*
	 * While we must be careful to not use "work" after this, the trace
	 * point will only record its address.
	 */
	trace_workqueue_execute_end(work);
	lock_map_release(&lockdep_map);
	lock_map_release(&pwq->wq->lockdep_map);
 
	if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
		pr_err("BUG: workqueue leaked lock or atomic: %s/0x%08x/%d\n"
		       "     last function: %pf\n",
		       current->comm, preempt_count(), task_pid_nr(current),
		       worker->current_func);
		debug_show_held_locks(current);
		dump_stack();
	}
 
	/*
	 * The following prevents a kworker from hogging CPU on !PREEMPT
	 * kernels, where a requeueing work item waiting for something to
	 * happen could deadlock with stop_machine as such work item could
	 * indefinitely requeue itself while all other CPUs are trapped in
	 * stop_machine. At the same time, report a quiescent RCU state so
	 * the same condition doesn't freeze RCU.
	 */
	cond_resched_rcu_qs();
 
	spin_lock_irq(&pool->lock);
 
	/* clear cpu intensive status */
	if (unlikely(cpu_intensive))
		worker_clr_flags(worker, WORKER_CPU_INTENSIVE);
 
	/* we're done with it, release */
	hash_del(&worker->hentry);
	worker->current_work = NULL;
	worker->current_func = NULL;
	worker->current_pwq = NULL;
	worker->desc_valid = false;
	pwq_dec_nr_in_flight(pwq, work_color);
}

The relationships among these structures can be roughly pictured in one diagram:

[Figure: relationships among work, workqueue, pwq, worker_pool and worker]

To summarize the flow of a work item:

(1) From the workqueue_struct, find the corresponding pool_workqueue.

(2) From the pool_workqueue, find the worker_pool.

(3) Hang the work on the worker_pool's worklist.

(4) A worker_thread picks up the work and executes it; if no worker is running, wake one up.

How do we know that an asynchronous work item has finished? A trick is used here: a new work item, a wq_barrier, is inserted right after the target work; if the wq_barrier has executed, the target work must have finished.

/**
 * flush_work - wait for a work to finish executing the last queueing instance
 * @work: the work to flush
 *
 * Wait until @work has finished execution.  @work is guaranteed to be idle
 * on return if it hasn't been requeued since flush started.
 *
 * Return:
 * %true if flush_work() waited for the work to finish execution,
 * %false if it was already idle.
 */
bool flush_work(struct work_struct *work)
{
	struct wq_barrier barr;
 
	lock_map_acquire(&work->lockdep_map);
	lock_map_release(&work->lockdep_map);
 
	if (start_flush_work(work, &barr)) {
		// wait for the signal that the barr work has completed
		wait_for_completion(&barr.done);
		destroy_work_on_stack(&barr.work);
		return true;
	} else {
		return false;
	}
}

static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
{
	struct worker *worker = NULL;
	struct worker_pool *pool;
	struct pool_workqueue *pwq;
 
	might_sleep();
 
	// (1) if the work's worker_pool is NULL, the work has already finished
	local_irq_disable();
	pool = get_work_pool(work);
	if (!pool) {
		local_irq_enable();
		return false;
	}
 
	spin_lock(&pool->lock);
	/* see the comment in try_to_grab_pending() with the same code */
	pwq = get_work_pwq(work);
	if (pwq) {
		// (2) if the worker_pool that the work's pwq points to differs from the one found above, the work has already finished
		if (unlikely(pwq->pool != pool))
			goto already_gone;
	} else {
		// (3) if the work's pwq is NULL and no worker is currently executing it, the work has already finished
		worker = find_worker_executing_work(pool, work);
		if (!worker)
			goto already_gone;
		pwq = worker->current_pwq;
	}
 
	// (4) if the work has not finished, insert the barr work after it
	insert_wq_barrier(pwq, barr, work, worker);
	spin_unlock_irq(&pool->lock);
 
	/*
	 * If @max_active is 1 or rescuer is in use, flushing another work
	 * item on the same workqueue may lead to deadlock.  Make sure the
	 * flusher is not running on the same workqueue by verifying write
	 * access.
	 */
	if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
		lock_map_acquire(&pwq->wq->lockdep_map);
	else
		lock_map_acquire_read(&pwq->wq->lockdep_map);
	lock_map_release(&pwq->wq->lockdep_map);
 
	return true;
already_gone:
	spin_unlock_irq(&pool->lock);
	return false;
}

static void insert_wq_barrier(struct pool_workqueue *pwq,
			      struct wq_barrier *barr,
			      struct work_struct *target, struct worker *worker)
{
	struct list_head *head;
	unsigned int linked = 0;
 
	/*
	 * debugobject calls are safe here even with pool->lock locked
	 * as we know for sure that this will not trigger any of the
	 * checks and call back into the fixup functions where we
	 * might deadlock.
	 */
	// (4.1) the barr work's handler is wq_barrier_func()
	INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
	__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
	init_completion(&barr->done);
 
	/*
	 * If @target is currently being executed, schedule the
	 * barrier to the worker; otherwise, put it after @target.
	 */
	// (4.2) if the work is currently executing in a worker, insert the barr work into that worker's scheduled queue
	if (worker)
		head = worker->scheduled.next;
	// otherwise insert the barr work into the normal worklist, right after the
	// target work, and set the WORK_STRUCT_LINKED flag
	else {
		unsigned long *bits = work_data_bits(target);
 
		head = target->entry.next;
		/* there can already be other linked works, inherit and set */
		linked = *bits & WORK_STRUCT_LINKED;
		__set_bit(WORK_STRUCT_LINKED_BIT, bits);
	}
 
	debug_work_activate(&barr->work);
	insert_work(pwq, &barr->work, head,
		    work_color_to_flags(WORK_NO_COLOR) | linked);
}

static void wq_barrier_func(struct work_struct *work)
{
	struct wq_barrier *barr = container_of(work, struct wq_barrier, work);
	// (4.1.1) the barr work has executed; send the completion signal
	complete(&barr->done);
}
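
As a practical note, the sample module from section 1 should make sure its work has finished before the module is unloaded. A safer exit path would be something like this (a sketch, not part of the original sample):

static void test_exit(void)
{
	/* wait for (or cancel) any pending/running instance of work_test;
	 * this matters especially for the system_wq variant, where no
	 * private workqueue is destroyed on unload */
	cancel_work_sync(&work_test);
	destroy_workqueue(workqueue_test);
}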

5 Workqueue public interface functions

The workqueue machinery implemented by CMWQ is wrapped in a set of public interface functions.

5.1 schedule_work()

Queues the work on the default system wq, system_wq; WORK_CPU_UNBOUND means the worker comes from the normal worker_pool bound to the current CPU.

static inline bool schedule_work(struct work_struct *work)
{
	return queue_work(system_wq, work);
}
static inline bool queue_work(struct workqueue_struct *wq,
			      struct work_struct *work)
{
	return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}

5.2 schedule_work_on()
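
Like schedule_work(), but queues the work on system_wq for a specific CPU: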

static inline bool schedule_work_on(int cpu, struct work_struct *work)
{
	return queue_work_on(cpu, system_wq, work);
}

5.3 schedule_delayed_work()

Starts a timer; when the timer expires, delayed_work_timer_fn() queues the work on the default system wq, system_wq.

static inline bool schedule_delayed_work(struct delayed_work *dwork,
					 unsigned long delay)
{
	return queue_delayed_work(system_wq, dwork, delay);
}

static inline bool queue_delayed_work(struct workqueue_struct *wq,
				      struct delayed_work *dwork,
				      unsigned long delay)
{
	return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
}

bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
			   struct delayed_work *dwork, unsigned long delay)
{
	struct work_struct *work = &dwork->work;
	bool ret = false;
	unsigned long flags;
 
	/* read the comment in __queue_work() */
	local_irq_save(flags);
 
	if (!test_and_set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work))) {
		__queue_delayed_work(cpu, wq, dwork, delay);
		ret = true;
	}
 
	local_irq_restore(flags);
	return ret;
}

static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
				struct delayed_work *dwork, unsigned long delay)
{
	struct timer_list *timer = &dwork->timer;
	struct work_struct *work = &dwork->work;
 
	WARN_ON_ONCE(timer->function != delayed_work_timer_fn ||
		     timer->data != (unsigned long)dwork);
	WARN_ON_ONCE(timer_pending(timer));
	WARN_ON_ONCE(!list_empty(&work->entry));
 
	/*
	 * If @delay is 0, queue @dwork->work immediately.  This is for
	 * both optimization and correctness.  The earliest @timer can
	 * expire is on the closest next tick and delayed_work users depend
	 * on that there's no such delay when @delay is 0.
	 */
	if (!delay) {
		__queue_work(cpu, wq, &dwork->work);
		return;
	}
 
	timer_stats_timer_set_start_info(&dwork->timer);
 
	dwork->wq = wq;
	dwork->cpu = cpu;
	timer->expires = jiffies + delay;
 
	if (unlikely(cpu != WORK_CPU_UNBOUND))
		add_timer_on(timer, cpu);
	else
		add_timer(timer);
}

void delayed_work_timer_fn(unsigned long __data)
{
	struct delayed_work *dwork = (struct delayed_work *)__data;
 
	/* should have been called from irqsafe timer with irq already off */
	__queue_work(dwork->cpu, dwork->wq, &dwork->work);
}
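
A minimal delayed-work usage sketch (the names are hypothetical; msecs_to_jiffies() converts milliseconds into the jiffies delay expected here):

static struct delayed_work my_dwork;      /* hypothetical */

static void my_dwork_func(struct work_struct *work)
{
	pr_info("delayed work ran\n");
}

/* once, at init time: */
INIT_DELAYED_WORK(&my_dwork, my_dwork_func);

/* run my_dwork_func() on system_wq roughly 100 ms from now: */
schedule_delayed_work(&my_dwork, msecs_to_jiffies(100));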

Much of the content of this article comes from: http://kernel.meizu.com/linux-workqueue.html

 
