Linux read系统调用之 submit_bio()

本文链接：https://blog.csdn.net/weixin_42205011/article/details/98731459

1 submit_bio()

接上一篇文章，mpage_bio_submit() 函数最终调用 submit_bio()，将 bio 请求提交到 generic block layer。

void submit_bio(int rw, struct bio *bio)
{
	bio->bi_rw |= rw;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio)) {
		unsigned int count;

		if (unlikely(rw & REQ_WRITE_SAME))
			count = bdev_logical_block_size(bio->bi_bdev) >> 9;
		else
			count = bio_sectors(bio);

		if (rw & WRITE) {
			count_vm_events(PGPGOUT, count);
		} else {
			task_io_account_read(bio->bi_iter.bi_size);
			count_vm_events(PGPGIN, count);
		}

		if (unlikely(block_dump)) {
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
			current->comm, task_pid_nr(current),
				(rw & WRITE) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_iter.bi_sector,
				bdevname(bio->bi_bdev, b),
				count);
		}
	}

	generic_make_request(bio);		//block/blk-core.c/line1905
}

如果 bio 中有数据，做一些通用处理（没太明白）。
调用 generic_make_request() 。

2 generic_make_request()

void generic_make_request(struct bio *bio)
{
	struct bio_list bio_list_on_stack;

	if (!generic_make_request_checks(bio))
		return;

	/*
	 * We only want one ->make_request_fn to be active at a time, else
	 * stack usage with stacked devices could be a problem.  So use
	 * current->bio_list to keep a list of requests submited by a
	 * make_request_fn function.  current->bio_list is also used as a
	 * flag to say if generic_make_request is currently active in this
	 * task or not.  If it is NULL, then no make_request is active.  If
	 * it is non-NULL, then a make_request is active, and new requests
	 * should be added at the tail
	 */
	if (current->bio_list) {
		bio_list_add(current->bio_list, bio);
		return;
	}

	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to a pointer to the bio_list_on_stack,
	 * thus initialising the bio_list of new bios to be
	 * added.  ->make_request() may indeed add some more bios
	 * through a recursive call to generic_make_request.  If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top.  In this case we really did just take the bio
	 * of the top of the list (no pretending) and so remove it from
	 * bio_list, and call into ->make_request() again.
	 */
	BUG_ON(bio->bi_next);
	bio_list_init(&bio_list_on_stack);
	current->bio_list = &bio_list_on_stack;
	do {
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);

		q->make_request_fn(q, bio);			//line1583/blk_queue_bio()

		bio = bio_list_pop(current->bio_list);
	} while (bio);
	current->bio_list = NULL; /* deactivate */
}

这个函数注释比函数本体还多，说明这个函数逻辑有点奇怪。

首先检查一下 bio 是否合法。
如果 current->bio_list 存在，将 bio 加入 bio_list ，然后返回。
初始化一个 bio_list，赋值给 current->bio_list。
从 bio 中取出本次 bio 要用到的 bdev 设备的 request_queue。
调用 q->make_request_fn()。
将 bio 从 current->bio_list pop 掉。
检查 bio 是否为空，如果是，清空 current->bio_list，如果不是，继续循环执行。

这个函数诡异的地方在于，q->make_request_fn 中可能会递归调用 generic_make_request。再结合注释就比较容易看懂了。

3 q->make_request_fn()

首先来看一下这个函数是在哪赋值的

struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
			 spinlock_t *lock)
{
	if (!q)
		return NULL;

	q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
	if (!q->fq)
		return NULL;

	if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
		goto fail;

	q->request_fn		= rfn;
	q->prep_rq_fn		= NULL;
	q->unprep_rq_fn		= NULL;
	q->queue_flags		|= QUEUE_FLAG_DEFAULT;

	/* Override internal queue lock with supplied lock pointer */
	if (lock)
		q->queue_lock		= lock;

	/*
	 * This also sets hw/phys segments, boundary and size
	 */
	blk_queue_make_request(q, blk_queue_bio);

	q->sg_reserved_size = INT_MAX;

	/* Protect q->elevator from elevator_change */
	mutex_lock(&q->sysfs_lock);

	/* init elevator */
	if (elevator_init(q, NULL)) {
		mutex_unlock(&q->sysfs_lock);
		goto fail;
	}

	mutex_unlock(&q->sysfs_lock);

	return q;

fail:
	blk_free_flush_queue(q->fq);
	return NULL;
}

申请 blk_flush_queue。
初始化 request_list。
设置请求处理函数，当内核期望驱动程序执行某些动作时，就会使用这个函数。
调用 blk_queue_make_request()，将 q->make_request_fn 赋值为 blk_queue_bio() 函数。
初始化调度算法。

static void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
	const bool sync = !!(bio->bi_rw & REQ_SYNC);
	struct blk_plug *plug;
	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
	struct request *req;
	unsigned int request_count = 0;

	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
		spin_lock_irq(q->queue_lock);
		where = ELEVATOR_INSERT_FLUSH;
		goto get_rq;
	}

	/*
	 * Check if we can merge with the plugged list before grabbing
	 * any locks.
	 */
	if (!blk_queue_nomerges(q) &&
	    blk_attempt_plug_merge(q, bio, &request_count))
		return;

	spin_lock_irq(q->queue_lock);

	el_ret = elv_merge(q, &req, bio);
	if (el_ret == ELEVATOR_BACK_MERGE) {
		if (bio_attempt_back_merge(q, req, bio)) {
			elv_bio_merged(q, req, bio);
			if (!attempt_back_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out_unlock;
		}
	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
		if (bio_attempt_front_merge(q, req, bio)) {
			elv_bio_merged(q, req, bio);
			if (!attempt_front_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out_unlock;
		}
	}

get_rq:
	/*
	 * This sync check and mask will be re-done in init_request_from_bio(),
	 * but we need to set it earlier to expose the sync flag to the
	 * rq allocator and io schedulers.
	 */
	rw_flags = bio_data_dir(bio);
	if (sync)
		rw_flags |= REQ_SYNC;

	/*
	 * Grab a free request. This is might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	req = get_request(q, rw_flags, bio, GFP_NOIO);
	if (IS_ERR(req)) {
		bio_endio(bio, PTR_ERR(req));	/* @q is dead */
		goto out_unlock;
	}

	/*
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	init_request_from_bio(req, bio);

	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
		req->cpu = raw_smp_processor_id();

	plug = current->plug;
	if (plug) {
		/*
		 * If this is the first request added after a plug, fire
		 * of a plug trace.
		 */
		if (!request_count)
			trace_block_plug(q);
		else {
			if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
		}
		list_add_tail(&req->queuelist, &plug->list);
		blk_account_io_start(req, true);
	} else {
		spin_lock_irq(q->queue_lock);
		add_acct_request(q, req, where);
		__blk_run_queue(q);
out_unlock:
		spin_unlock_irq(q->queue_lock);
	}
}

blk_queue_bounce 函数的功能具体请看 ULK3第581页。
数据完整性校验。
请求队列允许 merge ，将 bio merge 到当前 plugged 的请求队列中。
elv_merge 是核心函数，找到 bio 前向或者后向合并的请求。
进行后向合并操作，或者进行前向合并操作。
如果无法找到对应的请求实现合并，调用 get_request() 获取一个 empty request 请求。
采用 bio 对 request 请求进行初始化。
若线程有 plug 队列，当请求数量达到队列上限值，进行 unplug 操作，否则，将请求加入 plug 队列。等到 read_pages 函数退出时再进行 unplug 操作。
若线程没有 plug 队列，调用 add_acct_request 加入 request_queue。