1 submit_bio()
接上一篇文章,mpage_bio_submit() 函数最终调用 submit_bio(),将 bio 请求提交到 generic block layer。
void submit_bio(int rw, struct bio *bio)
{
bio->bi_rw |= rw;
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio)) {
unsigned int count;
if (unlikely(rw & REQ_WRITE_SAME))
count = bdev_logical_block_size(bio->bi_bdev) >> 9;
else
count = bio_sectors(bio);
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
current->comm, task_pid_nr(current),
(rw & WRITE) ? "WRITE" : "READ",
(unsigned long long)bio->bi_iter.bi_sector,
bdevname(bio->bi_bdev, b),
count);
}
}
generic_make_request(bio); //block/blk-core.c/line1905
}
- 如果 bio 中有数据,做一些通用处理(没太明白)。
- 调用 generic_make_request() 。
2 generic_make_request()
void generic_make_request(struct bio *bio)
{
struct bio_list bio_list_on_stack;
if (!generic_make_request_checks(bio))
return;
/*
* We only want one ->make_request_fn to be active at a time, else
* stack usage with stacked devices could be a problem. So use
* current->bio_list to keep a list of requests submited by a
* make_request_fn function. current->bio_list is also used as a
* flag to say if generic_make_request is currently active in this
* task or not. If it is NULL, then no make_request is active. If
* it is non-NULL, then a make_request is active, and new requests
* should be added at the tail
*/
if (current->bio_list) {
bio_list_add(current->bio_list, bio);
return;
}
/* following loop may be a bit non-obvious, and so deserves some
* explanation.
* Before entering the loop, bio->bi_next is NULL (as all callers
* ensure that) so we have a list with a single bio.
* We pretend that we have just taken it off a longer list, so
* we assign bio_list to a pointer to the bio_list_on_stack,
* thus initialising the bio_list of new bios to be
* added. ->make_request() may indeed add some more bios
* through a recursive call to generic_make_request. If it
* did, we find a non-NULL value in bio_list and re-enter the loop
* from the top. In this case we really did just take the bio
* of the top of the list (no pretending) and so remove it from
* bio_list, and call into ->make_request() again.
*/
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack);
current->bio_list = &bio_list_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
q->make_request_fn(q, bio); //line1583/blk_queue_bio()
bio = bio_list_pop(current->bio_list);
} while (bio);
current->bio_list = NULL; /* deactivate */
}
这个函数注释比函数本体还多,说明这个函数逻辑有点奇怪。
- 首先检查一下 bio 是否合法。
- 如果 current->bio_list 存在,将 bio 加入 bio_list ,然后返回。
- 初始化一个 bio_list,赋值给 current->bio_list。
- 从 bio 中取出本次 bio 要用到的 bdev 设备的 request_queue。
- 调用 q->make_request_fn()。
- 将 bio 从 current->bio_list pop 掉。
- 检查 bio 是否为空,如果是,清空 current->bio_list,如果不是,继续循环执行。
这个函数诡异的地方在于,q->make_request_fn 中可能会递归调用 generic_make_request。再结合注释就比较容易看懂了。
3 q->make_request_fn()
首先来看一下这个函数是在哪赋值的
struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
spinlock_t *lock)
{
if (!q)
return NULL;
q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, 0);
if (!q->fq)
return NULL;
if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
goto fail;
q->request_fn = rfn;
q->prep_rq_fn = NULL;
q->unprep_rq_fn = NULL;
q->queue_flags |= QUEUE_FLAG_DEFAULT;
/* Override internal queue lock with supplied lock pointer */
if (lock)
q->queue_lock = lock;
/*
* This also sets hw/phys segments, boundary and size
*/
blk_queue_make_request(q, blk_queue_bio);
q->sg_reserved_size = INT_MAX;
/* Protect q->elevator from elevator_change */
mutex_lock(&q->sysfs_lock);
/* init elevator */
if (elevator_init(q, NULL)) {
mutex_unlock(&q->sysfs_lock);
goto fail;
}
mutex_unlock(&q->sysfs_lock);
return q;
fail:
blk_free_flush_queue(q->fq);
return NULL;
}
- 申请 blk_flush_queue。
- 初始化 request_list。
- 设置请求处理函数,当内核期望驱动程序执行某些动作时,就会使用这个函数。
- 调用 blk_queue_make_request(),将 q->make_request_fn 赋值为 blk_queue_bio() 函数。
- 初始化调度算法。
static void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
const bool sync = !!(bio->bi_rw & REQ_SYNC);
struct blk_plug *plug;
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
* ISA dma in theory)
*/
blk_queue_bounce(q, &bio);
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
bio_endio(bio, -EIO);
return;
}
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
spin_lock_irq(q->queue_lock);
where = ELEVATOR_INSERT_FLUSH;
goto get_rq;
}
/*
* Check if we can merge with the plugged list before grabbing
* any locks.
*/
if (!blk_queue_nomerges(q) &&
blk_attempt_plug_merge(q, bio, &request_count))
return;
spin_lock_irq(q->queue_lock);
el_ret = elv_merge(q, &req, bio);
if (el_ret == ELEVATOR_BACK_MERGE) {
if (bio_attempt_back_merge(q, req, bio)) {
elv_bio_merged(q, req, bio);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out_unlock;
}
} else if (el_ret == ELEVATOR_FRONT_MERGE) {
if (bio_attempt_front_merge(q, req, bio)) {
elv_bio_merged(q, req, bio);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out_unlock;
}
}
get_rq:
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
* rq allocator and io schedulers.
*/
rw_flags = bio_data_dir(bio);
if (sync)
rw_flags |= REQ_SYNC;
/*
* Grab a free request. This is might sleep but can not fail.
* Returns with the queue unlocked.
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
bio_endio(bio, PTR_ERR(req)); /* @q is dead */
goto out_unlock;
}
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
* We don't worry about that case for efficiency. It won't happen
* often, and the elevators are able to handle it.
*/
init_request_from_bio(req, bio);
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
req->cpu = raw_smp_processor_id();
plug = current->plug;
if (plug) {
/*
* If this is the first request added after a plug, fire
* of a plug trace.
*/
if (!request_count)
trace_block_plug(q);
else {
if (request_count >= BLK_MAX_REQUEST_COUNT) {
blk_flush_plug_list(plug, false);
trace_block_plug(q);
}
}
list_add_tail(&req->queuelist, &plug->list);
blk_account_io_start(req, true);
} else {
spin_lock_irq(q->queue_lock);
add_acct_request(q, req, where);
__blk_run_queue(q);
out_unlock:
spin_unlock_irq(q->queue_lock);
}
}
- blk_queue_bounce 函数的功能具体请看 ULK3第581页。
- 数据完整性校验。
- 请求队列允许 merge ,将 bio merge 到当前 plugged 的请求队列中。
- elv_merge 是核心函数,找到 bio 前向或者后向合并的请求。
- 进行后向合并操作,或者进行前向合并操作。
- 如果无法找到对应的请求实现合并,调用 get_request() 获取一个 empty request 请求。
- 采用 bio 对 request 请求进行初始化。
- 若线程有 plug 队列,当请求数量达到队列上限值,进行 unplug 操作,否则,将请求加入 plug 队列。等到 read_pages 函数退出时再进行 unplug 操作。
- 若线程没有 plug 队列,调用 add_acct_request 加入 request_queue。