一直在困惑文件系统sys_read、bio、io调度、硬中断、软中断、io完成通知之间的过程是怎么样的,通过代码的跟踪大致明白过程了
内核态的系统调用与bio的交界处的函数mpage_bio_submit(fs/mpage.c)
bio与io调度的交界处的函数__make_request(block/blk-core.c)
io调度与驱动层的交界函数__generic_unplug_device(block/blk-core.c)
io的返回路径
在驱动层的io完成之后,注册的设备完成方法(例如:scsi_done、__scsi_done)中会执行blk_complete_request
blk_complete_request是硬件的设备驱动的硬件中断上下文的最后一个函数了
blk_complete_request将会启动软中断BLOCK_SOFTIRQ
完成通知向上传递到bio层的时候,调用的函数就是mpage_bio_submit中注册的bio的end_io(mpage_end_io_read | mpage_end_io_write)
代码流程
fs/mpage.c
/*
 * Try to map every block of @page to one contiguous on-disk run and add the
 * page to @bio. Pages that do not fit that pattern take the "confused" path
 * and are read through block_read_full_page() instead.
 *
 * @bio:                 BIO currently being built, or NULL if none is open
 * @page:                page to read
 * @nr_pages:            bounds the block range requested from get_block() and
 *                       the size of a newly allocated BIO
 * @last_block_in_bio:   in/out: last disk block already placed in @bio
 * @map_bh:              in/out: buffer_head holding the latest block mapping
 * @first_logical_block: in/out: file block at which @map_bh's mapping starts
 * @get_block:           filesystem callback mapping a file block to disk
 *
 * Returns the BIO to keep building on: @bio, a newly allocated BIO, or
 * whatever mpage_bio_submit() returned after a submit.
 */
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
sector_t *last_block_in_bio, struct buffer_head *map_bh,
unsigned long *first_logical_block, get_block_t get_block)
{
struct inode *inode = page->mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
/* Disk block number for each block of the page, filled in below. */
sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
/* Index of the first unmapped block; blocks_per_page means "no hole". */
unsigned first_hole = blocks_per_page;
struct block_device *bdev = NULL;
int length;
int fully_mapped = 1;
unsigned nblocks;
unsigned relative_block;
/* A page that already has buffers attached goes to the fallback path. */
if (page_has_buffers(page))
goto confused;
/* File-block range covered by this request, clamped to the file size. */
block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
/*
 * Use the mapping already held in map_bh on entry, if this page starts
 * inside it: copy as many block numbers as the mapping and page allow.
 */
nblocks = map_bh->b_size >> blkbits;
if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
block_in_file < (*first_logical_block + nblocks)) {
unsigned map_offset = block_in_file - *first_logical_block;
unsigned last = nblocks - map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
/* Mapping fully consumed: mark map_bh unmapped. */
clear_buffer_mapped(map_bh);
break;
}
if (page_block == blocks_per_page)
break;
blocks[page_block] = map_bh->b_blocknr + map_offset +
relative_block;
page_block++;
block_in_file++;
}
bdev = map_bh->b_bdev;
}
/* Ask get_block() for the blocks of the page not covered above. */
map_bh->b_page = page;
while (page_block < blocks_per_page) {
map_bh->b_state = 0;
map_bh->b_size = 0;
if (block_in_file < last_block) {
map_bh->b_size = (last_block-block_in_file) << blkbits;
if (get_block(inode, block_in_file, map_bh, 0))
goto confused;
*first_logical_block = block_in_file;
}
/* Unmapped block: record the first hole and keep scanning. */
if (!buffer_mapped(map_bh)) {
fully_mapped = 0;
if (first_hole == blocks_per_page)
first_hole = page_block;
page_block++;
block_in_file++;
continue;
}
/* Buffer came back uptodate: pass it to map_buffer_to_page() and fall back. */
if (buffer_uptodate(map_bh)) {
map_buffer_to_page(page, map_bh, page_block);
goto confused;
}
if (first_hole != blocks_per_page)
goto confused; /* hole -> non-hole */
/* The new mapping must continue directly after the previous disk block. */
if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
goto confused;
nblocks = map_bh->b_size >> blkbits;
for (relative_block = 0; ; relative_block++) {
if (relative_block == nblocks) {
clear_buffer_mapped(map_bh);
break;
} else if (page_block == blocks_per_page)
break;
blocks[page_block] = map_bh->b_blocknr+relative_block;
page_block++;
block_in_file++;
}
bdev = map_bh->b_bdev;
}
/*
 * Zero the page from the first hole to its end. If the hole starts at
 * block 0 there is nothing to read: mark the page uptodate and return.
 */
if (first_hole != blocks_per_page) {
zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
if (first_hole == 0) {
SetPageUptodate(page);
unlock_page(page);
goto out;
}
} else if (fully_mapped) {
SetPageMappedToDisk(page);
}
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
if (bio && (*last_block_in_bio != blocks[0] - 1))
bio = mpage_bio_submit(READ, bio);
alloc_new:
if (bio == NULL) {
/* blkbits - 9 converts a block number into a 512-byte sector number. */
bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
GFP_KERNEL);
if (bio == NULL)
goto confused;
}
/* Only the bytes before the first hole are read from disk. */
length = first_hole << blkbits;
/* Page did not fit into this BIO: submit it and retry with a new one. */
if (bio_add_page(bio, page, length, 0) < length) {
bio = mpage_bio_submit(READ, bio);
goto alloc_new;
}
/*
 * Submit now if the mapping ended on a boundary buffer or the page has a
 * hole; otherwise remember the last disk block placed in the BIO.
 */
relative_block = block_in_file - *first_logical_block;
nblocks = map_bh->b_size >> blkbits;
if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
(first_hole != blocks_per_page))
bio = mpage_bio_submit(READ, bio);
else
*last_block_in_bio = blocks[blocks_per_page - 1];
out:
return bio;
/*
 * Fallback: submit any pending BIO, then read the page through
 * block_read_full_page() unless it is already uptodate.
 */
confused:
if (bio)
bio = mpage_bio_submit(READ, bio);
if (!PageUptodate(page))
block_read_full_page(page, get_block);
else
unlock_page(page);
goto out;
}
do_mpage_readpage主要是将page转换为bio
do_mpage_readpage中重点关注mpage_bio_submit和block_read_full_page
fs/buffer.c
/*
 * Read a whole page one buffer_head at a time.
 *
 * Buffers that need I/O are collected in arr[], locked and marked for
 * async read, then submitted with submit_bh(). The page must be locked by
 * the caller (enforced by the BUG_ON below).
 *
 * @page:      locked page to fill
 * @get_block: filesystem callback used to map unmapped buffers
 *
 * Always returns 0; a get_block() failure is recorded with SetPageError().
 */
int block_read_full_page(struct page *page, get_block_t *get_block)
{
struct inode *inode = page->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
unsigned int blocksize;
int nr, i;
int fully_mapped = 1;
BUG_ON(!PageLocked(page));
blocksize = 1 << inode->i_blkbits;
if (!page_has_buffers(page))
create_empty_buffers(page, blocksize, 0);
head = page_buffers(page);
/* iblock: first file block of the page; lblock: first block past i_size. */
iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
bh = head;
nr = 0;
i = 0;
/*
 * Stage one: walk the page's circular buffer list and collect the buffers
 * that need I/O. Each "continue" still evaluates the loop condition, so
 * i, iblock and bh advance on every pass.
 */
do {
if (buffer_uptodate(bh))
continue;
if (!buffer_mapped(bh)) {
int err = 0;
fully_mapped = 0;
if (iblock < lblock) {
WARN_ON(bh->b_size != blocksize);
err = get_block(inode, iblock, bh, 0);
if (err)
SetPageError(page);
}
/* Still unmapped: zero its part of the page, no I/O needed. */
if (!buffer_mapped(bh)) {
zero_user(page, i * blocksize, blocksize);
if (!err)
set_buffer_uptodate(bh);
continue;
}
/* Re-check: the buffer may be uptodate after get_block(). */
if (buffer_uptodate(bh))
continue;
}
arr[nr++] = bh;
} while (i++, iblock++, (bh = bh->b_this_page) != head);
if (fully_mapped)
SetPageMappedToDisk(page);
/* Nothing needs I/O: finish the page here. */
if (!nr) {
if (!PageError(page))
SetPageUptodate(page);
unlock_page(page);
return 0;
}
/* Stage two: lock the buffers */
for (i = 0; i < nr; i++) {
bh = arr[i];
lock_buffer(bh);
mark_buffer_async_read(bh);
}
/*
 * Stage three: start the I/O. A buffer found uptodate at this point is
 * completed directly instead of being submitted.
 */
for (i = 0; i < nr; i++) {
bh = arr[i];
if (buffer_uptodate(bh))
end_buffer_async_read(bh, 1);
else
submit_bh(READ, bh);
}
return 0;
}
重点关注submit_bh
fs/buffer.c
/*
 * Wrap a single buffer_head in a one-segment BIO and submit it.
 *
 * @rw: request flags; WRITE_BARRIER is OR-ed in when this is a write and
 *      the buffer is marked ordered
 * @bh: buffer supplying the page, offset, size, device and disk block
 *
 * Returns 0, or -EOPNOTSUPP when the BIO carries BIO_EOPNOTSUPP after
 * submit_bio() has returned.
 */
int submit_bh(int rw, struct buffer_head * bh)
{
	struct bio *bio;
	int ret;

	if (buffer_ordered(bh) && (rw & WRITE))
		rw |= WRITE_BARRIER;

	/* The "req" bit is test-and-set unconditionally; the write-error
	 * flag is only cleared for writes on a buffer that already had it. */
	if (test_set_buffer_req(bh) && (rw & WRITE))
		clear_buffer_write_io_error(bh);

	bio = bio_alloc(GFP_NOIO, 1);

	/* Target device and start position in 512-byte sectors. */
	bio->bi_bdev = bh->b_bdev;
	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);

	/* The only segment: the buffer's slice of its page. */
	bio->bi_io_vec[0].bv_page = bh->b_page;
	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
	bio->bi_io_vec[0].bv_len = bh->b_size;
	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = bh->b_size;

	/* Completion is routed back to the buffer_head. */
	bio->bi_private = bh;
	bio->bi_end_io = end_bio_bh_io_sync;

	/* bio_get()/bio_put() bracket the submission and the flag test. */
	bio_get(bio);
	submit_bio(rw, bio);
	ret = bio_flagged(bio, BIO_EOPNOTSUPP) ? -EOPNOTSUPP : 0;
	bio_put(bio);

	return ret;
}
do_mpage_readpage中执行mpage_bio_submit
static struct bio *mpage_bio_submit(int rw,struct bio *bio)
{
bio->bi_end_io= mpage_end_io_read;
if(rw == WRITE)
bio->bi_end_io= mpage_end_io_write;
submit_bio(rw, bio);
returnNULL;
}
block/blk-core.c
/*
 * Account a BIO and pass it to generic_make_request().
 *
 * @rw:  request flags, OR-ed into bio->bi_rw
 * @bio: BIO to submit
 *
 * For BIOs that carry data, the page-in/page-out VM counters are updated
 * (reads are also charged to the current task), and a debug line is printed
 * when block_dump is set.
 */
void submit_bio(int rw, struct bio *bio)
{
	int count = bio_sectors(bio);

	bio->bi_rw |= rw;

	if (bio_has_data(bio)) {
		const int writing = rw & WRITE;

		if (!writing) {
			task_io_account_read(bio->bi_size);
			count_vm_events(PGPGIN, count);
		} else {
			count_vm_events(PGPGOUT, count);
		}

		if (unlikely(block_dump)) {
			char b[BDEVNAME_SIZE];

			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
			       current->comm, task_pid_nr(current),
			       writing ? "WRITE" : "READ",
			       (unsigned long long)bio->bi_sector,
			       bdevname(bio->bi_bdev, b));
		}
	}

	generic_make_request(bio);
}
generic_make_request将会把请求递交给io调度层
/*
 * Entry point that feeds a BIO to __generic_make_request() without
 * recursing.
 *
 * current->bio_tail being non-NULL means a submission is already running
 * in this task. In that case the BIO is appended to the task's pending
 * list and the function returns; the outer call drains the list in its
 * loop. Otherwise this call becomes the outer one and processes @bio
 * followed by everything queued meanwhile.
 *
 * @bio: BIO to submit; its bi_next must be NULL on a top-level call
 */
void generic_make_request(struct bio *bio)
{
	if (current->bio_tail) {
		/* make_request is active */
		*(current->bio_tail) = bio;
		bio->bi_next = NULL;
		current->bio_tail = &bio->bi_next;
		return;
	}

	BUG_ON(bio->bi_next);
	do {
		/* Detach the head BIO; the rest stays on current->bio_list. */
		current->bio_list = bio->bi_next;
		if (bio->bi_next == NULL)
			current->bio_tail = &current->bio_list;
		else
			bio->bi_next = NULL;
		__generic_make_request(bio);
		bio = current->bio_list;
	} while (bio);
	current->bio_tail = NULL; /* deactivate */
}
/*
 * Validate a BIO, remap it from partition to whole device, and pass it to
 * the queue's make_request_fn.
 *
 * The loop repeats for as long as make_request_fn returns non-zero, looking
 * up the queue from bio->bi_bdev again each time. Any failed check ends the
 * BIO through bio_endio(); the error is -EIO unless a discard is sent to a
 * queue without discard support, which gives -EOPNOTSUPP.
 *
 * @bio: BIO to dispatch
 */
static inline void __generic_make_request(struct bio *bio)
{
	struct request_queue *q;
	sector_t old_sector;
	int ret, nr_sectors = bio_sectors(bio);
	dev_t old_dev;
	int err = -EIO;

	might_sleep();

	if (bio_check_eod(bio, nr_sectors))
		goto end_io;

	/* -1 marks "no previous position", so the first pass is not traced as a remap. */
	old_sector = -1;
	old_dev = 0;

	for (;;) {
		char b[BDEVNAME_SIZE];

		q = bdev_get_queue(bio->bi_bdev);
		if (unlikely(!q))
			goto end_io;

		/* Non-discard requests must fit the hardware sector limit. */
		if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
			     nr_sectors > queue_max_hw_sectors(q)))
			goto end_io;

		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) ||
		    should_fail_request(bio))
			goto end_io;

		blk_partition_remap(bio);

		if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
			goto end_io;

		if (old_sector != -1)
			trace_block_remap(q, bio, old_dev, old_sector);

		old_sector = bio->bi_sector;
		old_dev = bio->bi_bdev->bd_dev;

		/* Check the device end again, after the partition remap. */
		if (bio_check_eod(bio, nr_sectors))
			goto end_io;

		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
		    !blk_queue_discard(q)) {
			err = -EOPNOTSUPP;
			goto end_io;
		}

		trace_block_bio_queue(q, bio);

		ret = q->make_request_fn(q, bio);
		if (!ret)
			return;
	}

end_io:
	bio_endio(bio, err);
}
make_request_fn是何时指定的呢?
需要关注请求如何从page=>bh=>bio=>request=>elevator
/*
 * Install @mfn as the queue's make_request function and set the queue's
 * default tunables.
 *
 * @q:   queue to configure
 * @mfn: function that __generic_make_request() will call for each BIO
 */
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
	/* Request accounting defaults. */
	q->nr_requests = BLKDEV_MAX_RQ;
	q->make_request_fn = mfn;
	blk_queue_dma_alignment(q, 511);
	blk_queue_congestion_threshold(q);
	q->nr_batching = BLK_BATCH_REQ;

	/* Plugging: unplug after 4 requests or 3 ms (at least one tick). */
	q->unplug_thresh = 4;		/* hmm */
	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
	if (!q->unplug_delay)
		q->unplug_delay = 1;

	q->unplug_timer.function = blk_unplug_timeout;
	q->unplug_timer.data = (unsigned long)q;

	/* Queue limits. */
	blk_set_default_limits(&q->limits);
	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);

	/* Use the queue's embedded lock when no lock was supplied. */
	if (q->queue_lock == NULL)
		q->queue_lock = &q->__queue_lock;

	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}
block/blk-core.c
/*
 * Allocate and initialise a request queue on a given node.
 *
 * This is where __make_request is installed as the queue's
 * make_request_fn, through blk_queue_make_request().
 *
 * @rfn:     driver request function, stored in q->request_fn
 * @lock:    lock stored in q->queue_lock
 * @node_id: node passed to the allocator and recorded in q->node
 *
 * Returns the new queue, or NULL if allocation, free-list setup or
 * elevator initialisation fails.
 */
struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
	struct request_queue *q;

	q = blk_alloc_queue_node(GFP_KERNEL, node_id);
	if (q == NULL)
		return NULL;

	q->node = node_id;
	if (blk_init_free_list(q)) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	q->request_fn = rfn;
	q->prep_rq_fn = NULL;
	q->unplug_fn = generic_unplug_device;
	q->queue_flags = QUEUE_FLAG_DEFAULT;
	q->queue_lock = lock;

	blk_queue_make_request(q, __make_request);

	q->sg_reserved_size = INT_MAX;

	/* elevator_init() returns non-zero on failure. */
	if (elevator_init(q, NULL)) {
		blk_put_queue(q);
		return NULL;
	}

	blk_queue_congestion_threshold(q);
	return q;
}
block/blk-core.c
/*
 * Default make_request function: turn a BIO into work for the I/O scheduler.
 *
 * The BIO is either merged into an existing request (back or front merge)
 * or placed in a newly obtained request that is added to the elevator.
 *
 * @q:   request queue the BIO is aimed at
 * @bio: BIO to queue
 *
 * Always returns 0, which ends the loop in __generic_make_request().
 */
static int __make_request(struct request_queue *q, struct bio *bio)
{
struct request *req;
int el_ret;
unsigned int bytes = bio->bi_size;
const unsigned short prio = bio_prio(bio);
const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
int rw_flags;
/* Barrier BIO on a queue without ordering support: fail it. */
if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
(q->next_ordered == QUEUE_ORDERED_NONE)) {
bio_endio(bio, -EOPNOTSUPP);
return 0;
}
/* bio is passed by address, so blk_queue_bounce() may replace it. */
blk_queue_bounce(q, &bio);
spin_lock_irq(q->queue_lock);
/* Barriers, and any BIO on an empty elevator, skip the merge attempt. */
if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
goto get_rq;
el_ret = elv_merge(q, &req, bio);
switch (el_ret) {
/* Append the BIO to the tail of the request's BIO chain. */
case ELEVATOR_BACK_MERGE:
BUG_ON(!rq_mergeable(req));
if (!ll_back_merge_fn(q, req, bio))
break;
trace_block_bio_backmerge(q, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bytes;
req->ioprio = ioprio_best(req->ioprio, prio);
if (!blk_rq_cpu_valid(req))
req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out;
/* Prepend the BIO: the request now starts at the BIO's sector. */
case ELEVATOR_FRONT_MERGE:
BUG_ON(!rq_mergeable(req));
if (!ll_front_merge_fn(q, req, bio))
break;
trace_block_bio_frontmerge(q, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
blk_rq_set_mixed_merge(req);
req->cmd_flags &= ~REQ_FAILFAST_MASK;
req->cmd_flags |= ff;
}
bio->bi_next = req->bio;
req->bio = bio;
req->buffer = bio_data(bio);
req->__sector = bio->bi_sector;
req->__data_len += bytes;
req->ioprio = ioprio_best(req->ioprio, prio);
if (!blk_rq_cpu_valid(req))
req->cpu = bio->bi_comp_cpu;
drive_stat_acct(req, 0);
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out;
/* No merge possible, or the low-level merge check refused: fall to get_rq. */
default:
;
}
get_rq:
rw_flags = bio_data_dir(bio);
if (sync)
rw_flags |= REQ_RW_SYNC;
/*
 * NOTE(review): the queue lock is taken again right after this call, so
 * get_request_wait() presumably returns with it released -- confirm.
 */
req = get_request_wait(q, rw_flags, bio);
init_request_from_bio(req, bio);
spin_lock_irq(q->queue_lock);
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
bio_flagged(bio, BIO_CPU_AFFINE))
req->cpu = blk_cpu_to_group(smp_processor_id());
/* Plug the queue when the first request arrives on an empty elevator. */
if (queue_should_plug(q) && elv_queue_empty(q))
blk_plug_device(q);
add_request(q, req);
out:
/* Dispatch at once if the BIO asked for unplug or the queue does not plug. */
if (unplug || !queue_should_plug(q))
__generic_unplug_device(q);
spin_unlock_irq(q->queue_lock);
return 0;
}
特别关注add_request和__generic_unplug_device
add_request将会执行电梯调度算法中的具体流程
block/blk-core.c
/*
 * Account a new request and pass it to the elevator for sorted insertion.
 *
 * @q:   queue receiving the request
 * @req: request to add
 */
static inline void add_request(struct request_queue *q, struct request *req)
{
	const int where = ELEVATOR_INSERT_SORT;
	const int plug = 0;	/* __make_request() has already handled plugging */

	drive_stat_acct(req, 1);
	__elv_add_request(q, req, where, plug);
}
block/elevator.c
/*
 * Choose the final insertion point for a request and call elv_insert().
 *
 * @q:     target queue
 * @rq:    request to insert
 * @where: requested ELEVATOR_INSERT_* position. SORT is changed to BACK for
 *         barrier requests and for requests without REQ_ELVPRIV.
 * @plug:  non-zero to plug the queue before inserting
 */
void __elv_add_request(struct request_queue *q, struct request *rq, int where,
		       int plug)
{
	if (q->ordcolor)
		rq->cmd_flags |= REQ_ORDERED_COLOR;

	if (!(rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER))) {
		/* Ordinary request: sort only if it has REQ_ELVPRIV set. */
		if (where == ELEVATOR_INSERT_SORT &&
		    !(rq->cmd_flags & REQ_ELVPRIV))
			where = ELEVATOR_INSERT_BACK;
	} else {
		/* Barrier: flip the ordering colour for real barrier requests. */
		if (blk_barrier_rq(rq))
			q->ordcolor ^= 1;

		/* Barriers are never sorted; they go to the back. */
		if (where == ELEVATOR_INSERT_SORT)
			where = ELEVATOR_INSERT_BACK;

		/* Record this request as the queue's boundary. */
		if (blk_fs_request(rq) || blk_discard_rq(rq)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	}

	if (plug)
		blk_plug_device(q);

	elv_insert(q, rq, where);
}
block/elevator.c
/*
 * Insert a request into the queue at the requested position.
 *
 * @q:     target queue
 * @rq:    request to insert; rq->q is set to @q
 * @where: one of
 *         ELEVATOR_INSERT_FRONT   - head of the dispatch list
 *         ELEVATOR_INSERT_BACK    - tail of the dispatch list, after
 *                                   draining the elevator; the queue is
 *                                   run immediately
 *         ELEVATOR_INSERT_SORT    - handed to the I/O scheduler's
 *                                   elevator_add_req_fn
 *         ELEVATOR_INSERT_REQUEUE - put back on the dispatch list, placed
 *                                   by ordered sequence when an ordered
 *                                   sequence is in progress
 *         Any other value is a bug.
 *
 * After insertion, except for a requeue, a plugged queue is unplugged once
 * the number of allocated but not in-flight requests reaches
 * q->unplug_thresh.
 */
void elv_insert(struct request_queue *q, struct request *rq, int where)
{
	struct list_head *pos;
	unsigned ordseq;
	int unplug_it = 1;

	trace_block_rq_insert(q, rq);

	rq->q = q;

	switch (where) {
	case ELEVATOR_INSERT_FRONT:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		elv_drain_elevator(q);
		list_add_tail(&rq->queuelist, &q->queue_head);
		__blk_run_queue(q);
		break;

	case ELEVATOR_INSERT_SORT:
		BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
		rq->cmd_flags |= REQ_SORTED;
		q->nr_sorted++;
		if (rq_mergeable(rq)) {
			/* Make the request findable by later merge attempts. */
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
		q->elevator->ops->elevator_add_req_fn(q, rq);
		break;

	case ELEVATOR_INSERT_REQUEUE:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		/* A requeue never triggers the threshold unplug below. */
		unplug_it = 0;

		if (q->ordseq == 0) {
			list_add(&rq->queuelist, &q->queue_head);
			break;
		}

		/* Insert before the first entry with an equal or later sequence. */
		ordseq = blk_ordered_req_seq(rq);
		list_for_each(pos, &q->queue_head) {
			struct request *pos_rq = list_entry_rq(pos);

			if (ordseq <= blk_ordered_req_seq(pos_rq))
				break;
		}
		list_add_tail(&rq->queuelist, pos);
		break;

	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __func__, where);
		BUG();
	}

	if (unplug_it && blk_queue_plugged(q)) {
		int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
				- queue_in_flight(q);

		/* Fix: the function is __generic_unplug_device (two underscores). */
		if (nrq >= q->unplug_thresh)
			__generic_unplug_device(q);
	}
}
从io调度层取出request是__generic_unplug_device完成
block/blk-core.c
/*
 * Remove the plug from a queue and call the driver's request function.
 *
 * Does nothing if the queue is stopped. Otherwise request_fn is called when
 * a plug was actually removed, or when the queue is flagged non-rotational.
 *
 * @q: queue to unplug
 */
void __generic_unplug_device(struct request_queue *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;

	/* blk_remove_plug() runs first; the nonrot test only if it returned 0. */
	if (blk_remove_plug(q) || blk_queue_nonrot(q))
		q->request_fn(q);	/* device-specific function, e.g. a SCSI device */
}
request_fn是特定的设备函数,例如scsi,它将会通过scsi_dispatch_cmd将scsi指令发送到设备
那么怎么知道io请求已经完成了呢?
硬件驱动中也会提供io complete的函数,它们最终都会执行blk_complete_request
block/blk-softirq.c
/*
 * Driver-facing completion entry point.
 *
 * The request is passed on to __blk_complete_request() only when
 * blk_mark_rq_complete() returns 0. If the fake-timeout check fires for the
 * queue, the completion is dropped.
 *
 * @req: request the driver has finished
 */
void blk_complete_request(struct request *req)
{
	if (unlikely(blk_should_fake_timeout(req->q)))
		return;

	if (blk_mark_rq_complete(req))
		return;

	__blk_complete_request(req);
}
block/blk-softirq.c
/*
 * Queue a finished request for BLOCK_SOFTIRQ processing.
 *
 * Runs with local interrupts disabled. The completion CPU is req->cpu when
 * the queue has QUEUE_FLAG_SAME_COMP set and req->cpu is valid; otherwise
 * it is the current CPU. If the completion CPU is the current CPU or its
 * group CPU, or raise_blk_irq() returns non-zero, the request is added to
 * this CPU's blk_cpu_done list. The softirq is raised only when the request
 * is the first entry on that list.
 *
 * @req: completed request; its queue must have a softirq_done_fn
 */
void __blk_complete_request(struct request *req)
{
	struct request_queue *q = req->q;
	unsigned long flags;
	int ccpu, cpu, group_cpu;

	BUG_ON(!q->softirq_done_fn);

	local_irq_save(flags);
	cpu = smp_processor_id();
	group_cpu = blk_cpu_to_group(cpu);

	/*
	 * Select completion CPU
	 */
	ccpu = (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) &&
		req->cpu != -1) ? req->cpu : cpu;

	/*
	 * raise_blk_irq() is only tried when the completion CPU is remote.
	 * NOTE(review): a non-zero return presumably means the remote
	 * hand-off did not happen -- confirm.
	 */
	if (ccpu == cpu || ccpu == group_cpu || raise_blk_irq(ccpu, req)) {
		struct list_head *done = &__get_cpu_var(blk_cpu_done);

		list_add_tail(&req->csd.list, done);

		/* List was empty before this add: raise the softirq now. */
		if (done->next == &req->csd.list)
			raise_softirq_irqoff(BLOCK_SOFTIRQ);
	}

	local_irq_restore(flags);
}
blk_complete_request是硬件中断上下文的最后一个函数,它把最后io完成后需要完成的工作交给了软中断BLOCK_SOFTIRQ
在将IO完成的后续工作交给软中断BLOCK_SOFTIRQ之后,完成通知逐层向上传递,到达bio层时将会执行mpage_bio_submit中注册的bio的end_io,即mpage_end_io_read/mpage_end_io_write