文件系统中数据读取的详细过程

一直在困惑文件系统sys_read、bio、io调度、硬中断、软中断、io完成通知之间的过程是怎么样的,通过代码的跟踪大致明白过程了

内核态的系统调用与bio的交界处的函数mpage_bio_submit(fs/mpage.c)

bio与io调度的交界处的函数__make_request(block/blk-core.c)

io调度与驱动层的交界函数__generic_unplug_device(block/blk-core.c)

io的返回路径

    在驱动层的io完成之后,注册的设备完成方法中(例如:scsi_done,__scsi_done)中则会执行blk_complete_request

     blk_complete_request是硬件的设备驱动的硬件中断上下文的最后一个函数了

     blk_complete_request将会启动软中断BLOCK_SOFTIRQ

     这个过程向上的通知的过程到达bio层的时候,调用的函数就是 mpage_bio_submit中注册的bio的end_io(mpage_end_io_read| mpage_end_io_write)


代码流程

Fs/mpage.c
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
		sector_t *last_block_in_bio, struct buffer_head *map_bh,
		unsigned long *first_logical_block, get_block_t get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];
	unsigned page_block;
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	unsigned nblocks;
	unsigned relative_block;

	if (page_has_buffers(page))
		goto confused;

	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	last_block = block_in_file + nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;
	nblocks = map_bh->b_size >> blkbits;
	if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
			block_in_file < (*first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - *first_logical_block;
		unsigned last = nblocks - map_offset;

		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);
				break;
			}
			if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr + map_offset +
						relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	
	map_bh->b_page = page;
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;
			if (get_block(inode, block_in_file, map_bh, 0))
				goto confused;
			*first_logical_block = block_in_file;
		}

		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;
			page_block++;
			block_in_file++;
			continue;
		}

		if (buffer_uptodate(map_bh)) {
			map_buffer_to_page(page, map_bh, page_block);
			goto confused;
		}
	
		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole */

		
		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
			goto confused;
		nblocks = map_bh->b_size >> blkbits;
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr+relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	if (first_hole != blocks_per_page) {
		zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
		if (first_hole == 0) {
			SetPageUptodate(page);
			unlock_page(page);
			goto out;
		}
	} else if (fully_mapped) {
		SetPageMappedToDisk(page);
	}

	/*
	 * This page will go to BIO.  Do we need to send this BIO off first?
	 */
	if (bio && (*last_block_in_bio != blocks[0] - 1))
		bio = mpage_bio_submit(READ, bio);

alloc_new:
	if (bio == NULL) {
		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
			  	min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
				GFP_KERNEL);
		if (bio == NULL)
			goto confused;
	}

	length = first_hole << blkbits;
	if (bio_add_page(bio, page, length, 0) < length) {
		bio = mpage_bio_submit(READ, bio);
		goto alloc_new;
	}

	relative_block = block_in_file - *first_logical_block;
	nblocks = map_bh->b_size >> blkbits;
	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
	    (first_hole != blocks_per_page))
		bio = mpage_bio_submit(READ, bio);
	else
		*last_block_in_bio = blocks[blocks_per_page - 1];
out:
	return bio;

confused:
	if (bio)
		bio = mpage_bio_submit(READ, bio);
	if (!PageUptodate(page))
	        block_read_full_page(page, get_block);
	else
		unlock_page(page);
	goto out;
}

do_mpage_readpage主要是将page转换为bio

do_mpage_readpage中重点关注mpage_bio_submit和block_read_full_page
Fs/buffer.c
int block_read_full_page(struct page *page, get_block_t *get_block)
{
	struct inode *inode = page->mapping->host;
	sector_t iblock, lblock;
	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
	unsigned int blocksize;
	int nr, i;
	int fully_mapped = 1;

	BUG_ON(!PageLocked(page));
	blocksize = 1 << inode->i_blkbits;
	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);
	head = page_buffers(page);

	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
	bh = head;
	nr = 0;
	i = 0;

	do {
		if (buffer_uptodate(bh))
			continue;

		if (!buffer_mapped(bh)) {
			int err = 0;

			fully_mapped = 0;
			if (iblock < lblock) {
				WARN_ON(bh->b_size != blocksize);
				err = get_block(inode, iblock, bh, 0);
				if (err)
					SetPageError(page);
			}
			if (!buffer_mapped(bh)) {
				zero_user(page, i * blocksize, blocksize);
				if (!err)
					set_buffer_uptodate(bh);
				continue;
			}
	
			if (buffer_uptodate(bh))
				continue;
		}
		arr[nr++] = bh;
	} while (i++, iblock++, (bh = bh->b_this_page) != head);

	if (fully_mapped)
		SetPageMappedToDisk(page);

	if (!nr) {
		if (!PageError(page))
			SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}

	/* Stage two: lock the buffers */
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		lock_buffer(bh);
		mark_buffer_async_read(bh);
	}
	for (i = 0; i < nr; i++) {
		bh = arr[i];
		if (buffer_uptodate(bh))
			end_buffer_async_read(bh, 1);
		else
			submit_bh(READ, bh);
	}
	return 0;
}
重点关注submit_bh

Fs/buffer.c
int submit_bh(int rw, struct buffer_head * bh)
{
	struct bio *bio;
	int ret = 0;
	if (buffer_ordered(bh) && (rw & WRITE))
		rw |= WRITE_BARRIER;
	if (test_set_buffer_req(bh) && (rw & WRITE))
		clear_buffer_write_io_error(bh);
	bio = bio_alloc(GFP_NOIO, 1);
	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio->bi_bdev = bh->b_bdev;
	bio->bi_io_vec[0].bv_page = bh->b_page;
	bio->bi_io_vec[0].bv_len = bh->b_size;
	bio->bi_io_vec[0].bv_offset = bh_offset(bh);
	bio->bi_vcnt = 1;
	bio->bi_idx = 0;
	bio->bi_size = bh->b_size;
	bio->bi_end_io = end_bio_bh_io_sync;
	bio->bi_private = bh;
	bio_get(bio);
	submit_bio(rw, bio);
	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;
	bio_put(bio);
	return ret;
}
do_mpage_readpage中执行mpage_bio_submit

static struct bio *mpage_bio_submit(int rw,struct bio *bio)
{
       bio->bi_end_io= mpage_end_io_read;
       if(rw == WRITE)
              bio->bi_end_io= mpage_end_io_write;
       submit_bio(rw, bio);
       returnNULL;
}
Block/blk-core.c
void submit_bio(int rw, struct bio *bio)
{
	int count = bio_sectors(bio);
	bio->bi_rw |= rw;
	if (bio_has_data(bio)) {
		if (rw & WRITE) {
			count_vm_events(PGPGOUT, count);
		} else {
			task_io_account_read(bio->bi_size);
			count_vm_events(PGPGIN, count);
		}

		if (unlikely(block_dump)) {
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
			current->comm, task_pid_nr(current),
				(rw & WRITE) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_sector,
				bdevname(bio->bi_bdev, b));
		}
	}

	generic_make_request(bio);
}
generic_make_request将会把请求递交给io调度层

void generic_make_request(struct bio *bio)
{
	if (current->bio_tail) {
		/* make_request is active */
		*(current->bio_tail) = bio;
		bio->bi_next = NULL;
		current->bio_tail = &bio->bi_next;
		return;
	}
	BUG_ON(bio->bi_next);
	do {
		current->bio_list = bio->bi_next;
		if (bio->bi_next == NULL)
			current->bio_tail = ¤t->bio_list;
		else
			bio->bi_next = NULL;
		__generic_make_request(bio);
		bio = current->bio_list;
	} while (bio);
	current->bio_tail = NULL; /* deactivate */
}

static inline void __generic_make_request(struct bio *bio)
{
	struct request_queue *q;
	sector_t old_sector;
	int ret, nr_sectors = bio_sectors(bio);
	dev_t old_dev;
	int err = -EIO;
	might_sleep();
	if (bio_check_eod(bio, nr_sectors))
		goto end_io;
	old_sector = -1;
	old_dev = 0;
	do {
		char b[BDEVNAME_SIZE];
		q = bdev_get_queue(bio->bi_bdev);
		if (unlikely(!q)) {
			goto end_io;
		}

		if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
			     nr_sectors > queue_max_hw_sectors(q))) {
			goto end_io;
		}
		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
			goto end_io;
		if (should_fail_request(bio))
			goto end_io;
		blk_partition_remap(bio);
		if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
			goto end_io;
		if (old_sector != -1)
			trace_block_remap(q, bio, old_dev, old_sector);
		old_sector = bio->bi_sector;
		old_dev = bio->bi_bdev->bd_dev;
		if (bio_check_eod(bio, nr_sectors))
			goto end_io;
		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
		    !blk_queue_discard(q)) {
			err = -EOPNOTSUPP;
			goto end_io;
		}
		trace_block_bio_queue(q, bio);
		ret = q->make_request_fn(q, bio);
	} while (ret);
	return;
end_io:
	bio_endio(bio, err);
}

make_request_fn是何时指定的呢?

需要关注请求如何从page=>bh=>bio=>request=>elevator

void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
	q->nr_requests = BLKDEV_MAX_RQ;
	q->make_request_fn = mfn;
	blk_queue_dma_alignment(q, 511);
	blk_queue_congestion_threshold(q);
	q->nr_batching = BLK_BATCH_REQ;
	q->unplug_thresh = 4;		/* hmm */
	q->unplug_delay = (3 * HZ) / 1000;	/* 3 milliseconds */
	if (q->unplug_delay == 0)
		q->unplug_delay = 1;

	q->unplug_timer.function = blk_unplug_timeout;
	q->unplug_timer.data = (unsigned long)q;
	blk_set_default_limits(&q->limits);
	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
	if (!q->queue_lock)
		q->queue_lock = &q->__queue_lock;
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}

Block/blk-core.c
struct request_queue * blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
	struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);

	if (!q)
		return NULL;

	q->node = node_id;
	if (blk_init_free_list(q)) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	q->request_fn		= rfn;
	q->prep_rq_fn		= NULL;
	q->unplug_fn		= generic_unplug_device;
	q->queue_flags		= QUEUE_FLAG_DEFAULT;
	q->queue_lock		= lock;
	blk_queue_make_request(q, __make_request);
	q->sg_reserved_size = INT_MAX;
	if (!elevator_init(q, NULL)) {
		blk_queue_congestion_threshold(q);
		return q;
	}
	blk_put_queue(q);
	return NULL;
}

Block/blk-core.c
static int __make_request(struct request_queue *q, struct bio *bio)
{
	struct request *req;
	int el_ret;
	unsigned int bytes = bio->bi_size;
	const unsigned short prio = bio_prio(bio);
	const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
	int rw_flags;

	if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
		bio_endio(bio, -EOPNOTSUPP);
		return 0;
	}
	blk_queue_bounce(q, &bio);

	spin_lock_irq(q->queue_lock);

	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
		goto get_rq;

	el_ret = elv_merge(q, &req, bio);
	switch (el_ret) {
	case ELEVATOR_BACK_MERGE:
		BUG_ON(!rq_mergeable(req));

		if (!ll_back_merge_fn(q, req, bio))
			break;
		trace_block_bio_backmerge(q, bio);
		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
			blk_rq_set_mixed_merge(req);
		req->biotail->bi_next = bio;
		req->biotail = bio;
		req->__data_len += bytes;
		req->ioprio = ioprio_best(req->ioprio, prio);
		if (!blk_rq_cpu_valid(req))
			req->cpu = bio->bi_comp_cpu;
		drive_stat_acct(req, 0);
		if (!attempt_back_merge(q, req))
			elv_merged_request(q, req, el_ret);
		goto out;

	case ELEVATOR_FRONT_MERGE:
		BUG_ON(!rq_mergeable(req));
		if (!ll_front_merge_fn(q, req, bio))
			break;
		trace_block_bio_frontmerge(q, bio);
		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
			blk_rq_set_mixed_merge(req);
			req->cmd_flags &= ~REQ_FAILFAST_MASK;
			req->cmd_flags |= ff;
		}
		bio->bi_next = req->bio;
		req->bio = bio;
		req->buffer = bio_data(bio);
		req->__sector = bio->bi_sector;
		req->__data_len += bytes;
		req->ioprio = ioprio_best(req->ioprio, prio);
		if (!blk_rq_cpu_valid(req))
			req->cpu = bio->bi_comp_cpu;
		drive_stat_acct(req, 0);
		if (!attempt_front_merge(q, req))
			elv_merged_request(q, req, el_ret);
		goto out;
	default:
		;
	}

get_rq:
	
	rw_flags = bio_data_dir(bio);
	if (sync)
		rw_flags |= REQ_RW_SYNC;
	req = get_request_wait(q, rw_flags, bio);
	init_request_from_bio(req, bio);

	spin_lock_irq(q->queue_lock);
	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
	    bio_flagged(bio, BIO_CPU_AFFINE))
		req->cpu = blk_cpu_to_group(smp_processor_id());
	if (queue_should_plug(q) && elv_queue_empty(q))
		blk_plug_device(q);
	add_request(q, req);
out:
	if (unplug || !queue_should_plug(q))
		__generic_unplug_device(q);
	spin_unlock_irq(q->queue_lock);
	return 0;
}
特别关注add_request和__generic_unplug_device

add_request将会执行电梯调度算法中的具体流程

Block/blk-core.c
static inline void add_request(struct request_queue *q, struct request *req)
{
	drive_stat_acct(req, 1);
	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
}

Block/elevator.c
void __elv_add_request(struct request_queue *q, struct request *rq, int where,
		       int plug)
{
	if (q->ordcolor)
		rq->cmd_flags |= REQ_ORDERED_COLOR;

	if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
		if (blk_barrier_rq(rq))
			q->ordcolor ^= 1;
		if (where == ELEVATOR_INSERT_SORT)
			where = ELEVATOR_INSERT_BACK;
		if (blk_fs_request(rq) || blk_discard_rq(rq)) {
			q->end_sector = rq_end_sector(rq);
			q->boundary_rq = rq;
		}
	} else if (!(rq->cmd_flags & REQ_ELVPRIV) &&
		    where == ELEVATOR_INSERT_SORT)
		where = ELEVATOR_INSERT_BACK;

	if (plug)
		blk_plug_device(q);

	elv_insert(q, rq, where);
}

Block/elevator.c
void elv_insert(struct request_queue *q, struct request *rq, int where)
{
	struct list_head *pos;
	unsigned ordseq;
	int unplug_it = 1;
	trace_block_rq_insert(q, rq);
	rq->q = q;
	switch (where) {
	case ELEVATOR_INSERT_FRONT:
		rq->cmd_flags |= REQ_SOFTBARRIER;

		list_add(&rq->queuelist, &q->queue_head);
		break;

	case ELEVATOR_INSERT_BACK:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		elv_drain_elevator(q);
		list_add_tail(&rq->queuelist, &q->queue_head);
		__blk_run_queue(q);
		break;

	case ELEVATOR_INSERT_SORT:
		BUG_ON(!blk_fs_request(rq) && !blk_discard_rq(rq));
		rq->cmd_flags |= REQ_SORTED;
		q->nr_sorted++;
		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}
		q->elevator->ops->elevator_add_req_fn(q, rq);
		break;

	case ELEVATOR_INSERT_REQUEUE:
		rq->cmd_flags |= REQ_SOFTBARRIER;
		unplug_it = 0;
		if (q->ordseq == 0) {
			list_add(&rq->queuelist, &q->queue_head);
			break;
		}

		ordseq = blk_ordered_req_seq(rq);

		list_for_each(pos, &q->queue_head) {
			struct request *pos_rq = list_entry_rq(pos);
			if (ordseq <= blk_ordered_req_seq(pos_rq))
				break;
		}
		list_add_tail(&rq->queuelist, pos);
		break;
	default:
		printk(KERN_ERR "%s: bad insertion point %d\n",
		       __func__, where);
		BUG();
	}
	if (unplug_it && blk_queue_plugged(q)) {
		int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
				- queue_in_flight(q);
		if (nrq >= q->unplug_thresh)
			_generic_unplug_device(q);
	}
}

从io调度层取出request是__generic_unplug_device完成

Block/blk-core.c
void __generic_unplug_device(struct request_queue *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;
	if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
		return;

	q->request_fn(q);//设备函数,例如scsi设备
}
request_fn是特定的设备函数,类似scsi,它将会通过scsi_dispatch_cmd将scisi指令发送到设备

那么怎么知道io请求已经完成了呢?

硬件驱动中也会提供io complete的函数,它们最终都会执行blk_complete_request

Block/blk-softirq.c
void blk_complete_request(struct request *req)
{
	if (unlikely(blk_should_fake_timeout(req->q)))
		return;
	if (!blk_mark_rq_complete(req))
		__blk_complete_request(req);
}
Block/blk-softirq.c
void __blk_complete_request(struct request *req)
{
	struct request_queue *q = req->q;
	unsigned long flags;
	int ccpu, cpu, group_cpu;
	BUG_ON(!q->softirq_done_fn);

	local_irq_save(flags);
	cpu = smp_processor_id();
	group_cpu = blk_cpu_to_group(cpu);

	/*
	 * Select completion CPU
	 */
	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && req->cpu != -1)
		ccpu = req->cpu;
	else
		ccpu = cpu;

	if (ccpu == cpu || ccpu == group_cpu) {
		struct list_head *list;
do_local:
		list = &__get_cpu_var(blk_cpu_done);
		list_add_tail(&req->csd.list, list);

		if (list->next == &req->csd.list)
			raise_softirq_irqoff(BLOCK_SOFTIRQ);
	} else if (raise_blk_irq(ccpu, req))
		goto do_local;

	local_irq_restore(flags);
}
blk_complete_request是硬件中断上下文的最后一个函数,它把最后io完成后需要完成的工作交给了软中断BLOCK_SOFTIRQ

在将IO请求交给软中断处理后,驱动层完成处理后,将会执行mpage_bio_submit中注册的bio的end_io,它注册为mpage_end_io_read/ mpage_end_io_write













   


展开阅读全文

没有更多推荐了,返回首页