io调度框架
整体流程
细节展开
io的提交过程
从pagecache到bio到requst
我们从read流程和write流程中都知道,最后都会到readpages或者writepages中来
我们就从readpage开始分析,write的基本上和《Linux-块设备驱动之框架详细分析(详解)》讲的差不多,
也就是会到ll_rw_block函数中,至于怎么到这个函数中,就不求甚解了。
我们来看readpage流程。
static int read_page
s(struct address_space *mapping, struct file *filp,
struct list_head *pages, unsigned nr_pages)
{
struct blk_plug plug;
unsigned page_idx;
int ret;
blk_start_plug(&plug);
if (mapping->a_ops->readpages) {
put_pages_list(pages);
goto out;
}
for (page_idx = 0; page_idx < nr_pages; page_idx++) {
struct page *page = list_to_page(pages);
list_del(&page->lru);
if (!add_to_page_cache_lru(page, mapping, page->index,
mapping_gfp_constraint(mapping, GFP_KERNEL))) {
mapping->a_ops->readpage(filp, page);
}
page_cache_release(page);
}
ret = 0;
out:
blk_finish_plug(&plug);
return ret;
}
假设我们读取的文件还没有在lru中。就一定会走mapping->a_ops->readpage
我们假设是ext4文件系统。
static const
struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
.readpages = ext4_readpages,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
.write_end = ext4_write_end,
.bmap = ext4_bmap,
.invalidatepage = ext4_invalidatepage,
.releasepage = ext4_releasepage,
.direct_IO = ext4_direct_IO,
.migratepage = buffer_migrate_page,
.is_partially_uptodate = block_is_partially_uptodate,
.error_remove_page = generic_error_remove_page,
};
static int
ext4_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
struct inode *inode = mapping->host;
if (ext4_has_inline_data(inode))
return 0;
return ext4_mpage_readpages(mapping, pages, NULL, nr_pages);
}
没有做什么,就是你要多少pages,就转到ext4_mpage_readpages中来。
int ext4_mpage_readpages(struct address_space *mapping,
struct list_head *pages, struct page *page,
unsigned nr_pages)
{
struct bio *bio = NULL;
unsigned page_idx;
sector_t last_block_in_bio = 0;
struct inode *inode = mapping->host;
const unsigned blkbits = inode->i_blkbits;
const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
const unsigned blocksize = 1 << blkbits;
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
sector_t blocks[MAX_BUF_PER_PAGE];
unsigned page_block;
struct block_device *bdev = inode->i_sb->s_bdev;
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
map.m_pblk = 0;
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
for (page_idx = 0; nr_pages; page_idx++, nr_pages--) {
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
prefetchw(&page->flags);
if (pages) {
page = list_entry(pages->prev, struct page, lru);
list_del(&page->lru);
if (add_to_page_cache_lru(page, mapping, page->index,
mapping_gfp_constraint(mapping, GFP_KERNEL)))
goto next_page;
}
if (page_has_buffers(page))
goto confused;
block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
if (last_block > last_block_in_file)
last_block = last_block_in_file;
page_block = 0;
/*
* Map blocks using the previous result first.
*/
if ((map.m_flags & EXT4_MAP_MAPPED) &&
block_in_file > map.m_lblk &&
block_in_file < (map.m_lblk + map.m_len)) {
unsigned map_offset = block_in_file - map.m_lblk;
unsigned last = map.m_len - map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
}
if (page_block == blocks_per_page)
break;
blocks[page_block] = map.m_pblk + map_offset +
relative_block;
page_block++;
block_in_file++;
}
}
/*
* Then do more ext4_map_blocks() calls until we are
* done with this page.
*/
while (page_block < blocks_per_page) {
if (block_in_file < last_block) {
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
if (ext4_map_blocks(NULL, inode, &map, 0) < 0) {
set_error_page:
SetPageError(page);
zero_user_segment(page, 0,
PAGE_CACHE_SIZE);
unlock_page(page);
goto next_page;
}
}
if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
fully_mapped = 0;
if (first_hole == blocks_per_page)
first_hole = page_block;
page_block++;
block_in_file++;
continue;
}
if (first_hole != blocks_per_page)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
if (page_block && blocks[page_block-1] != map.m_pblk-1)
goto confused;
for (relative_block = 0; ; relative_block++) {
if (relative_block == map.m_len) {
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
} else if (page_block == blocks_per_page)
break;
blocks[page_block] = map.m_pblk+relative_block;
page_block++;
block_in_file++;
}
}
if (first_hole != blocks_per_page) {
zero_user_segment(page, first_hole << blkbits,
PAGE_CACHE_SIZE);
if (first_hole == 0) {
SetPageUptodate(page);
unlock_page(page);
goto next_page;
}
} else if (fully_mapped) {
SetPageMappedToDisk(page);
}
if (fully_mapped && blocks_per_page == 1 &&
!PageUptodate(page) && cleancache_get_page(page) == 0) {
SetPageUptodate(page);
goto confused;
}
/*
* This page will go to BIO. Do we need to send this
* BIO off first?
*/
if (bio && (last_block_in_bio != blocks[0] - 1)) {
submit_and_realloc:
ext4_submit_bio_read(bio);
bio = NULL;
}
if (bio == NULL) {
struct ext4_crypto_ctx *ctx = NULL;
if (ext4_encrypted_inode(inode) &&
S_ISREG(inode->i_mode)) {
ctx = ext4_get_crypto_ctx(inode);
if (IS_ERR(ctx))
goto set_error_page;
}
bio = bio_alloc(GFP_KERNEL,
min_t(int, nr_pages, BIO_MAX_PAGES));
if (!bio) {
if (ctx)
ext4_release_crypto_ctx(ctx);
goto set_error_page;
}
bio->bi_bdev = bdev;
bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
bio->bi_end_io = mpage_end_io;
bio->bi_private = ctx;
}
length = first_hole << blkbits;
if (bio_add_page(bio, page, length, 0) < length)
goto submit_and_realloc;
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
(first_hole != blocks_per_page)) {
ext4_submit_bio_read(bio);
bio = NULL;
} else
last_block_in_bio = blocks[blocks_per_page - 1];
goto next_page;
confused:
if (bio) {
ext4_submit_bio_read(bio);
bio = NULL;
}
if (!PageUptodate(page))
block_read_full_page(page, ext4_get_block);
else
unlock_page(page);
next_page:
if (pages)
page_cache_release(page);
}
BUG_ON(pages && !list_empty(pages));
if (bio)
ext4_submit_bio_read(bio);
return 0;
}
1、前面是做一些结构体的初始化
2、后面就是for循环遍历每个page
3、然后就是说对建立一个bio结构体
4、然后对bio结构体的数据进行初始化
5、建立page和bio的关联。bio_add_page
6、然后是提交这个bio去申请数据ext4_submit_bio_read
这里我们要注意关联的部分
int bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
struct bio_vec *bv;
/*
* cloned bio must not modify vec list
*/
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
if (bio->bi_vcnt > 0) {
bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page == bv->bv_page &&
offset == bv->bv_offset + bv->bv_len) {
bv->bv_len += len;
goto done;
}
}
if (bio->bi_vcnt >= bio->bi_max_vecs)
return 0;
bv = &bio->bi_io_vec[bio->bi_vcnt];
bv->bv_page = page;
bv->bv_len = len;
bv->bv_offset = offset;
bio->bi_vcnt++;
done:
bio->bi_iter.bi_size += len;
return len;
}
关联也很简单就是数据指针。我们通过之前的《2.Linux 块设备驱动代码编写—简单驱动》这个程序里,
pRHdata = pdev->data + (bio->bi_sector * RAMHD_SECTOR_SIZE);
bio_for_each_segment(bvec, bio, i){
pBuffer = kmap(bvec->bv_page) + bvec->bv_offset;
switch(bio_data_dir(bio)){
case READ:
memcpy(pBuffer, pRHdata, bvec->bv_len);
flush_dcache_page(bvec->bv_page);
break;
我们知道,到了驱动,它会把磁盘或者块设备中数据拷贝到bv_page,而这个指针就是我们前面关联的。
另外一种方式的《4.Linux-块设备驱动(详解)》是只有requst的,其实也是一样的,只是封装格式不同而已。
至于最后的
然后是提交这个bio去申请数据ext4_submit_bio_read
最后会到submit_bio这个函数,后面我们会分析。
但是这里是有疑问的,这里每个page进行一次submit_bio,这个就不对了把,每个bio是有链表,很明显这里没有连接在一块。
其实应该猜到了,会在submit_bio 的过程中连接在一块的。
submit_bio
static void
ext4_submit_bio_read(struct bio *bio)
{
if (trace_android_fs_dataread_start_enabled()) {
struct page *first_page = bio->bi_io_vec[0].bv_page;
if (first_page != NULL) {
trace_android_fs_dataread_start(
first_page->mapping->host,
page_offset(first_page),
bio->bi_iter.bi_size,
current->pid,
current->comm);
}
}
submit_bio(READ, bio);
}
上次我们分析到这里ext4_submit_bio_read
后面就到了submit_bio.
block/blk-core.c
blk_qc_t submit_bio(int rw, struct bio *bio)
{
bio->bi_rw |= rw;
if (bio_has_data(bio)) {
unsigned int count;
if (unlikely(rw & REQ_WRITE_SAME))
count = bdev_logical_block_size(bio->bi_bdev) >> 9;
else
count = bio_sectors(bio);
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
mtk_btag_pidlog_submit_bio(bio);
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
current->comm, task_pid_nr(current),
(rw & WRITE) ? "WRITE" : "READ",
(unsigned long long)bio->bi_iter.bi_sector,
bdevname(bio->bi_bdev, b),
count);
}
}
return generic_make_request(bio);
}
这里就直接到generic_make_requst,
blk_qc_t generic_make_request(struct bio *bio)
{
struct bio_list bio_list_on_stack;
blk_qc_t ret = BLK_QC_T_NONE;
if (!generic_make_request_checks(bio))
goto out;
if (current->bio_list) {
bio_list_add(current->bio_list, bio);
goto out;
}
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack);
current->bio_list = &bio_list_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (likely(blk_queue_enter(q, __GFP_DIRECT_RECLAIM) == 0)) {
ret = q->make_request_fn(q, bio);
blk_queue_exit(q);
bio = bio_list_pop(current->bio_list);
} else {
struct bio *bio_next = bio_list_pop(current->bio_list);
bio_io_error(bio);
bio = bio_next;
}
} while (bio);
current->bio_list = NULL; /* deactivate */
out:
return ret;
}
这里面很有意思的就是有个current->bio_list ,类似上篇疑问的解答。