Direct I/O and Buffered I/O
See https://blog.csdn.net/leacock1991/article/details/108035136 for background.
For the ext4 filesystem, the write path ends up in ext4_file_write_iter, which splits the I/O into two cases:
- Direct I/O
It ends up calling generic_file_direct_write, which invokes mapping->a_ops->direct_IO; for ext4 that is ext4_direct_IO, which writes the data down to the device layer (see the userspace sketch after this list for how the two cases are triggered from an application).
- Buffered I/O
The data is only copied from the application into the page cache; no real I/O is performed at this point. The whole page, or part of it, is merely marked dirty. The actual write is triggered later by a timer, and only then is wb_workfn called to write the pages to disk.
- The call chain after wb_workfn
wb_workfn->wb_do_writeback->wb_writeback->writeback_sb_inodes->__writeback_single_inode->do_writepages. In do_writepages, mapping->a_ops->writepages is called, which for ext4 is ext4_writepages; it writes the data down to the device layer.
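To make the two paths concrete, here is a minimal userspace sketch contrasting them (the file path /tmp/testfile, the 512-byte alignment and the 4096-byte buffer size are illustrative assumptions, not part of the kernel code discussed below): a plain open()/write() goes through the page cache and is written back later by wb_workfn, while opening with O_DIRECT sends the write down the ext4_direct_IO path.
#define _GNU_SOURCE            /* needed for O_DIRECT on Linux */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Buffered I/O: the data only lands in the page cache and the page is
         * marked dirty; wb_workfn writes it to disk later. */
        int fd = open("/tmp/testfile", O_WRONLY | O_CREAT, 0644);
        write(fd, "hello", 5);
        close(fd);

        /* Direct I/O: the buffer must be suitably aligned (512 bytes assumed
         * here); the write bypasses the page cache and reaches the device via
         * ext4_direct_IO. */
        void *buf;
        posix_memalign(&buf, 512, 4096);
        memset(buf, 'a', 4096);
        int dfd = open("/tmp/testfile", O_WRONLY | O_DIRECT);
        write(dfd, buf, 4096);
        close(dfd);
        free(buf);
        return 0;
}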
How does direct I/O access the block device?
Direct I/O ends up calling ext4_direct_IO.
The ext4_direct_IO and ext4_direct_IO_write functions
\linux-4.13.16\fs\ext4\inode.c
static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
ssize_t ret;
......
ret = ext4_direct_IO_write(iocb, iter);
......
}
static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
struct ext4_inode_info *ei = EXT4_I(inode);
ssize_t ret;
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(iter);
......
ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter,
get_block_func, ext4_end_io_dio, NULL,
dio_flags);
......
}
In ext4_direct_IO_write, __blockdev_direct_IO is called with the argument inode->i_sb->s_bdev: through the current file's inode we reach the super_block, and the s_bdev field of that super_block is exactly the block_device that was filled in in https://blog.csdn.net/leacock1991/article/details/108308446 (spelled out in the sketch below).
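As a purely illustrative fragment (a hypothetical helper, not kernel code), the chain from an open file down to its backing block_device can be written out like this:
#include <linux/fs.h>

/* Illustrative helper (not in the kernel): follow the chain from an open
 * file down to the block_device that backs its filesystem. */
static struct block_device *file_to_bdev(struct file *file)
{
        struct inode *inode = file->f_mapping->host;    /* the file's inode */
        struct super_block *sb = inode->i_sb;           /* the mounted filesystem */
        return sb->s_bdev;                              /* filled in at mount time */
}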
__blockdev_direct_IO then calls do_blockdev_direct_IO.
The do_blockdev_direct_IO function
\linux-4.13.16\fs\direct-io.c
static inline ssize_t
do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
struct block_device *bdev, struct iov_iter *iter,
get_block_t get_block, dio_iodone_t end_io,
dio_submit_t submit_io, int flags)
{
unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
unsigned blkbits = i_blkbits;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
size_t count = iov_iter_count(iter);
loff_t offset = iocb->ki_pos;
loff_t end = offset + count;
struct dio *dio;
struct dio_submit sdio = { 0, };
struct buffer_head map_bh = { 0, };
......
dio = kmem_cache_alloc(dio_cache, GFP_KERNEL);
dio->flags = flags;
dio->i_size = i_size_read(inode);
dio->inode = inode;
if (iov_iter_rw(iter) == WRITE) {
dio->op = REQ_OP_WRITE;
dio->op_flags = REQ_SYNC | REQ_IDLE;
if (iocb->ki_flags & IOCB_NOWAIT)
dio->op_flags |= REQ_NOWAIT;
} else {
dio->op = REQ_OP_READ;
}
sdio.blkbits = blkbits;
sdio.blkfactor = i_blkbits - blkbits;
sdio.block_in_file = offset >> blkbits;
sdio.get_block = get_block;
dio->end_io = end_io;
sdio.submit_io = submit_io;
sdio.final_block_in_bio = -1;
sdio.next_block_for_io = -1;
dio->iocb = iocb;
dio->refcount = 1;
sdio.iter = iter;
sdio.final_block_in_request =
(offset + iov_iter_count(iter)) >> blkbits;
......
sdio.pages_in_io += iov_iter_npages(iter, INT_MAX);
retval = do_direct_IO(dio, &sdio, &map_bh);
.....
}
This function uses two structures, struct dio and struct dio_submit, to describe the write request that is about to take place.
The struct dio structure
\linux-4.13.16\fs\direct-io.c
/* dio_state communicated between submission path and end_io */
struct dio {
int flags; /* doesn't change */
int op;
int op_flags;
blk_qc_t bio_cookie;
struct block_device *bio_bdev;
struct inode *inode;
loff_t i_size; /* i_size when submitted */
dio_iodone_t *end_io; /* IO completion function */
void *private; /* copy from map_bh.b_private */
/* BIO completion state */
spinlock_t bio_lock; /* protects BIO fields below */
int page_errors; /* errno from get_user_pages() */
int is_async; /* is IO async ? */
bool defer_completion; /* defer AIO completion to workqueue? */
bool should_dirty; /* if pages should be dirtied */
int io_error; /* IO error in completion path */
unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
struct task_struct *waiter; /* waiting task (NULL if none) */
/* AIO related stuff */
struct kiocb *iocb; /* kiocb */
ssize_t result; /* IO result */
/*
* pages[] (and any fields placed after it) are not zeroed out at
* allocation time. Don't add new fields after pages[] unless you
* wish that they not be zeroed.
*/
union {
struct page *pages[DIO_PAGES]; /* page buffer */
struct work_struct complete_work;/* deferred AIO completion */
};
} ____cacheline_aligned_in_smp;
struct bio is the generic transfer object used to hand data to a block device.
The struct dio_submit structure
\linux-4.13.16\fs\direct-io.c
struct dio_submit {
struct bio *bio; /* bio under assembly */
unsigned blkbits; /* doesn't change */
unsigned blkfactor; /* When we're using an alignment which
is finer than the filesystem's soft
blocksize, this specifies how much
finer. blkfactor=2 means 1/4-block
alignment. Does not change */
unsigned start_zero_done; /* flag: sub-blocksize zeroing has
been performed at the start of a
write */
int pages_in_io; /* approximate total IO pages */
sector_t block_in_file; /* Current offset into the underlying
file in dio_block units. */
unsigned blocks_available; /* At block_in_file. changes */
int reap_counter; /* rate limit reaping */
sector_t final_block_in_request;/* doesn't change */
int boundary; /* prev block is at a boundary */
get_block_t *get_block; /* block mapping function */
dio_submit_t *submit_io; /* IO submition function */
loff_t logical_offset_in_bio; /* current first logical block in bio */
sector_t final_block_in_bio; /* current final block in bio + 1 */
sector_t next_block_for_io; /* next block to be put under IO,
in dio_blocks units */
/*
* Deferred addition of a page to the dio. These variables are
* private to dio_send_cur_page(), submit_page_section() and
* dio_bio_add_page().
*/
struct page *cur_page; /* The page */
unsigned cur_page_offset; /* Offset into it, in bytes */
unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
sector_t cur_page_block; /* Where it starts */
loff_t cur_page_fs_offset; /* Offset in file */
struct iov_iter *iter;
/*
* Page queue. These variables belong to dio_refill_pages() and
* dio_get_page().
*/
unsigned head; /* next page to process */
unsigned tail; /* last valid page + 1 */
size_t from, to;
};
do_blockdev_direct_IO then calls do_direct_IO.
The do_direct_IO function
\linux-4.13.16\fs\direct-io.c
static int do_direct_IO(struct dio *dio, struct dio_submit *sdio,
struct buffer_head *map_bh)
{
const unsigned blkbits = sdio->blkbits;
const unsigned i_blkbits = blkbits + sdio->blkfactor;
int ret = 0;
while (sdio->block_in_file < sdio->final_block_in_request) {
struct page *page;
size_t from, to;
page = dio_get_page(dio, sdio);
from = sdio->head ? 0 : sdio->from;
to = (sdio->head == sdio->tail - 1) ? sdio->to : PAGE_SIZE;
sdio->head++;
while (from < to) {
unsigned this_chunk_bytes; /* # of bytes mapped */
unsigned this_chunk_blocks; /* # of blocks */
......
ret = submit_page_section(dio, sdio, page,
from,
this_chunk_bytes,
sdio->next_block_for_io,
map_bh);
......
sdio->next_block_for_io += this_chunk_blocks;
sdio->block_in_file += this_chunk_blocks;
from += this_chunk_bytes;
dio->result += this_chunk_bytes;
sdio->blocks_available -= this_chunk_blocks;
if (sdio->block_in_file == sdio->final_block_in_request)
break;
......
}
}
}
do_direct_IO contains two nested loops:
- The outer loop processes, one by one, all the blocks to be written this time.
For each block it fetches the corresponding page in memory; within that page, from and to mark the start and end of the data to be written.
- The inner loop processes the data from from to to.
It calls submit_page_section to submit the data to the block device layer for writing.
submit_page_section calls dio_bio_submit, which in turn calls submit_bio to hand the data to the block device layer.
How does buffered I/O access the block device?
Buffered I/O ends up calling ext4_writepages.
The ext4_writepages function
\linux-4.13.16\fs\ext4\inode.c
static int ext4_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
......
struct mpage_da_data mpd;
struct inode *inode = mapping->host;
struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
......
mpd.do_map = 0;
mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
ret = mpage_prepare_extent_to_map(&mpd);
/* Submit prepared bio */
ext4_io_submit(&mpd.io_submit);
......
}
An important data structure here is struct mpage_da_data.
struct mpage_da_data
\linux-4.13.16\fs\ext4\inode.c
struct mpage_da_data {
struct inode *inode;
......
pgoff_t first_page; /* The first page to write */
pgoff_t next_page; /* Current page to examine */
pgoff_t last_page; /* Last page to examine */
struct ext4_map_blocks map;
struct ext4_io_submit io_submit; /* IO submission data */
unsigned int do_map:1;
};
struct ext4_io_submit {
......
struct bio *io_bio;
ext4_io_end_t *io_end;
sector_t io_next_block;
};
It contains the file's inode, the offsets of the pages to be written, and an important struct ext4_io_submit, which carries the generic transfer object bio (struct bio itself is shown later in this article).
In ext4_writepages, mpage_prepare_extent_to_map is used to initialize this struct mpage_da_data.
The call chain that follows is: mpage_prepare_extent_to_map->mpage_process_page_bufs->mpage_submit_page->ext4_bio_write_page->io_submit_add_bh.
The io_submit_add_bh function
\linux-4.13.16\fs\ext4\page-io.c
static int io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode,
struct page *page,
struct buffer_head *bh)
{
int ret;
if (io->io_bio && bh->b_blocknr != io->io_next_block) {
submit_and_retry:
ext4_io_submit(io);
}
if (io->io_bio == NULL) {
ret = io_submit_init_bio(io, bh);
if (ret)
return ret;
io->io_bio->bi_write_hint = inode->i_write_hint;
}
ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
wbc_account_io(io->io_wbc, page, bh->b_size);
io->io_next_block++;
return 0;
}
In io_submit_add_bh, the bio is still empty at this point, so io_submit_init_bio is called to initialize it.
The io_submit_init_bio function
\linux-4.13.16\fs\ext4\page-io.c
static int io_submit_init_bio(struct ext4_io_submit *io,
struct buffer_head *bh)
{
struct bio *bio;
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
if (!bio)
return -ENOMEM;
wbc_init_bio(io->io_wbc, bio);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio->bi_bdev = bh->b_bdev;
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
return 0;
}
Back in ext4_writepages: once the bio has been initialized, ext4_io_submit is called to submit the I/O.
The ext4_io_submit function
\linux-4.13.16\fs\ext4\page-io.c
void ext4_io_submit(struct ext4_io_submit *io)
{
struct bio *bio = io->io_bio;
if (bio) {
int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ?
REQ_SYNC : 0;
io->io_bio->bi_write_hint = io->io_end->inode->i_write_hint;
bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags);
submit_bio(io->io_bio);
}
io->io_bio = NULL;
}
ext4_io_submit again calls submit_bio to push the data to the block device layer. This mirrors the direct I/O path, where submit_page_section calls dio_bio_submit, which in turn calls submit_bio to submit the data to the block device layer.
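From userspace the buffered path can be exercised explicitly: fsync() forces writeback of the dirty pages immediately instead of waiting for the writeback timer, and on ext4 it goes through the same do_writepages/ext4_writepages path described above. A minimal sketch (the file path is an assumption):
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/testfile", O_WRONLY | O_CREAT, 0644);

        /* write() only copies the data into the page cache and dirties the page */
        write(fd, "buffered data\n", 14);

        /* fsync() forces the dirty pages out now; on ext4 this goes through
         * do_writepages -> ext4_writepages -> ... -> submit_bio */
        fsync(fd);

        close(fd);
        return 0;
}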
How is a request submitted to the block device layer?
Whether it is direct I/O or buffered I/O, everything ends up in submit_bio, and submit_bio eventually calls generic_make_request.
The generic_make_request function
The two most important pieces of logic in generic_make_request are: obtaining a request queue (request_queue), covered in this section, and calling that queue's make_request_fn function, covered in "Request submission and scheduling" below.
\linux-4.13.16\block\blk-core.c
blk_qc_t generic_make_request(struct bio *bio)
{
/*
* bio_list_on_stack[0] contains bios submitted by the current
* make_request_fn.
* bio_list_on_stack[1] contains bios that were submitted before
* the current make_request_fn, but that haven't been processed
* yet.
*/
struct bio_list bio_list_on_stack[2];
blk_qc_t ret = BLK_QC_T_NONE;
......
if (current->bio_list) {
bio_list_add(&current->bio_list[0], bio);
goto out;
}
bio_list_init(&bio_list_on_stack[0]);
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (likely(blk_queue_enter(q, bio->bi_opf & REQ_NOWAIT) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
ret = q->make_request_fn(q, bio);
blk_queue_exit(q);
/* sort new bios into those for a lower level
* and those for the same level
*/
bio_list_init(&lower);
bio_list_init(&same);
while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
if (q == bdev_get_queue(bio->bi_bdev))
bio_list_add(&same, bio);
else
bio_list_add(&lower, bio);
/* now assemble so we handle the lowest level first */
bio_list_merge(&bio_list_on_stack[0], &lower);
bio_list_merge(&bio_list_on_stack[0], &same);
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
}
......
bio = bio_list_pop(&bio_list_on_stack[0]);
} while (bio);
current->bio_list = NULL; /* deactivate */
out:
return ret;
}
Inside the do-while loop, a request queue (request_queue) is first obtained, and then that queue's make_request_fn function is called.
The request_queue structure
\linux-4.13.16\include\linux\blkdev.h
struct request_queue {
/*
* Together with queue_head for cacheline sharing
*/
struct list_head queue_head;
struct request *last_merge;
struct elevator_queue *elevator;
......
request_fn_proc *request_fn;
make_request_fn *make_request_fn;
......
}
From struct block_device and struct gendisk it can be seen that every block device has a request queue, struct request_queue, used to handle requests coming from the upper layers.
A request_queue is created when each block device's driver is initialized.
Its list_head holds the requests (struct request).
The request structure
\linux-4.13.16\include\linux\blkdev.h
struct request {
struct list_head queuelist;
......
struct request_queue *q;
......
struct bio *bio;
struct bio *biotail;
......
}
Each request contains a linked list of struct bio, with pointers to the head and tail of the list (bio and biotail).
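The kernel provides the __rq_for_each_bio macro (include/linux/blkdev.h) to walk exactly this chain, following bi_next from bio to biotail. As an illustrative sketch (a hypothetical helper, not from the article), a driver could count the bios in one request like this:
#include <linux/blkdev.h>

/* Hypothetical helper: count how many bios are chained in a single request
 * by following bi_next from rq->bio. */
static unsigned int count_bios(struct request *rq)
{
        struct bio *bio;
        unsigned int n = 0;

        __rq_for_each_bio(bio, rq)
                n++;
        return n;
}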
The struct bio structure
\linux-4.13.16\include\linux\blk_types.h
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
blk_status_t bi_status;
......
struct bvec_iter bi_iter;
unsigned short bi_vcnt; /* how many bio_vec's */
unsigned short bi_max_vecs; /* max bvl_vecs we can hold */
atomic_t __bi_cnt; /* pin count */
struct bio_vec *bi_io_vec; /* the actual vec list */
......
};
struct bio_vec {
struct page *bv_page;
unsigned int bv_len;
unsigned int bv_offset;
};
In a bio, bi_next points to the next bio in the list, and the struct bio_vec entries point to a set of pages.
(Figure from the Geek Time course 趣谈Linux操作系统)
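Correspondingly, bio_for_each_segment iterates over a bio's bio_vec entries using a bvec_iter cursor. As a sketch (again a hypothetical helper, not from the article), the total payload of a bio can be summed segment by segment; the result should match bio->bi_iter.bi_size.
#include <linux/bio.h>

/* Hypothetical helper: add up the bytes covered by each bio_vec segment.
 * bio_for_each_segment copies each segment into 'bv' and advances 'iter'
 * through bi_io_vec. */
static unsigned int bio_payload_bytes(struct bio *bio)
{
        struct bio_vec bv;
        struct bvec_iter iter;
        unsigned int bytes = 0;

        bio_for_each_segment(bv, bio, iter)
                bytes += bv.bv_len;
        return bytes;
}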
On the request queue (request_queue) there are also two important functions: make_request_fn, which generates requests, and request_fn, which processes them.
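The split between the two becomes clearer when looking at a bio-based driver that supplies its own make_request_fn and completes bios itself, so that request_fn and the elevator are never involved (ramdisk-style drivers such as brd work this way). A minimal sketch against the 4.13-era API, with all names hypothetical:
#include <linux/blkdev.h>
#include <linux/bio.h>

/* Hypothetical bio-based driver: bios coming out of generic_make_request are
 * completed directly here, so no request_fn / elevator is involved. */
static blk_qc_t demo_make_request(struct request_queue *q, struct bio *bio)
{
        /* A real driver would copy data between the bio's pages and its
         * backing store here (see drivers/block/brd.c); this sketch just
         * completes the bio successfully. */
        bio_endio(bio);
        return BLK_QC_T_NONE;
}

static struct request_queue *demo_queue;

static int demo_setup_queue(void)
{
        demo_queue = blk_alloc_queue(GFP_KERNEL);
        if (!demo_queue)
                return -ENOMEM;

        /* Install our own make_request_fn instead of the default blk_queue_bio. */
        blk_queue_make_request(demo_queue, demo_make_request);
        return 0;
}
Request-based drivers such as SCSI instead keep the default blk_queue_bio as their make_request_fn and provide a request_fn, which is what the SCSI example in the next section shows.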
Block device initialization
Taking the SCSI driver as an example: when the device driver is initialized, scsi_alloc_queue is called and sets request_fn to scsi_request_fn, and blk_init_allocated_queue->blk_queue_make_request sets make_request_fn to blk_queue_bio.
scsi_alloc_sdev calls scsi_alloc_queue.
The scsi_alloc_sdev function
\linux-4.13.16\drivers\scsi\scsi_scan.c
/**
* scsi_alloc_sdev - allocate and setup a scsi_Device
* @starget: which target to allocate a &scsi_device for
* @lun: which lun
* @hostdata: usually NULL and set by ->slave_alloc instead
*
* Description:
* Allocate, initialize for io, and return a pointer to a scsi_Device.
* Stores the @shost, @channel, @id, and @lun in the scsi_Device, and
* adds scsi_Device to the appropriate list.
*
* Return value:
* scsi_Device pointer, or NULL on failure.
**/
static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
u64 lun, void *hostdata)
{
struct scsi_device *sdev;
sdev = kzalloc(sizeof(*sdev) + shost->transportt->device_size,
GFP_ATOMIC);
......
sdev->request_queue = scsi_alloc_queue(sdev);
......
}
The scsi_alloc_queue function
\linux-4.13.16\drivers\scsi\scsi_lib.c
struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
{
struct Scsi_Host *shost = sdev->host;
struct request_queue *q;
q = blk_alloc_queue_node(GFP_KERNEL, NUMA_NO_NODE);
if (!q)
return NULL;
q->cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
q->rq_alloc_data = shost;
q->request_fn = scsi_request_fn;
q->init_rq_fn = scsi_init_rq;
q->exit_rq_fn = scsi_exit_rq;
q->initialize_rq_fn = scsi_initialize_rq;
// calls blk_queue_make_request(q, blk_queue_bio);
if (blk_init_allocated_queue(q) < 0) {
blk_cleanup_queue(q);
return NULL;
}
__scsi_init_queue(shost, q);
......
return q;
}
scsi_alloc_queue calls blk_init_allocated_queue.
The blk_init_allocated_queue function
\linux-4.13.16\block\blk-core.c
int blk_init_allocated_queue(struct request_queue *q)
{
q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size);
......
blk_queue_make_request(q, blk_queue_bio);
......
/* init elevator */
if (elevator_init(q, NULL)) {
......
}
......
}
Besides setting up the make_request_fn function, it also does something very important: it initializes the I/O elevator algorithm.
blk_init_allocated_queue calls blk_queue_make_request to set make_request_fn to blk_queue_bio.
The blk_queue_make_request function
\linux-4.13.16\block\blk-settings.c
void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
{
/*
* set defaults
*/
q->nr_requests = BLKDEV_MAX_RQ;
q->make_request_fn = mfn;
blk_queue_dma_alignment(q, 511);
blk_queue_congestion_threshold(q);
q->nr_batching = BLK_BATCH_REQ;
blk_set_default_limits(&q->limits);
}
A brief introduction to elevator algorithms
There are many types of elevator algorithms, each defined as an elevator_type.
The struct elevator_type structure
\linux-4.13.16\include\linux\elevator.h
/*
* identifies an elevator type, such as AS or deadline
*/
struct elevator_type
{
/* managed by elevator core */
struct kmem_cache *icq_cache;
/* fields provided by elevator implementation */
union {
struct elevator_ops sq;
struct elevator_mq_ops mq;
} ops;
size_t icq_size; /* see iocontext.h */
size_t icq_align; /* ditto */
struct elv_fs_entry *elevator_attrs;
char elevator_name[ELV_NAME_MAX];
struct module *elevator_owner;
bool uses_mq;
#ifdef CONFIG_BLK_DEBUG_FS
const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
const struct blk_mq_debugfs_attr *hctx_debugfs_attrs;
#endif
/* managed by elevator core */
char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname + "_io_cq" */
struct list_head list;
};
- struct elevator_type elevator_noop
Noop is the simplest I/O scheduling algorithm: it puts I/O requests into a FIFO queue and executes them one by one.
- struct elevator_type iosched_deadline
Deadline guarantees that every I/O request is served within a certain time, so that no request starves.
- struct elevator_type iosched_cfq
CFQ is the Completely Fair Queuing scheduler. Requests are sorted into multiple queues, and requests from the same process always go to the same queue. Time slices are allocated to the queues and, through round-robin, I/O bandwidth is shared fairly between them.
elevator_init selects the elevator algorithm by name; if none is chosen, iosched_cfq is used by default (the active scheduler can be inspected through sysfs, as shown below).
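Which elevator a queue is using can be inspected (and switched) at runtime through sysfs; writing another scheduler's name into the same file changes it. A small userspace sketch that prints the scheduler of one disk (the device name sda is an assumption):
#include <stdio.h>

int main(void)
{
        /* The active scheduler is shown in brackets, e.g. "noop deadline [cfq]". */
        FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");
        char line[256];

        if (f && fgets(line, sizeof(line), f))
                printf("%s", line);
        if (f)
                fclose(f);
        return 0;
}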
Request submission and scheduling
Back to the generic_make_request function.
Whether it is direct I/O or buffered I/O, everything ends up in submit_bio, and submit_bio calls generic_make_request.
Taking the SCSI driver as an example, blk_init_allocated_queue calls blk_queue_make_request to set make_request_fn to blk_queue_bio.
So when generic_make_request calls the queue's make_request_fn, it is in fact calling blk_queue_bio.
The blk_queue_bio function
\linux-4.13.16\block\blk-core.c
static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
{
struct request *req, *free;
unsigned int request_count = 0;
......
switch (elv_merge(q, &req, bio)) {
case ELEVATOR_BACK_MERGE:
if (!bio_attempt_back_merge(q, req, bio))
break;
elv_bio_merged(q, req, bio);
free = attempt_back_merge(q, req);
if (free)
__blk_put_request(q, free);
else
elv_merged_request(q, req, ELEVATOR_BACK_MERGE);
goto out_unlock;
case ELEVATOR_FRONT_MERGE:
if (!bio_attempt_front_merge(q, req, bio))
break;
elv_bio_merged(q, req, bio);
free = attempt_front_merge(q, req);
if (free)
__blk_put_request(q, free);
else
elv_merged_request(q, req, ELEVATOR_FRONT_MERGE);
goto out_unlock;
default:
break;
}
get_rq:
req = get_request(q, bio->bi_opf, bio, GFP_NOIO);
......
blk_init_request_from_bio(req, bio);
......
add_acct_request(q, req, where);
__blk_run_queue(q);
out_unlock:
......
return BLK_QC_T_NONE;
}
blk_queue_bio calls elv_merge to decide whether the current bio can be merged with an existing request.
If it can, the bio and that request become part of the same batch of I/O operations, which improves read and write performance.
The merge criteria are based on the struct bvec_iter member of struct bio, which holds two fields: the starting sector bi_sector and the size bi_size.
The elv_merge function
\linux-4.13.16\block\elevator.c
enum elv_merge elv_merge(struct request_queue *q, struct request **req,
struct bio *bio)
{
struct elevator_queue *e = q->elevator;
struct request *__rq;
......
if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
enum elv_merge ret = blk_try_merge(q->last_merge, bio);
if (ret != ELEVATOR_NO_MERGE) {
*req = q->last_merge;
return ret;
}
}
......
__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
if (__rq && elv_bio_merge_ok(__rq, bio)) {
*req = __rq;
return ELEVATOR_BACK_MERGE;
}
if (e->uses_mq && e->type->ops.mq.request_merge)
return e->type->ops.mq.request_merge(q, req, bio);
else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn)
return e->type->ops.sq.elevator_merge_fn(q, req, bio);
return ELEVATOR_NO_MERGE;
}
elv_merge attempts the merge in three steps:
- First, it checks whether the bio can be merged again with the request that was merged last time (q->last_merge), like trying to catch the elevator that is just about to leave.
In blk_try_merge the decision is essentially this:
If blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector, that is, the request's start address plus its size (in other words, the request's end address) lines up with the bio's start address, then the bio is appended to the end of the request: ELEVATOR_BACK_MERGE.
If blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector, that is, the request's start address minus the bio's size equals the bio's start address, then the bio fits directly in front of the request and is placed there: ELEVATOR_FRONT_MERGE. Otherwise no merge is performed: ELEVATOR_NO_MERGE. For example, if a request covers sectors 100-107 and the bio starts at sector 108, it is a back merge; if the bio covers sectors 92-99 and the request starts at sector 100, it is a front merge.
The blk_try_merge function
\linux-4.13.16\block\blk-merge.c
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
{
......
if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
return ELEVATOR_BACK_MERGE;
else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
return ELEVATOR_FRONT_MERGE;
return ELEVATOR_NO_MERGE;
}
- Second, if the bio cannot be merged with the last merged request, elv_rqhash_find is called to look up a request by the bio's start address and see whether one can be merged. If so, then because the lookup was by the bio's start address, the bio has to be appended behind that request, so it is ELEVATOR_BACK_MERGE.
- Third, elevator_merge_fn is called to attempt a merge. For iosched_cfq this is cfq_merge, in which cfq_find_rq_fmerge calls elv_rb_find with the bio's end address as the argument. If a mergeable request is found, then because the lookup was by the bio's end address, the bio has to go in front of that request, so it is ELEVATOR_FRONT_MERGE.
\linux-4.13.16\block\cfq-iosched.c
static enum elv_merge cfq_merge(struct request_queue *q, struct request **req,
struct bio *bio)
{
struct cfq_data *cfqd = q->elevator->elevator_data;
struct request *__rq;
__rq = cfq_find_rq_fmerge(cfqd, bio);
if (__rq && elv_bio_merge_ok(__rq, bio)) {
*req = __rq;
return ELEVATOR_FRONT_MERGE;
}
return ELEVATOR_NO_MERGE;
}
static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
{
struct task_struct *tsk = current;
struct cfq_io_cq *cic;
struct cfq_queue *cfqq;
cic = cfq_cic_lookup(cfqd, tsk->io_context);
if (!cic)
return NULL;
cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf));
if (cfqq)
return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
return NULL;
}
When elv_merge returns to blk_queue_bio, we know which type of merge should be performed, and the actual merge is then carried out.
If no merge is possible, get_request is called to create a new request, blk_init_request_from_bio puts the bio into the new request, and add_acct_request adds the new request to the request_queue.
This completes the two most important pieces of logic in generic_make_request: obtaining a request queue (request_queue) and calling that queue's make_request_fn function (request submission and scheduling).
Processing the requests
When the device driver actually writes to the device, it uses the request queue's other function, request_fn.
For SCSI devices this is scsi_request_fn.
The scsi_request_fn function
\linux-4.13.16\drivers\scsi\scsi_lib.c
static void scsi_request_fn(struct request_queue *q)
__releases(q->queue_lock)
__acquires(q->queue_lock)
{
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost;
struct scsi_cmnd *cmd;
struct request *req;
/*
* To start with, we keep looping until the queue is empty, or until
* the host is no longer able to accept any more requests.
*/
shost = sdev->host;
for (;;) {
int rtn;
/*
* get next queueable request. We do this early to make sure
* that the request is fully prepared even if we cannot
* accept it.
*/
req = blk_peek_request(q);
......
/*
* Remove the request from the request list.
*/
if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
blk_start_request(req);
.....
cmd = req->special;
......
/*
* Dispatch the command to the low-level driver.
*/
cmd->scsi_done = scsi_done;
rtn = scsi_dispatch_cmd(cmd);
......
}
return;
......
}
Inside is an infinite for loop: it reads requests from the request_queue, wraps them into lower-level commands, and issues those commands to the device controller to perform the real I/O.
Summary
Block device I/O comes in two forms: direct I/O and buffered I/O.
Whichever form is used, submit_bio is eventually called to submit the block device I/O request.
In the buffered case, "writing to the block device" means writing the data in the page cache out to disk.
Every block device is represented by a gendisk, which has a request queue; the queue is a series of request objects, and each request contains multiple bio objects pointing at the pages (in the buffered case, page cache pages).
The request queue also carries two functions:
- make_request_fn
Used to place requests into the queue; submit_bio calls generic_make_request, which then calls this function.
- request_fn, implemented in the device driver
Used to take requests off the queue and write them to the external device.
(Figure from the Geek Time course 趣谈Linux操作系统)
References:
趣谈Linux操作系统 (Geek Time), link:
http://gk.link/a/10iXZ
Everyone is welcome to discuss and learn together.