文件系统读流程分析(以 EXT2 为例)。读路径的调用链如下:
读流程(系统调用入口位于 fs/read_write.c):
/*
 * read(2) syscall entry: resolve @fd to a struct file, read @count bytes
 * into the user buffer @buf starting at the file's current position, then
 * store the advanced position back into the file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;		/* default: bad file descriptor */

	if (f.file) {
		loff_t pos = file_pos_read(f.file);	/* snapshot current f_pos */
		ret = vfs_read(f.file, buf, count, &pos); /* perform the read */
		if (ret >= 0)
			file_pos_write(f.file, pos);	/* commit updated offset */
		fdput_pos(f);
	}
	return ret;
}
/*
 * vfs_read - VFS-level read with permission and range checks.
 * @file:  file to read from (must have been opened for reading)
 * @buf:   destination buffer in user space
 * @count: number of bytes requested
 * @pos:   in/out file position
 *
 * Returns the number of bytes read, or a negative errno.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))	/* opened without read permission */
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))	/* no read method available */
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))	/* user buffer must be writable */
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);	/* mandatory locks / range clamping */
	if (ret >= 0) {
		count = ret;	/* possibly clamped byte count */
		ret = __vfs_read(file, buf, count, pos);	/* dispatch to the file's read method */
		if (ret > 0) {
			fsnotify_access(file);		/* notify watchers (inotify etc.) */
			add_rchar(current, ret);	/* per-task I/O accounting: bytes read */
		}
		inc_syscr(current);			/* per-task I/O accounting: read syscalls */
	}
	return ret;
}
/*
 * __vfs_read - dispatch a read to whichever method the file provides.
 *
 * Tries, in order: ->read (classic), ->aio_read via do_sync_read, and
 * ->read_iter via new_sync_read.
 * NOTE(review): this excerpt mixes kernel versions — upstream never had all
 * three branches in this exact form at once; verify against the tree in use.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
		loff_t *pos)
{
	ssize_t ret;

	if (file->f_op->read)			/* classic synchronous read method */
		ret = file->f_op->read(file, buf, count, pos);
	else if (file->f_op->aio_read)		/* aio method, driven synchronously */
		ret = do_sync_read(file, buf, count, pos);
	else if (file->f_op->read_iter)		/* iov_iter-based method */
		ret = new_sync_read(file, buf, count, pos);
	else
		ret = -EINVAL;			/* no usable read method */
	return ret;
}
/*
 * File operations for regular ext2 files.  Reads and writes go through the
 * generic page-cache iterators; new_sync_read/new_sync_write adapt the
 * classic read/write entry points onto the *_iter methods.
 */
const struct file_operations ext2_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= new_sync_read,		/* classic read -> read_iter adapter */
	.write		= new_sync_write,		/* classic write -> write_iter adapter */
	.read_iter	= generic_file_read_iter,	/* page-cache read path */
	.write_iter	= generic_file_write_iter,	/* page-cache write path */
	.unlocked_ioctl	= ext2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext2_compat_ioctl,
#endif
	.mmap		= ext2_file_mmap,
	.open		= dquot_file_open,		/* quota-aware open */
	.release	= ext2_release_file,
	.fsync		= ext2_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};
/*
 * new_sync_read - drive a ->read_iter method synchronously.
 *
 * Wraps the single user buffer (@buf, @len) in an iovec/iov_iter, builds a
 * kiocb for the current position, calls ->read_iter, waits if the request
 * was queued asynchronously, and writes the final position back to @ppos.
 */
ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };	/* user destination + length */
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);		/* initialize the kernel I/O control block */
	kiocb.ki_pos = *ppos;			/* starting offset */
	kiocb.ki_nbytes = len;			/* request length */
	iov_iter_init(&iter, READ, &iov, 1, len); /* iterator over the single iovec */

	ret = filp->f_op->read_iter(&kiocb, &iter);	/* the actual read */
	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&kiocb);	/* queued async: wait for completion */
	*ppos = kiocb.ki_pos;			/* publish the updated offset */
	return ret;
}
/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.  O_DIRECT requests bypass the
 * cache via ->direct_IO; everything else (and any short direct read that
 * is not at EOF) goes through do_generic_file_read().
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;
	loff_t pos = *ppos;

	if (io_is_direct(file)) {	/* O_DIRECT (or DAX): bypass the page cache */
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		size_t count = iov_iter_count(iter);
		loff_t size;

		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		/* Flush dirty cached pages in range so direct I/O sees them. */
		retval = filemap_write_and_wait_range(mapping, pos,
					pos + count - 1);
		if (!retval) {
			struct iov_iter data = *iter;	/* copy: direct_IO may consume it */
			retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); /* O_DIRECT read */
		}
		if (retval > 0) {
			*ppos = pos + retval;
			iov_iter_advance(iter, retval);
		}
		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return. Otherwise fallthrough to buffered io for
		 * the rest of the read. Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
				IS_DAX(inode)) {
			file_accessed(file);
			goto out;
		}
	}
	/* Buffered path: read through the page cache. */
	retval = do_generic_file_read(file, ppos, iter, retval);
out:
	return retval;
}
/**
 * do_generic_file_read - generic file read routine
 * @filp: the file to read
 * @ppos: current file position
 * @iter: data destination
 * @written: already copied
 *
 * Read through the page cache: for each page covering the request, look it
 * up (triggering synchronous/asynchronous readahead as needed), wait until
 * it is up to date, and copy its data into @iter.  Uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
		struct iov_iter *iter, ssize_t written)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;	/* owning inode */
	struct file_ra_state *ra = &filp->f_ra;	/* per-file readahead state (last read position) */
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;	/* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;

	index = *ppos >> PAGE_CACHE_SHIFT;	/* first page index of this read */
	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;	/* page index of the previous read */
	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); /* in-page offset of the previous read */
	last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; /* one past the last page of this read */
	offset = *ppos & ~PAGE_CACHE_MASK;	/* in-page offset within the first page */

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		/* Readahead comes in two flavours: synchronous and asynchronous. */
		cond_resched();
find_page:
		page = find_get_page(mapping, index);	/* cache lookup; takes a page reference, released below */
		if (!page) {	/* page for @index not cached */
			page_cache_sync_readahead(mapping,	/* synchronous readahead */
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);	/* look again after readahead */
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {	/* hit the PG_readahead marker page */
			page_cache_async_readahead(mapping,	/* async readahead: pull in more pages to speed up future reads */
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {	/* page contents not (fully) valid yet */
			/*
			 * If the fs can report partial validity, try to avoid
			 * sleeping on the page lock for the part we need.
			 */
			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			if (!trylock_page(page))	/* can't lock without sleeping: go wait */
				goto page_not_up_to_date;
			/* Page is locked from here on. */
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)	/* NOTE(review): mapping==NULL indicates truncation/reclaim here — confirm, not swap */
				goto page_not_up_to_date_locked;
			/* Ask the fs whether the requested byte range is already valid. */
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
		/* Reaching here: the page for @index exists and is up to date. */
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);	/* check @index against current file size */
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			page_cache_release(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {	/* last page of the file */
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; /* valid bytes in the last page */
			if (nr <= offset) {	/* offset already past EOF within this page */
				page_cache_release(page);
				goto out;
			}
		}
		nr = nr - offset;	/* remaining bytes to copy from this page */

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);	/* note the access; may move the page between LRU lists */
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		ret = copy_page_to_iter(page, offset, nr, iter);	/* copy page data to the destination */
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
		prev_offset = offset;

		page_cache_release(page);	/* drop the reference taken by find_get_page */
		written += ret;
		if (!iov_iter_count(iter))
			goto out;	/* request fully satisfied */
		if (ret < nr) {
			error = -EFAULT;	/* short copy: user buffer fault */
			goto out;
		}
		continue;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		/*
		 * We hold the page but it is not up to date (e.g. read in
		 * flight).  I/O completion sets Uptodate first, then
		 * unlock_page() wakes us here.
		 */
		error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);	/* kick off the page read */
		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			error = lock_page_killable(page);	/* wait for I/O completion */
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					page_cache_release(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(filp, ra);	/* back off readahead after I/O error */
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		page_cache_release(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc_cold(mapping);	/* allocate a cache-cold page */
		if (!page) {
			error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping,	/* insert into page cache + LRU */
						index, GFP_KERNEL);
		if (error) {
			page_cache_release(page);
			if (error == -EEXIST) {
				error = 0;	/* raced with another inserter: retry lookup */
				goto find_page;
			}
			goto out;
		}
		goto readpage;	/* newly added page: go read it */
	}

out:
	/* Record the final position in the readahead state for next time. */
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_CACHE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(filp);	/* update atime */
	return written ? written : error;
}
/*
 * page_cache_sync_readahead - synchronous readahead on a cache miss.
 * @offset:   index of the first page wanted
 * @req_size: number of pages the caller wants
 *
 * Called when the wanted page is not cached.  For FMODE_RANDOM files it
 * forces a plain readahead of exactly the requested range without updating
 * the sequential-readahead state; otherwise it runs the on-demand
 * readahead heuristics.
 */
void page_cache_sync_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		pgoff_t offset, unsigned long req_size)
{
	/* no read-ahead */
	if (!ra->ra_pages)	/* readahead disabled for this file */
		return;

	/* be dumb */
	if (filp && (filp->f_mode & FMODE_RANDOM)) {
		/* Random-access hint: read just what was asked, no window tracking. */
		force_page_cache_readahead(mapping, filp, offset, req_size);
		return;
	}

	/* do read-ahead */
	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 *
 * Classifies the access as start-of-file, continuation of the previous
 * readahead window, marker hit (interleaved streams), oversize, adjacent
 * sequential, context-sequential, or random — and sizes/submits the next
 * readahead window accordingly.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		bool hit_readahead_marker, pgoff_t offset,
		unsigned long req_size)
{
	unsigned long max = max_sane_readahead(ra->ra_pages);	/* cap on window size (ra->ra_pages) */
	pgoff_t prev_offset;

	/*
	 * start of file
	 */
	if (!offset)	/* reading from page 0: assume a fresh sequential stream */
		goto initial_readahead;

	/*
	 * It's the expected callback offset, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	/* Continuation: this read starts where the previous window's async
	 * part begins, or right past the previous window. */
	if ((offset == (ra->start + ra->size - ra->async_size) ||
			offset == (ra->start + ra->size))) {
		ra->start += ra->size;			/* advance the window */
		ra->size = get_next_ra_size(ra, max);	/* ramp up the window size */
		ra->async_size = ra->size;		/* whole window is async readahead */
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {	/* PG_readahead marker page was hit */
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_hole(mapping, offset + 1, max);	/* first uncached page after us */
		rcu_read_unlock();

		if (!start || start - offset > max)
			return 0;	/* everything nearby already cached */

		ra->start = start;
		ra->size = start - offset;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (offset - prev_offset) == 1
	 * unaligned reads: (offset - prev_offset) == 0
	 */
	prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; /* adjacent to the previous read: sequential */
	if (offset - prev_offset <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(mapping, ra, offset, req_size, max))	/* contiguous with history pages */
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);

initial_readahead:	/* treat as a new sequential stream */
	ra->start = offset;
	ra->size = get_init_ra_size(req_size, max);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulted next readahead window into the current one.
	 */
	if (offset == ra->start && ra->size == ra->async_size) {
		ra->async_size = get_next_ra_size(ra, max);
		ra->size += ra->async_size;
	}

	return ra_submit(ra, mapping, filp);
}
/*
 * ra_submit - submit the readahead window currently described by @ra.
 *
 * Thin wrapper: kicks off __do_page_cache_readahead() for pages
 * [ra->start, ra->start + ra->size), with the trailing ra->async_size
 * pages acting as the lookahead (PG_readahead marker) region.
 * Returns the number of pages actually submitted.
 */
static inline unsigned long ra_submit(struct file_ra_state *ra,
		struct address_space *mapping, struct file *filp)
{
	unsigned long submitted;

	submitted = __do_page_cache_readahead(mapping, filp,
					      ra->start, ra->size,
					      ra->async_size);
	return submitted;
}
/*
 * __do_page_cache_readahead - allocate and submit readahead pages.
 * @offset:         first page index to read
 * @nr_to_read:     number of pages to read
 * @lookahead_size: the page this many entries before the end gets
 *                  PG_readahead set, so a later read hitting it triggers
 *                  asynchronous readahead.
 *
 * Allocates pages for the not-yet-cached indices in the range, collects
 * them on a private list, and hands them to read_pages() for I/O.
 * Returns the number of pages submitted.
 */
int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read,
			unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	if (isize == 0)
		goto out;	/* empty file: nothing to read */

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		if (page_offset > end_index)
			break;	/* past EOF */

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		rcu_read_unlock();
		if (page && !radix_tree_exceptional_entry(page))
			continue;	/* already cached */

		page = page_cache_alloc_readahead(mapping);	/* allocate a page */
		if (!page)
			break;
		page->index = page_offset;	/* file page index this page will hold */
		list_add(&page->lru, &page_pool);	/* collect on the private list */
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);	/* mark the async-readahead trigger page */
		ret++;
	}

	/*
	 * Now start the IO. We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}
/*
 * read_pages - submit a list of freshly-allocated pages for reading.
 *
 * Prefers the filesystem's batched ->readpages method; otherwise inserts
 * each page into the cache/LRU and reads it individually via ->readpage.
 * All I/O is issued under a block plug so requests can be merged before
 * being released to the device.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	struct blk_plug plug;
	unsigned page_idx;
	int ret;

	/* Plug the block layer: requests queue up here and are flushed on
	 * sleep, when the plug fills, or at blk_finish_plug(). */
	blk_start_plug(&plug);

	if (mapping->a_ops->readpages) {
		/* Batched path: the fs reads all pages in one go. */
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Clean up the remaining pages */
		put_pages_list(pages);
		goto out;
	}

	/* Fallback: no ->readpages, read the pages one at a time. */
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);	/* take it off the private list */
		if (!add_to_page_cache_lru(page, mapping,	/* insert into page cache + LRU */
					page->index, GFP_KERNEL)) {
			mapping->a_ops->readpage(filp, page);
		}
		page_cache_release(page);	/* drop our allocation reference */
	}
	ret = 0;

out:
	/* Unplug: release any queued requests to the device. */
	blk_finish_plug(&plug);

	return ret;
}
static int
ext2_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
}
/*
 * mpage_readpages - generic batched read of pages via large BIOs.
 * @pages:     list of pages to read (consumed)
 * @nr_pages:  number of pages on the list
 * @get_block: filesystem block-mapping callback
 *
 * Inserts each page into the page cache/LRU and feeds it to
 * do_mpage_readpage(), which accumulates physically contiguous blocks into
 * as few BIOs as possible; any BIO left open at the end is submitted here.
 */
int
mpage_readpages(struct address_space *mapping, struct list_head *pages,
		unsigned nr_pages, get_block_t get_block)
{
	struct bio *bio = NULL;
	unsigned page_idx;
	sector_t last_block_in_bio = 0;
	struct buffer_head map_bh;	/* scratch buffer_head carrying get_block results */
	unsigned long first_logical_block = 0;

	map_bh.b_state = 0;
	map_bh.b_size = 0;
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {	/* walk every page on the list */
		struct page *page = list_entry(pages->prev, struct page, lru);

		prefetchw(&page->flags);
		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping,	/* must go into cache + LRU first */
					page->index, GFP_KERNEL)) {
			bio = do_mpage_readpage(bio, page,
					nr_pages - page_idx,
					&last_block_in_bio, &map_bh,
					&first_logical_block,
					get_block);
		}
		page_cache_release(page);	/* drop the list's reference */
	}
	BUG_ON(!list_empty(pages));
	if (bio)
		mpage_bio_submit(READ, bio);	/* flush the final partially-built BIO */
	return 0;
}
EXPORT_SYMBOL(mpage_readpages);
/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructs largest possible bios, submits them for IO if the
 * blocks are not contiguous on the disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
/*
 * Overview: this function tries to read one page of file data.  In the
 * ideal case all of the page's blocks are physically contiguous on disk,
 * so a single bio request fetches everything.  Most of the work here is
 * checking that contiguity by calling the filesystem's get_block(); if
 * the blocks are NOT contiguous, it falls back to block_read_full_page()
 * which reads block-by-block through buffer heads.
 *
 * Steps:
 * 1. call get_block() to check whether all blocks in the page are contiguous
 * 2. if contiguous, add the page to a bio (submitted via mpage_bio_submit)
 * 3. if not, fall back to block_read_full_page() per-block reads
 *
 * Also merges consecutive pages into one bio when possible.
 */
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
		sector_t *last_block_in_bio, struct buffer_head *map_bh,
		unsigned long *first_logical_block, get_block_t get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;	/* log2 of the fs block size */
	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];	/* sized for the smallest (512-byte) blocks */
	unsigned page_block;
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	unsigned nblocks;
	unsigned relative_block;

	/*
	 * If the page already has buffer heads attached it has been read
	 * before and its blocks are known not to be contiguous on disk:
	 * jump to "confused" and read it one block at a time.
	 */
	if (page_has_buffers(page))
		goto confused;

	/*
	 * block_in_file:      file-relative number of the page's first block
	 * last_block:         one past the last block this call may read
	 * last_block_in_file: one past the file's last block (from i_size)
	 */
	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	last_block = block_in_file + nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_blocks call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	/* In the plain mpage_readpage() case map_bh is a fresh temporary and
	 * this branch is not taken. */
	if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
			block_in_file < (*first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - *first_logical_block;
		unsigned last = nblocks - map_offset;

		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);	/* mapping exhausted */
				break;
			}
			if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr + map_offset +
						relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_blocks calls until we are done with this page.
	 */
	map_bh->b_page = page;
	/*
	 * Key loop:
	 * 1. page_block counts blocks within this page, starting at 0
	 * 2. get_block() translates each logical block to its physical block
	 * 3. a hole or a discontiguity sends us to "confused"
	 * 4. each mapped physical block number is stashed in blocks[]
	 */
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;	/* bytes still wanted */
			if (get_block(inode, block_in_file, map_bh, 0))
				goto confused;	/* mapping failed: fall back to per-block reads */
			*first_logical_block = block_in_file;
		}

		/* Not mapped: this block is a file hole. */
		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;	/* remember where the hole starts */
			page_block++;
			block_in_file++;
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again. map_buffer_to_page copies the data
		 * we just collected from get_block into the page's buffers
		 * so readpage doesn't have to repeat the get_block call
		 */
		/* Buffer already up to date: copy it straight into the page. */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_page(page, map_bh, page_block);
			goto confused;
		}

		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)	/* page_block != 0: not the first mapping */
			goto confused;
		nblocks = map_bh->b_size >> blkbits;	/* get_block set b_size to the contiguous extent length */
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr+relative_block;	/* b_blocknr is the on-disk block number */
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}
	/* blocks[] now holds the on-disk block numbers for this page. */

	/*
	 * Reaching here means all mapped blocks of the page are contiguous.
	 * If the page contains a hole (e.g. past EOF on the last page),
	 * those bytes would never be read from disk, so zero them here to
	 * avoid exposing stale data; otherwise mark the page fully mapped.
	 */
	if (first_hole != blocks_per_page) {
		zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
		if (first_hole == 0) {
			/* Entire page is a hole: it is all zeros, done. */
			SetPageUptodate(page);
			unlock_page(page);
			goto out;
		}
	} else if (fully_mapped) {
		SetPageMappedToDisk(page);
	}

	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
			cleancache_get_page(page) == 0) {
		SetPageUptodate(page);	/* served from cleancache, no disk I/O needed */
		goto confused;
	}

	/*
	 * This page will go to BIO. Do we need to send this BIO off first?
	 */
	if (bio && (*last_block_in_bio != blocks[0] - 1))
		bio = mpage_bio_submit(READ, bio);	/* not contiguous with the pending BIO: flush it */

alloc_new:
	if (bio == NULL) {
		if (first_hole == blocks_per_page) {
			/* Whole page mapped: try the device's synchronous fast path. */
			if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
								page))
				goto out;
		}
		/*
		 * Allocate a fresh bio.  blocks[0] << (blkbits - 9) converts
		 * the page's first physical block number into a 512-byte
		 * sector number.
		 */
		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
				min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
				GFP_KERNEL);
		if (bio == NULL)
			goto confused;
	}

	length = first_hole << blkbits;	/* bytes of real data in this page */
	if (bio_add_page(bio, page, length, 0) < length) {
		bio = mpage_bio_submit(READ, bio);	/* bio full: submit and retry with a new one */
		goto alloc_new;
	}

	relative_block = block_in_file - *first_logical_block;
	nblocks = map_bh->b_size >> blkbits;
	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
			(first_hole != blocks_per_page))
		bio = mpage_bio_submit(READ, bio);	/* boundary or trailing hole: flush now */
	else
		*last_block_in_bio = blocks[blocks_per_page - 1];	/* remember for merging the next page */
out:
	/* Success path: page queued (or done); return the open bio, if any. */
	return bio;

confused:
	/* The page's blocks are not physically contiguous: give up on the
	 * large-bio approach for this page. */
	if (bio)
		bio = mpage_bio_submit(READ, bio);
	/* Read the page the slow way, one buffer (block) at a time. */
	if (!PageUptodate(page))
		block_read_full_page(page, get_block);
	else
		unlock_page(page);
	goto out;
}
/*
 * mpage_bio_submit - finalize and submit a bio built by do_mpage_readpage.
 *
 * Installs the mpage completion handler, trims any read past end-of-device,
 * and hands the bio to the block layer.  Returns NULL so callers can reset
 * their bio pointer in one step.
 */
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
	bio->bi_end_io = mpage_end_io;	/* completion callback */
	guard_bio_eod(rw, bio);		/* clip I/O that runs past the device end */
	submit_bio(rw, bio);
	return NULL;
}
/*
 * submit_bio - hand a bio to the block layer.
 * @rw:  READ/WRITE plus request flags, OR-ed into bio->bi_rw
 * @bio: the fully-built bio to submit
 *
 * Performs VM/task I/O accounting and optional block_dump logging for
 * data-carrying bios, then calls generic_make_request().
 */
void submit_bio(int rw, struct bio *bio)
{
	bio->bi_rw |= rw;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio)) {
		unsigned int count;

		if (unlikely(rw & REQ_WRITE_SAME))
			count = bdev_logical_block_size(bio->bi_bdev) >> 9;	/* WRITE_SAME carries one block of payload */
		else
			count = bio_sectors(bio);

		if (rw & WRITE) {
			count_vm_events(PGPGOUT, count);	/* pages paged out */
		} else {
			task_io_account_read(bio->bi_iter.bi_size);	/* per-task read accounting */
			count_vm_events(PGPGIN, count);		/* pages paged in */
		}

		if (unlikely(block_dump)) {
			/* /proc/sys/vm/block_dump debugging: log every I/O */
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
			current->comm, task_pid_nr(current),
				(rw & WRITE) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_iter.bi_sector,
				bdevname(bio->bi_bdev, b),
				count);
		}
	}

	/* Route the bio toward request creation / the driver queue. */
	generic_make_request(bio);
}
/*
 * generic_make_request - deliver a bio to its device's make_request_fn.
 *
 * Uses current->bio_list both as a queue of bios generated recursively by
 * ->make_request_fn (e.g. by stacked devices) and as a flag indicating a
 * submission is already active on this task, flattening recursion into
 * iteration to bound stack usage.
 */
void generic_make_request(struct bio *bio)
{
	struct bio_list bio_list_on_stack;

	if (!generic_make_request_checks(bio))
		return;	/* invalid or rejected bio */

	/*
	 * We only want one ->make_request_fn to be active at a time, else
	 * stack usage with stacked devices could be a problem. So use
	 * current->bio_list to keep a list of requests submited by a
	 * make_request_fn function. current->bio_list is also used as a
	 * flag to say if generic_make_request is currently active in this
	 * task or not. If it is NULL, then no make_request is active. If
	 * it is non-NULL, then a make_request is active, and new requests
	 * should be added at the tail
	 */
	if (current->bio_list) {
		/* Already inside a submission on this task: just queue it. */
		bio_list_add(current->bio_list, bio);
		return;
	}

	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to a pointer to the bio_list_on_stack,
	 * thus initialising the bio_list of new bios to be
	 * added. ->make_request() may indeed add some more bios
	 * through a recursive call to generic_make_request. If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top. In this case we really did just take the bio
	 * of the top of the list (no pretending) and so remove it from
	 * bio_list, and call into ->make_request() again.
	 */
	BUG_ON(bio->bi_next);
	bio_list_init(&bio_list_on_stack);
	current->bio_list = &bio_list_on_stack;
	do {
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);

		q->make_request_fn(q, bio);	/* e.g. blk_queue_bio: merge the bio into a request */

		bio = bio_list_pop(current->bio_list);	/* process bios queued by the call above */
	} while (bio);
	current->bio_list = NULL; /* deactivate */
}
问题:
1:要读的页不在内存(页缓存)中时,需要等待吗?
需要等待。正常情况下,分配好 page 并通过 readpage 发起读请求后,如果检查到页的 Uptodate 标志尚未置位,进程会调用 lock_page_killable() 睡眠等待;磁盘 I/O 完成后会先设置 Uptodate 标志,再通过 unlock_page() 唤醒等待者。