文件系统读流程分析(以 EXT2 为例)。读路径的调用链如下:
读流程(系统调用入口位于 fs/read_write.c):
/*
 * read(2) syscall entry: resolve @fd to a struct file, read @count bytes
 * into the user buffer @buf starting at the file's current position, then
 * store the advanced position back into the file.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	struct fd f = fdget_pos(fd);
	ssize_t ret = -EBADF;		/* default: bad file descriptor */

	if (f.file) {
		loff_t pos = file_pos_read(f.file);	/* snapshot current f_pos */
		ret = vfs_read(f.file, buf, count, &pos); /* perform the read */
		if (ret >= 0)
			file_pos_write(f.file, pos);	/* commit updated offset */
		fdput_pos(f);
	}
	return ret;
}
/*
 * vfs_read - VFS-level read with permission and range checks.
 * @file:  file to read from (must have been opened for reading)
 * @buf:   destination buffer in user space
 * @count: number of bytes requested
 * @pos:   in/out file position
 *
 * Returns the number of bytes read, or a negative errno.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))	/* opened without read permission */
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))	/* no read method available */
		return -EINVAL;
	if (unlikely(!access_ok(VERIFY_WRITE, buf, count)))	/* user buffer must be writable */
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);	/* mandatory locks / range clamping */
	if (ret >= 0) {
		count = ret;	/* possibly clamped byte count */
		ret = __vfs_read(file, buf, count, pos);	/* dispatch to the file's read method */
		if (ret > 0) {
			fsnotify_access(file);		/* notify watchers (inotify etc.) */
			add_rchar(current, ret);	/* per-task I/O accounting: bytes read */
		}
		inc_syscr(current);			/* per-task I/O accounting: read syscalls */
	}
	return ret;
}
/*
 * __vfs_read - dispatch a read to whichever method the file provides.
 *
 * Tries, in order: ->read (classic), ->aio_read via do_sync_read, and
 * ->read_iter via new_sync_read.
 * NOTE(review): this excerpt mixes kernel versions — upstream never had all
 * three branches in this exact form at once; verify against the tree in use.
 */
ssize_t __vfs_read(struct file *file, char __user *buf, size_t count,
		loff_t *pos)
{
	ssize_t ret;

	if (file->f_op->read)			/* classic synchronous read method */
		ret = file->f_op->read(file, buf, count, pos);
	else if (file->f_op->aio_read)		/* aio method, driven synchronously */
		ret = do_sync_read(file, buf, count, pos);
	else if (file->f_op->read_iter)		/* iov_iter-based method */
		ret = new_sync_read(file, buf, count, pos);
	else
		ret = -EINVAL;			/* no usable read method */
	return ret;
}
/*
 * File operations for regular ext2 files.  Reads and writes go through the
 * generic page-cache iterators; new_sync_read/new_sync_write adapt the
 * classic read/write entry points onto the *_iter methods.
 */
const struct file_operations ext2_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= new_sync_read,		/* classic read -> read_iter adapter */
	.write		= new_sync_write,		/* classic write -> write_iter adapter */
	.read_iter	= generic_file_read_iter,	/* page-cache read path */
	.write_iter	= generic_file_write_iter,	/* page-cache write path */
	.unlocked_ioctl	= ext2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= ext2_compat_ioctl,
#endif
	.mmap		= ext2_file_mmap,
	.open		= dquot_file_open,		/* quota-aware open */
	.release	= ext2_release_file,
	.fsync		= ext2_fsync,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
};
/*
 * new_sync_read - drive a ->read_iter method synchronously.
 *
 * Wraps the single user buffer (@buf, @len) in an iovec/iov_iter, builds a
 * kiocb for the current position, calls ->read_iter, waits if the request
 * was queued asynchronously, and writes the final position back to @ppos.
 */
ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };	/* user destination + length */
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);		/* initialize the kernel I/O control block */
	kiocb.ki_pos = *ppos;			/* starting offset */
	kiocb.ki_nbytes = len;			/* request length */
	iov_iter_init(&iter, READ, &iov, 1, len); /* iterator over the single iovec */

	ret = filp->f_op->read_iter(&kiocb, &iter);	/* the actual read */
	if (-EIOCBQUEUED == ret)
		ret = wait_on_sync_kiocb(&kiocb);	/* queued async: wait for completion */
	*ppos = kiocb.ki_pos;			/* publish the updated offset */
	return ret;
}
/**
 * generic_file_read_iter - generic filesystem read routine
 * @iocb: kernel I/O control block
 * @iter: destination for the data read
 *
 * This is the "read_iter()" routine for all filesystems
 * that can use the page cache directly.  O_DIRECT requests bypass the
 * cache via ->direct_IO; everything else (and any short direct read that
 * is not at EOF) goes through do_generic_file_read().
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	struct file *file = iocb->ki_filp;
	ssize_t retval = 0;
	loff_t *ppos = &iocb->ki_pos;
	loff_t pos = *ppos;

	if (io_is_direct(file)) {	/* O_DIRECT (or DAX): bypass the page cache */
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		size_t count = iov_iter_count(iter);
		loff_t size;

		if (!count)
			goto out; /* skip atime */
		size = i_size_read(inode);
		/* Flush dirty cached pages in range so direct I/O sees them. */
		retval = filemap_write_and_wait_range(mapping, pos,
					pos + count - 1);
		if (!retval) {
			struct iov_iter data = *iter;	/* copy: direct_IO may consume it */
			retval = mapping->a_ops->direct_IO(READ, iocb, &data, pos); /* O_DIRECT read */
		}
		if (retval > 0) {
			*ppos = pos + retval;
			iov_iter_advance(iter, retval);
		}
		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return. Otherwise fallthrough to buffered io for
		 * the rest of the read. Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
				IS_DAX(inode)) {
			file_accessed(file);
			goto out;
		}
	}
	/* Buffered path: read through the page cache. */
	retval = do_generic_file_read(file, ppos, iter, retval);
out:
	return retval;
}
/**
 * do_generic_file_read - generic file read routine
 * @filp: the file to read
 * @ppos: current file position
 * @iter: data destination
 * @written: already copied
 *
 * Read through the page cache: for each page covering the request, look it
 * up (triggering synchronous/asynchronous readahead as needed), wait until
 * it is up to date, and copy its data into @iter.  Uses the
 * mapping->a_ops->readpage() function for the actual low-level stuff.
 *
 * This is really ugly. But the goto's actually try to clarify some
 * of the logic when it comes to error handling etc.
 */
static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
		struct iov_iter *iter, ssize_t written)
{
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;	/* owning inode */
	struct file_ra_state *ra = &filp->f_ra;	/* per-file readahead state (last read position) */
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;	/* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;

	index = *ppos >> PAGE_CACHE_SHIFT;	/* first page index of this read */
	prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;	/* page index of the previous read */
	prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1); /* in-page offset of the previous read */
	last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; /* one past the last page of this read */
	offset = *ppos & ~PAGE_CACHE_MASK;	/* in-page offset within the first page */

	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		/* Readahead comes in two flavours: synchronous and asynchronous. */
		cond_resched();
find_page:
		page = find_get_page(mapping, index);	/* cache lookup; takes a page reference, released below */
		if (!page) {	/* page for @index not cached */
			page_cache_sync_readahead(mapping,	/* synchronous readahead */
					ra, filp,
					index, last_index - index);
			page = find_get_page(mapping, index);	/* look again after readahead */
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) {	/* hit the PG_readahead marker page */
			page_cache_async_readahead(mapping,	/* async readahead: pull in more pages to speed up future reads */
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) {	/* page contents not (fully) valid yet */
			/*
			 * If the fs can report partial validity, try to avoid
			 * sleeping on the page lock for the part we need.
			 */
			if (inode->i_blkbits == PAGE_CACHE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			if (!trylock_page(page))	/* can't lock without sleeping: go wait */
				goto page_not_up_to_date;
			/* Page is locked from here on. */
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)	/* NOTE(review): mapping==NULL indicates truncation/reclaim here — confirm, not swap */
				goto page_not_up_to_date_locked;
			/* Ask the fs whether the requested byte range is already valid. */
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
		/* Reaching here: the page for @index exists and is up to date. */
page_ok:
		/*
		 * i_size must be checked after we know the page is Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */

		isize = i_size_read(inode);	/* check @index against current file size */
		end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
		if (unlikely(!isize || index > end_index)) {
			page_cache_release(page);
			goto out;
		}

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index == end_index) {	/* last page of the file */
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; /* valid bytes in the last page */
			if (nr <= offset) {	/* offset already past EOF within this page */
				page_cache_release(page);
				goto out;
			}
		}
		nr = nr - offset;	/* remaining bytes to copy from this page */

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * When a sequential read accesses a page several times,
		 * only mark it as accessed the first time.
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);	/* note the access; may move the page between LRU lists */
		prev_index = index;

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 */
		ret = copy_page_to_iter(page, offset, nr, iter);	/* copy page data to the destination */
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;
		prev_offset = offset;

		page_cache_release(page);	/* drop the reference taken by find_get_page */
		written += ret;
		if (!iov_iter_count(iter))
			goto out;	/* request fully satisfied */
		if (ret < nr) {
			error = -EFAULT;	/* short copy: user buffer fault */
			goto out;
		}
		continue;

page_not_up_to_date:
		/* Get exclusive access to the page ... */
		/*
		 * We hold the page but it is not up to date (e.g. read in
		 * flight).  I/O completion sets Uptodate first, then
		 * unlock_page() wakes us here.
		 */
		error = lock_page_killable(page);
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		/* Did it get truncated before we got the lock? */
		if (!page->mapping) {
			unlock_page(page);
			page_cache_release(page);
			continue;
		}

		/* Did somebody else fill it already? */
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		/*
		 * A previous I/O error may have been due to temporary
		 * failures, eg. multipath errors.
		 * PG_error will be set again if readpage fails.
		 */
		ClearPageError(page);
		/* Start the actual read. The read will unlock the page. */
		error = mapping->a_ops->readpage(filp, page);	/* kick off the page read */
		if (unlikely(error)) {
			if (error == AOP_TRUNCATED_PAGE) {
				page_cache_release(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}

		if (!PageUptodate(page)) {
			error = lock_page_killable(page);	/* wait for I/O completion */
			if (unlikely(error))
				goto readpage_error;
			if (!PageUptodate(page)) {
				if (page->mapping == NULL) {
					/*
					 * invalidate_mapping_pages got it
					 */
					unlock_page(page);
					page_cache_release(page);
					goto find_page;
				}
				unlock_page(page);
				shrink_readahead_size_eio(filp, ra);	/* back off readahead after I/O error */
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		page_cache_release(page);
		goto out;

no_cached_page:
		/*
		 * Ok, it wasn't cached, so we need to create a new
		 * page..
		 */
		page = page_cache_alloc_cold(mapping);	/* allocate a cache-cold page */
		if (!page) {
			error = -ENOMEM;
			goto out;
		}
		error = add_to_page_cache_lru(page, mapping,	/* insert into page cache + LRU */
						index, GFP_KERNEL);
		if (error) {
			page_cache_release(page);
			if (error == -EEXIST) {
				error = 0;	/* raced with another inserter: retry lookup */
				goto find_page;
			}
			goto out;
		}
		goto readpage;	/* newly added page: go read it */
	}

out:
	/* Record the final position in the readahead state for next time. */
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_CACHE_SHIFT;
	ra->prev_pos |= prev_offset;

	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
	file_accessed(filp);	/* update atime */
	return written ? written : error;
}
/*
 * page_cache_sync_readahead - synchronous readahead on a cache miss.
 * @offset:   index of the first page wanted
 * @req_size: number of pages the caller wants
 *
 * Called when the wanted page is not cached.  For FMODE_RANDOM files it
 * forces a plain readahead of exactly the requested range without updating
 * the sequential-readahead state; otherwise it runs the on-demand
 * readahead heuristics.
 */
void page_cache_sync_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		pgoff_t offset, unsigned long req_size)
{
	/* no read-ahead */
	if (!ra->ra_pages)	/* readahead disabled for this file */
		return;

	/* be dumb */
	if (filp && (filp->f_mode & FMODE_RANDOM)) {
		/* Random-access hint: read just what was asked, no window tracking. */
		force_page_cache_readahead(mapping, filp, offset, req_size);
		return;
	}

	/* do read-ahead */
	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
}
/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 *
 * Classifies the access as start-of-file, continuation of the previous
 * readahead window, marker hit (interleaved streams), oversize, adjacent
 * sequential, context-sequential, or random — and sizes/submits the next
 * readahead window accordingly.
 */
static unsigned long
ondemand_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		bool hit_readahead_marker, pgoff_t offset,
		unsigned long req_size)
{
	unsigned long max = max_sane_readahead(ra->ra_pages);	/* cap on window size (ra->ra_pages) */
	pgoff_t prev_offset;

	/*
	 * start of file
	 */
	if (!offset)	/* reading from page 0: assume a fresh sequential stream */
		goto initial_readahead;

	/*
	 * It's the expected callback offset, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	/* Continuation: this read starts where the previous window's async
	 * part begins, or right past the previous window. */
	if ((offset == (ra->start + ra->size - ra->async_size) ||
			offset == (ra->start + ra->size))) {
		ra->start += ra->size;			/* advance the window */
		ra->size = get_next_ra_size(ra, max);	/* ramp up the window size */
		ra->async_size = ra->size;		/* whole window is async readahead */
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {	/* PG_readahead marker page was hit */
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_hole(mapping, offset + 1, max);	/* first uncached page after us */
		rcu_read_unlock();

		if (!start || start - offset > max)
			return 0;	/* everything nearby already cached */

		ra->start = start;
		ra->size = start - offset;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (offset - prev_offset) == 1
	 * unaligned reads: (offset - prev_offset) == 0
	 */
	prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; /* adjacent to the previous read: sequential */
	if (offset - prev_offset <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(mapping, ra, offset, req_size, max))	/* contiguous with history pages */
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);

initial_readahead:	/* treat as a new sequential stream */
	ra->start = offset;
	ra->size = get_init_ra_size(req_size, max);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulted next readahead window into the current one.
	 */
	if (offset == ra->start && ra->size == ra->async_size) {
		ra->async_size = get_next_ra_size(ra, max);
		ra->size += ra->async_size;
	}

	return ra_submit(ra, mapping, filp);
}
/*
 * ra_submit - submit the readahead window currently described by @ra.
 *
 * Thin wrapper: kicks off __do_page_cache_readahead() for pages
 * [ra->start, ra->start + ra->size), with the trailing ra->async_size
 * pages acting as the lookahead (PG_readahead marker) region.
 * Returns the number of pages actually submitted.
 */
static inline unsigned long ra_submit(struct file_ra_state *ra,
		struct address_space *mapping, struct file *filp)
{
	unsigned long submitted;

	submitted = __do_page_cache_readahead(mapping, filp,
					      ra->start, ra->size,
					      ra->async_size);
	return submitted;
}
/*
 * __do_page_cache_readahead - allocate and submit readahead pages.
 * @offset:         first page index to read
 * @nr_to_read:     number of pages to read
 * @lookahead_size: the page this many entries before the end gets
 *                  PG_readahead set, so a later read hitting it triggers
 *                  asynchronous readahead.
 *
 * Allocates pages for the not-yet-cached indices in the range, collects
 * them on a private list, and hands them to read_pages() for I/O.
 * Returns the number of pages submitted.
 */
int __do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read,
			unsigned long lookahead_size)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	if (isize == 0)
		goto out;	/* empty file: nothing to read */

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		if (page_offset > end_index)
			break;	/* past EOF */

		rcu_read_lock();
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		rcu_read_unlock();
		if (page && !radix_tree_exceptional_entry(page))
			continue;	/* already cached */

		page = page_cache_alloc_readahead(mapping);	/* allocate a page */
		if (!page)
			break;
		page->index = page_offset;	/* file page index this page will hold */
		list_add(&page->lru, &page_pool);	/* collect on the private list */
		if (page_idx == nr_to_read - lookahead_size)
			SetPageReadahead(page);	/* mark the async-readahead trigger page */
		ret++;
	}

	/*
	 * Now start the IO. We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}
/*
 * read_pages - submit a list of freshly-allocated pages for reading.
 *
 * Prefers the filesystem's batched ->readpages method; otherwise inserts
 * each page into the cache/LRU and reads it individually via ->readpage.
 * All I/O is issued under a block plug so requests can be merged before
 * being released to the device.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	struct blk_plug plug;
	unsigned page_idx;
	int ret;

	/* Plug the block layer: requests queue up here and are flushed on
	 * sleep, when the plug fills, or at blk_finish_plug(). */
	blk_start_plug(&plug);

	if (mapping->a_ops->readpages) {
		/* Batched path: the fs reads all pages in one go. */
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		/* Clean up the remaining pages */
		put_pages_list(pages);
		goto out;
	}

	/* Fallback: no ->readpages, read the pages one at a time. */
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);	/* take it off the private list */
		if (!add_to_page_cache_lru(page, mapping,	/* insert into page cache + LRU */
					page->index, GFP_KERNEL)) {
			mapping->a_ops->readpage(filp, page);
		}
		page_cache_release(page);	/* drop our allocation reference */
	}
	ret = 0;

out:
	/* Unplug: release any queued requests to the device. */
	blk_finish_plug(&plug);

	return ret;
}
static int
ext2_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
return mpage_readpages(mapping, pages, nr_pages, ext2_get_block);
}
/*
 * mpage_readpages - generic batched read of pages via large BIOs.
 * @pages:     list of pages to read (consumed)
 * @nr_pages:  number of pages on the list
 * @get_block: filesystem block-mapping callback
 *
 * Inserts each page into the page cache/LRU and feeds it to
 * do_mpage_readpage(), which accumulates physically contiguous blocks into
 * as few BIOs as possible; any BIO left open at the end is submitted here.
 */
int
mpage_readpages(struct address_space *mapping, struct list_head *pages,
		unsigned nr_pages, get_block_t get_block)
{
	struct bio *bio = NULL;
	unsigned page_idx;
	sector_t last_block_in_bio = 0;
	struct buffer_head map_bh;	/* scratch buffer_head carrying get_block results */
	unsigned long first_logical_block = 0;

	map_bh.b_state = 0;
	map_bh.b_size = 0;
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {	/* walk every page on the list */
		struct page *page = list_entry(pages->prev, struct page, lru);

		prefetchw(&page->flags);
		list_del(&page->lru);
		if (!add_to_page_cache_lru(page, mapping,	/* must go into cache + LRU first */
					page->index, GFP_KERNEL)) {
			bio = do_mpage_readpage(bio, page,
					nr_pages - page_idx,
					&last_block_in_bio, &map_bh,
					&first_logical_block,
					get_block);
		}
		page_cache_release(page);	/* drop the list's reference */
	}
	BUG_ON(!list_empty(pages));
	if (bio)
		mpage_bio_submit(READ, bio);	/* flush the final partially-built BIO */
	return 0;
}
EXPORT_SYMBOL(mpage_readpages);
/*
 * This is the worker routine which does all the work of mapping the disk
 * blocks and constructs largest possible bios, submits them for IO if the
 * blocks are not contiguous on the disk.
 *
 * We pass a buffer_head back and forth and use its buffer_mapped() flag to
 * represent the validity of its disk mapping and to decide when to do the next
 * get_block() call.
 */
/*
 * Overview: this function tries to read one page of file data.  In the
 * ideal case all of the page's blocks are physically contiguous on disk,
 * so a single bio request fetches everything.  Most of the work here is
 * checking that contiguity by calling the filesystem's get_block(); if
 * the blocks are NOT contiguous, it falls back to block_read_full_page()
 * which reads block-by-block through buffer heads.
 *
 * Steps:
 * 1. call get_block() to check whether all blocks in the page are contiguous
 * 2. if contiguous, add the page to a bio (submitted via mpage_bio_submit)
 * 3. if not, fall back to block_read_full_page() per-block reads
 *
 * Also merges consecutive pages into one bio when possible.
 */
static struct bio *
do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages,
		sector_t *last_block_in_bio, struct buffer_head *map_bh,
		unsigned long *first_logical_block, get_block_t get_block)
{
	struct inode *inode = page->mapping->host;
	const unsigned blkbits = inode->i_blkbits;	/* log2 of the fs block size */
	const unsigned blocks_per_page = PAGE_CACHE_SIZE >> blkbits;
	const unsigned blocksize = 1 << blkbits;
	sector_t block_in_file;
	sector_t last_block;
	sector_t last_block_in_file;
	sector_t blocks[MAX_BUF_PER_PAGE];	/* sized for the smallest (512-byte) blocks */
	unsigned page_block;
	unsigned first_hole = blocks_per_page;
	struct block_device *bdev = NULL;
	int length;
	int fully_mapped = 1;
	unsigned nblocks;
	unsigned relative_block;

	/*
	 * If the page already has buffer heads attached it has been read
	 * before and its blocks are known not to be contiguous on disk:
	 * jump to "confused" and read it one block at a time.
	 */
	if (page_has_buffers(page))
		goto confused;

	/*
	 * block_in_file:      file-relative number of the page's first block
	 * last_block:         one past the last block this call may read
	 * last_block_in_file: one past the file's last block (from i_size)
	 */
	block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
	last_block = block_in_file + nr_pages * blocks_per_page;
	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
	if (last_block > last_block_in_file)
		last_block = last_block_in_file;
	page_block = 0;

	/*
	 * Map blocks using the result from the previous get_blocks call first.
	 */
	nblocks = map_bh->b_size >> blkbits;
	/* In the plain mpage_readpage() case map_bh is a fresh temporary and
	 * this branch is not taken. */
	if (buffer_mapped(map_bh) && block_in_file > *first_logical_block &&
			block_in_file < (*first_logical_block + nblocks)) {
		unsigned map_offset = block_in_file - *first_logical_block;
		unsigned last = nblocks - map_offset;

		for (relative_block = 0; ; relative_block++) {
			if (relative_block == last) {
				clear_buffer_mapped(map_bh);	/* mapping exhausted */
				break;
			}
			if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr + map_offset +
						relative_block;
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}

	/*
	 * Then do more get_blocks calls until we are done with this page.
	 */
	map_bh->b_page = page;
	/*
	 * Key loop:
	 * 1. page_block counts blocks within this page, starting at 0
	 * 2. get_block() translates each logical block to its physical block
	 * 3. a hole or a discontiguity sends us to "confused"
	 * 4. each mapped physical block number is stashed in blocks[]
	 */
	while (page_block < blocks_per_page) {
		map_bh->b_state = 0;
		map_bh->b_size = 0;

		if (block_in_file < last_block) {
			map_bh->b_size = (last_block-block_in_file) << blkbits;	/* bytes still wanted */
			if (get_block(inode, block_in_file, map_bh, 0))
				goto confused;	/* mapping failed: fall back to per-block reads */
			*first_logical_block = block_in_file;
		}

		/* Not mapped: this block is a file hole. */
		if (!buffer_mapped(map_bh)) {
			fully_mapped = 0;
			if (first_hole == blocks_per_page)
				first_hole = page_block;	/* remember where the hole starts */
			page_block++;
			block_in_file++;
			continue;
		}

		/* some filesystems will copy data into the page during
		 * the get_block call, in which case we don't want to
		 * read it again. map_buffer_to_page copies the data
		 * we just collected from get_block into the page's buffers
		 * so readpage doesn't have to repeat the get_block call
		 */
		/* Buffer already up to date: copy it straight into the page. */
		if (buffer_uptodate(map_bh)) {
			map_buffer_to_page(page, map_bh, page_block);
			goto confused;
		}

		if (first_hole != blocks_per_page)
			goto confused;		/* hole -> non-hole */

		/* Contiguous blocks? */
		if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)	/* page_block != 0: not the first mapping */
			goto confused;
		nblocks = map_bh->b_size >> blkbits;	/* get_block set b_size to the contiguous extent length */
		for (relative_block = 0; ; relative_block++) {
			if (relative_block == nblocks) {
				clear_buffer_mapped(map_bh);
				break;
			} else if (page_block == blocks_per_page)
				break;
			blocks[page_block] = map_bh->b_blocknr+relative_block;	/* b_blocknr is the on-disk block number */
			page_block++;
			block_in_file++;
		}
		bdev = map_bh->b_bdev;
	}
	/* blocks[] now holds the on-disk block numbers for this page. */

	/*
	 * Reaching here means all mapped blocks of the page are contiguous.
	 * If the page contains a hole (e.g. past EOF on the last page),
	 * those bytes would never be read from disk, so zero them here to
	 * avoid exposing stale data; otherwise mark the page fully mapped.
	 */
	if (first_hole != blocks_per_page) {
		zero_user_segment(page, first_hole << blkbits, PAGE_CACHE_SIZE);
		if (first_hole == 0) {
			/* Entire page is a hole: it is all zeros, done. */
			SetPageUptodate(page);
			unlock_page(page);
			goto out;
		}
	} else if (fully_mapped) {
		SetPageMappedToDisk(page);
	}

	if (fully_mapped && blocks_per_page == 1 && !PageUptodate(page) &&
			cleancache_get_page(page) == 0) {
		SetPageUptodate(page);	/* served from cleancache, no disk I/O needed */
		goto confused;
	}

	/*
	 * This page will go to BIO. Do we need to send this BIO off first?
	 */
	if (bio && (*last_block_in_bio != blocks[0] - 1))
		bio = mpage_bio_submit(READ, bio);	/* not contiguous with the pending BIO: flush it */

alloc_new:
	if (bio == NULL) {
		if (first_hole == blocks_per_page) {
			/* Whole page mapped: try the device's synchronous fast path. */
			if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9),
								page))
				goto out;
		}
		/*
		 * Allocate a fresh bio.  blocks[0] << (blkbits - 9) converts
		 * the page's first physical block number into a 512-byte
		 * sector number.
		 */
		bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9),
				min_t(int, nr_pages, bio_get_nr_vecs(bdev)),
				GFP_KERNEL);
		if (bio == NULL)
			goto confused;
	}

	length = first_hole << blkbits;	/* bytes of real data in this page */
	if (bio_add_page(bio, page, length, 0) < length) {
		bio = mpage_bio_submit(READ, bio);	/* bio full: submit and retry with a new one */
		goto alloc_new;
	}

	relative_block = block_in_file - *first_logical_block;
	nblocks = map_bh->b_size >> blkbits;
	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
			(first_hole != blocks_per_page))
		bio = mpage_bio_submit(READ, bio);	/* boundary or trailing hole: flush now */
	else
		*last_block_in_bio = blocks[blocks_per_page - 1];	/* remember for merging the next page */
out:
	/* Success path: page queued (or done); return the open bio, if any. */
	return bio;

confused:
	/* The page's blocks are not physically contiguous: give up on the
	 * large-bio approach for this page. */
	if (bio)
		bio = mpage_bio_submit(READ, bio);
	/* Read the page the slow way, one buffer (block) at a time. */
	if (!PageUptodate(page))
		block_read_full_page(page, get_block);
	else
		unlock_page(page);
	goto out;
}
/*
 * mpage_bio_submit - finalize and submit a bio built by do_mpage_readpage.
 *
 * Installs the mpage completion handler, trims any read past end-of-device,
 * and hands the bio to the block layer.  Returns NULL so callers can reset
 * their bio pointer in one step.
 */
static struct bio *mpage_bio_submit(int rw, struct bio *bio)
{
	bio->bi_end_io = mpage_end_io;	/* completion callback */
	guard_bio_eod(rw, bio);		/* clip I/O that runs past the device end */
	submit_bio(rw, bio);
	return NULL;
}
/*
 * submit_bio - hand a bio to the block layer.
 * @rw:  READ/WRITE plus request flags, OR-ed into bio->bi_rw
 * @bio: the fully-built bio to submit
 *
 * Performs VM/task I/O accounting and optional block_dump logging for
 * data-carrying bios, then calls generic_make_request().
 */
void submit_bio(int rw, struct bio *bio)
{
	bio->bi_rw |= rw;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio)) {
		unsigned int count;

		if (unlikely(rw & REQ_WRITE_SAME))
			count = bdev_logical_block_size(bio->bi_bdev) >> 9;	/* WRITE_SAME carries one block of payload */
		else
			count = bio_sectors(bio);

		if (rw & WRITE) {
			count_vm_events(PGPGOUT, count);	/* pages paged out */
		} else {
			task_io_account_read(bio->bi_iter.bi_size);	/* per-task read accounting */
			count_vm_events(PGPGIN, count);		/* pages paged in */
		}

		if (unlikely(block_dump)) {
			/* /proc/sys/vm/block_dump debugging: log every I/O */
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
			current->comm, task_pid_nr(current),
				(rw & WRITE) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_iter.bi_sector,
				bdevname(bio->bi_bdev, b),
				count);
		}
	}

	/* Route the bio toward request creation / the driver queue. */
	generic_make_request(bio);
}
/*
 * generic_make_request - deliver a bio to its device's make_request_fn.
 *
 * Uses current->bio_list both as a queue of bios generated recursively by
 * ->make_request_fn (e.g. by stacked devices) and as a flag indicating a
 * submission is already active on this task, flattening recursion into
 * iteration to bound stack usage.
 */
void generic_make_request(struct bio *bio)
{
	struct bio_list bio_list_on_stack;

	if (!generic_make_request_checks(bio))
		return;	/* invalid or rejected bio */

	/*
	 * We only want one ->make_request_fn to be active at a time, else
	 * stack usage with stacked devices could be a problem. So use
	 * current->bio_list to keep a list of requests submited by a
	 * make_request_fn function. current->bio_list is also used as a
	 * flag to say if generic_make_request is currently active in this
	 * task or not. If it is NULL, then no make_request is active. If
	 * it is non-NULL, then a make_request is active, and new requests
	 * should be added at the tail
	 */
	if (current->bio_list) {
		/* Already inside a submission on this task: just queue it. */
		bio_list_add(current->bio_list, bio);
		return;
	}

	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to a pointer to the bio_list_on_stack,
	 * thus initialising the bio_list of new bios to be
	 * added. ->make_request() may indeed add some more bios
	 * through a recursive call to generic_make_request. If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top. In this case we really did just take the bio
	 * of the top of the list (no pretending) and so remove it from
	 * bio_list, and call into ->make_request() again.
	 */
	BUG_ON(bio->bi_next);
	bio_list_init(&bio_list_on_stack);
	current->bio_list = &bio_list_on_stack;
	do {
		struct request_queue *q = bdev_get_queue(bio->bi_bdev);

		q->make_request_fn(q, bio);	/* e.g. blk_queue_bio: merge the bio into a request */

		bio = bio_list_pop(current->bio_list);	/* process bios queued by the call above */
	} while (bio);
	current->bio_list = NULL; /* deactivate */
}
问题:
1:要读的页不在内存(页缓存)中时,需要等待吗?
需要等待。正常情况下,分配好 page 并通过 readpage 发起读请求后,如果检查到页的 Uptodate 标志尚未置位,进程会调用 lock_page_killable() 睡眠等待;磁盘 I/O 完成后会先设置 Uptodate 标志,再通过 unlock_page() 唤醒等待者。