Kernel源码笔记之VFS:6.读文件

Kernel源码笔记目录

读文件

源码基于stable-5.10.102

/*
 * read(2) system call entry point.
 *
 * fd:    file descriptor to read from
 * buf:   user-space destination buffer
 * count: number of bytes requested
 *
 * The read position is not a parameter: ksys_read() obtains it from the
 * open file (via file_ppos()) and stores the new position back in f_pos.
 */
SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
{
	/* All of the work is delegated to ksys_read(). */
	return ksys_read(fd, buf, count);
}

/*
 * Kernel-side implementation of read(2).
 *
 * Looks up the open file for @fd, reads up to @count bytes into the user
 * buffer @buf through vfs_read(), and publishes the new file position.
 *
 * Returns whatever vfs_read() returns, or -EBADF when fdget_pos() finds no
 * file for @fd.
 */
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);
	loff_t pos;
	loff_t *ppos;
	ssize_t nread;

	/* No open file behind this descriptor. */
	if (!f.file)
		return -EBADF;

	/*
	 * file_ppos() may return NULL, in which case vfs_read() is called
	 * without a position.
	 * NOTE(review): the original author's note says NULL is returned for
	 * FMODE_STREAM files -- file_ppos() is not shown here, confirm.
	 */
	ppos = file_ppos(f.file);
	if (ppos) {
		/* Work on a private copy so f_pos is only written on success. */
		pos = *ppos;
		ppos = &pos;
	}

	nread = vfs_read(f.file, buf, count, ppos);

	/* The read did not fail: store the advanced position in the file. */
	if (nread >= 0 && ppos)
		f.file->f_pos = pos;

	/* Release whatever fdget_pos() acquired. */
	fdput_pos(f);
	return nread;
}

/*
 * Filesystem-independent read.
 *
 * Validates the request, clamps @count to MAX_RW_COUNT, then dispatches to
 * the file's ->read method or, failing that, to ->read_iter through
 * new_sync_read().
 *
 * @file:  open file to read from
 * @buf:   user-space destination
 * @count: bytes requested
 * @pos:   position to read at (may be NULL; passed through unchanged)
 *
 * Returns the byte count produced by the filesystem method, or a negative
 * errno: -EBADF (not opened for reading), -EINVAL (file cannot be read or
 * provides no read method), -EFAULT (bad user buffer), or whatever
 * rw_verify_area() rejects the request with.
 */
ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t nread;

	/* The file must have been opened for reading ... */
	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	/* ... and must be flagged as readable at all. */
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;

	/* The user buffer range must be acceptable to access_ok(). */
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	/* Range sanity, mandatory locks and the security hook. */
	nread = rw_verify_area(READ, file, pos, count);
	if (nread)
		return nread;

	/*
	 * Never transfer more than MAX_RW_COUNT bytes in one call
	 * (the author's note: #define MAX_RW_COUNT (INT_MAX & PAGE_MASK),
	 * i.e. INT_MAX rounded down to a page boundary).
	 */
	if (count > MAX_RW_COUNT)
		count = MAX_RW_COUNT;

	/* Prefer ->read; otherwise wrap ->read_iter in a one-segment iovec. */
	if (file->f_op->read)
		nread = file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		nread = new_sync_read(file, buf, count, pos);
	else
		nread = -EINVAL;

	if (nread > 0) {
		/* Notify fsnotify of the access and account the bytes read. */
		fsnotify_access(file);
		add_rchar(current, nread);
	}

	/* The read-syscall counter is bumped even when nothing was read. */
	inc_syscr(current);
	return nread;
}

/*
 * Check that a read or write of @count bytes at *@ppos is permissible.
 *
 * @read_write: READ or WRITE
 * @file:       file being accessed
 * @ppos:       position of the access, or NULL to skip the range checks
 * @count:      number of bytes
 *
 * Returns -EINVAL or -EOVERFLOW for an unacceptable range, a negative
 * value from locks_mandatory_area(), or otherwise the result of
 * security_file_permission().
 */
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
	struct inode *inode = file_inode(file);

	/* A count that is negative when viewed as ssize_t is never valid. */
	if (unlikely((ssize_t) count < 0))
		return -EINVAL;

	if (ppos) {
		loff_t pos = *ppos;
		int negative = pos < 0;

		/*
		 * Either the start is negative or pos + count wraps negative.
		 * Both are only tolerated when unsigned_offsets(file) is true.
		 * NOTE(review): unsigned_offsets() is not shown here; it
		 * presumably tests whether the file treats offsets as unsigned
		 * (FMODE_UNSIGNED_OFFSET) -- confirm.
		 */
		if (unlikely(negative || (loff_t) (pos + count) < 0)) {
			if (!unsigned_offsets(file))
				return -EINVAL;
			if (negative && count >= -pos) /* both values are in 0..LLONG_MAX */
				return -EOVERFLOW;
		}

		/*
		 * The inode has a lock context and mandatory locking applies:
		 * ask locks_mandatory_area() whether [pos, pos + count - 1]
		 * may be accessed with the matching lock type.
		 */
		if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
			int lock_type = (read_write == READ) ? F_RDLCK : F_WRLCK;
			int err = locks_mandatory_area(inode, file, pos,
						       pos + count - 1, lock_type);

			if (err < 0)
				return err;
		}
	}

	/* Finally let the security module's file_permission hook decide. */
	return security_file_permission(file,
				read_write == READ ? MAY_READ : MAY_WRITE);
}

/*
 * Synchronous read through a file's ->read_iter method.
 *
 * Wraps the user buffer in a single-segment iovec, builds a kiocb and an
 * iov_iter around it, and calls the filesystem's read_iter.
 *
 * @filp: file to read from
 * @buf:  user-space destination
 * @len:  bytes requested
 * @ppos: in/out position; when NULL the read starts at 0 and no position
 *        is written back
 *
 * Returns the value produced by call_read_iter().
 */
static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct iov_iter iter;
	struct kiocb kiocb;
	ssize_t nread;

	/* Set up a synchronous kiocb positioned at *ppos (or 0). */
	init_sync_kiocb(&kiocb, filp);
	if (ppos)
		kiocb.ki_pos = *ppos;
	else
		kiocb.ki_pos = 0;

	/* One segment covering the whole user buffer. */
	iov_iter_init(&iter, READ, &iov, 1, len);

	nread = call_read_iter(filp, &kiocb, &iter);
	/* A synchronous kiocb must never come back as queued. */
	BUG_ON(nread == -EIOCBQUEUED);

	/* Hand the advanced position back to the caller. */
	if (ppos)
		*ppos = kiocb.ki_pos;
	return nread;
}

/*
 * Initialise an iov_iter over an array of iovec segments.
 *
 * @i:         iterator to set up
 * @direction: READ or WRITE; any other bits trigger a warning and are
 *             masked off
 * @iov:       segment array
 * @nr_segs:   number of segments in @iov (new_sync_read() passes 1)
 * @count:     total number of bytes to transfer
 */
void iov_iter_init(struct iov_iter *i, unsigned int direction,
			const struct iovec *iov, unsigned long nr_segs,
			size_t count)
{
	const unsigned int dir_mask = READ | WRITE;

	WARN_ON(direction & ~dir_mask);
	direction &= dir_mask;

	/* Fields that do not depend on the iterator flavour. */
	i->count = count;
	i->iov_offset = 0;
	i->nr_segs = nr_segs;

	/*
	 * When uaccess_kernel() is true the segments are treated as kernel
	 * vectors (ITER_KVEC); otherwise as ordinary iovecs (ITER_IOVEC).
	 */
	if (!uaccess_kernel()) {
		i->type = ITER_IOVEC | direction;
		i->iov = iov;
	} else {
		i->type = ITER_KVEC | direction;
		i->kvec = (struct kvec *)iov;
	}
}

现在大多数文件系统在 file_operations 中提供的 read_iter 函数指针,一般都是 generic_file_read_iter(或最终会调用到它)。

/*
 * Generic ->read_iter implementation.
 *
 * @iocb: kiocb carrying the file, position (ki_pos) and IOCB_* flags
 * @iter: destination iterator; its count is the number of bytes wanted
 *
 * With IOCB_DIRECT set the read is first attempted through the mapping's
 * ->direct_IO method; whatever that leaves unread (and every read without
 * IOCB_DIRECT) goes through generic_file_buffered_read().
 *
 * Returns the number of bytes read or a negative errno.
 */
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t remaining = iov_iter_count(iter);
	ssize_t done = 0;

	/* Zero-length read: nothing to do (the access time is not touched). */
	if (!remaining)
		return done;

	if (iocb->ki_flags & IOCB_DIRECT) {
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		loff_t size;

		size = i_size_read(inode);
		if (iocb->ki_flags & IOCB_NOWAIT) {
			/* Non-blocking: bail out if the range has cached pages. */
			if (filemap_range_has_page(mapping, iocb->ki_pos,
						   iocb->ki_pos + remaining - 1))
				return -EAGAIN;
		} else {
			/*
			 * Blocking: run filemap_write_and_wait_range() on the
			 * range first (presumably so dirty cached data reaches
			 * disk before it is read directly -- confirm).
			 */
			done = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
						iocb->ki_pos + remaining - 1);
			if (done < 0)
				return done;
		}

		file_accessed(file);

		done = mapping->a_ops->direct_IO(iocb, iter);
		if (done >= 0) {
			iocb->ki_pos += done;
			remaining -= done;
		}
		/*
		 * Rewind the iterator by the difference between what should
		 * still be outstanding and what the iterator reports.
		 */
		iov_iter_revert(iter, remaining - iov_iter_count(iter));

		/*
		 * Done on error, when everything was read, when the position
		 * reached the file size sampled above, or for DAX inodes.
		 * Otherwise fall through and read the rest via the page cache.
		 */
		if (done < 0 || !remaining || iocb->ki_pos >= size ||
		    IS_DAX(inode))
			return done;
	}

	/* The common path: read through the page cache. */
	return generic_file_buffered_read(iocb, iter, done);
}

/*
 * Buffered (page-cache) read: copy data starting at iocb->ki_pos into
 * @iter, one page-cache page per loop iteration.
 *
 * @iocb:    supplies the file (ki_filp), the position (ki_pos, updated on
 *           exit) and the IOCB_* flags tested below.
 * @iter:    destination; iter->count is the number of bytes still wanted.
 * @written: bytes the caller has already transferred; copied bytes are
 *           added to it.
 *
 * Returns the accumulated byte count when it is non-zero, otherwise
 * `error` (0, or a negative errno such as -EINTR, -EAGAIN, -EFAULT, -EIO
 * or -ENOMEM set on the paths below).
 */
ssize_t generic_file_buffered_read(struct kiocb *iocb,
		struct iov_iter *iter, ssize_t written)
{
	/* The open file being read */
	struct file *filp = iocb->ki_filp;

	/* The file's address_space (its page cache and a_ops) */
	struct address_space *mapping = filp->f_mapping;

	/* The inode that owns the mapping */
	struct inode *inode = mapping->host;

	/* Per-file readahead state */
	struct file_ra_state *ra = &filp->f_ra;

	/* Read position; points into the iocb, so the store at 'out' updates it */
	loff_t *ppos = &iocb->ki_pos;
	pgoff_t index;
	pgoff_t last_index;
	pgoff_t prev_index;
	unsigned long offset;      /* offset into pagecache page */
	unsigned int prev_offset;
	int error = 0;

	/* Nothing to read when the start is at or beyond the superblock's s_maxbytes */
	if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
		return 0;

	/* Presumably caps the iterator's count at s_maxbytes -- helper not shown */
	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);

	/* Page index of the read position */
	index = *ppos >> PAGE_SHIFT;
	/* Page index of the position recorded in the readahead state */
	prev_index = ra->prev_pos >> PAGE_SHIFT;
	/* In-page offset of that recorded position */
	prev_offset = ra->prev_pos & (PAGE_SIZE-1);
	/* Index just past the last page covered by [*ppos, *ppos + iter->count) */
	last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;

	/* Offset of the read position inside its page */
	offset = *ppos & ~PAGE_MASK;

	/*
	 * Upstream comment, paraphrased: once some data has already been
	 * copied it is no longer safe to return -EIOCBQUEUED, so an
	 * IOCB_WAITQ (async) read is additionally marked IOCB_NOWAIT.
	 */
	if (written && (iocb->ki_flags & IOCB_WAITQ))
		iocb->ki_flags |= IOCB_NOWAIT;

	/* Main loop: one page per iteration */
	for (;;) {
		struct page *page;
		pgoff_t end_index;
		loff_t isize;
		unsigned long nr, ret;

		/* Scheduling point between pages (reads may be long-running) */
		cond_resched();
find_page:
		/* A fatal signal is pending: stop with -EINTR */
		if (fatal_signal_pending(current)) {
			error = -EINTR;
			goto out;
		}

		/* Look the page up in the mapping's page cache */
		page = find_get_page(mapping, index);
		if (!page) { // target page is not cached yet

			// Caller asked for no I/O at all
			if (iocb->ki_flags & IOCB_NOIO)
				goto would_block;
			// Start synchronous readahead for the pages still wanted
			page_cache_sync_readahead(mapping,
					ra, filp,
					index, last_index - index);
			// Look again now that readahead has been issued
			page = find_get_page(mapping, index);

			// Still absent: no_cached_page allocates a page and reads it
			if (unlikely(page == NULL))
				goto no_cached_page;
		}
		if (PageReadahead(page)) { // page carries the readahead marker
			if (iocb->ki_flags & IOCB_NOIO) {
				put_page(page);
				goto out;
			}
			page_cache_async_readahead(mapping,
					ra, filp, page,
					index, last_index - index);
		}
		if (!PageUptodate(page)) { /* cached page content is not (yet) valid */
			/*
			 * Wait for the page lock to be released, in the flavour
			 * the iocb flags ask for: async wait, fail with -EAGAIN,
			 * or a killable sleep.
			 */

			if (iocb->ki_flags & IOCB_WAITQ) {
				if (written) {
					put_page(page);
					goto out;
				}
				error = wait_on_page_locked_async(page,
								iocb->ki_waitq);
			} else {
				if (iocb->ki_flags & IOCB_NOWAIT) {
					put_page(page);
					goto would_block;
				}
				error = wait_on_page_locked_killable(page);
			}
			if (unlikely(error))
				goto readpage_error;
			if (PageUptodate(page))
				goto page_ok;

			/*
			 * A partially-uptodate check is only attempted when the
			 * block size is smaller than a page and the filesystem
			 * provides ->is_partially_uptodate.
			 */
			if (inode->i_blkbits == PAGE_SHIFT ||
					!mapping->a_ops->is_partially_uptodate)
				goto page_not_up_to_date;
			/* pipes can't handle partially uptodate pages */
			if (unlikely(iov_iter_is_pipe(iter)))
				goto page_not_up_to_date;
			if (!trylock_page(page))
				goto page_not_up_to_date;
			/* Did it get truncated before we got the lock? */
			if (!page->mapping)
				goto page_not_up_to_date_locked;
			if (!mapping->a_ops->is_partially_uptodate(page,
							offset, iter->count))
				goto page_not_up_to_date_locked;
			unlock_page(page);
		}
page_ok:
		/*
		 * Sample the file size only now, after the page is known to be
		 * up to date.
		 */
		isize = i_size_read(inode);

		/* Index of the page holding the last byte of the file */
		end_index = (isize - 1) >> PAGE_SHIFT;

		/* Empty file, or the wanted page lies beyond EOF: finished */
		if (unlikely(!isize || index > end_index)) {
			put_page(page);
			goto out;
		}

		/* At most one page is copied per iteration */
		nr = PAGE_SIZE;

		/* On the last page only part of the page holds file data */
		if (index == end_index) {

			/*
			 * Valid bytes in the last page: in-page offset of the
			 * last byte, plus one. This equals isize % PAGE_SIZE
			 * except when isize is an exact multiple of PAGE_SIZE,
			 * where it yields PAGE_SIZE rather than 0.
			 */
			nr = ((isize - 1) & ~PAGE_MASK) + 1;

			/* The read offset is at or past EOF: nothing left to copy */
			if (nr <= offset) {
				put_page(page);
				goto out;
			}
		}

		/* Bytes available in this page from `offset` onwards */
		nr = nr - offset;

		/*
		 * If the mapping is writably mapped by user space, flush the
		 * data cache for the page before reading it from the kernel
		 * side (presumably to deal with cache aliasing -- confirm).
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * Mark the page accessed, unless this read continues exactly
		 * where the previous one stopped (same page, same offset).
		 */
		if (prev_index != index || offset != prev_offset)
			mark_page_accessed(page);
		prev_index = index;

		/* Copy up to nr bytes from the page into the iterator */
		ret = copy_page_to_iter(page, offset, nr, iter);

		/* Advance by the number of bytes actually copied */
		offset += ret;

		/*
		 * If the offset reached the end of the page, move on to the
		 * next page index (index += offset / PAGE_SIZE) ...
		 */
		index += offset >> PAGE_SHIFT;

		/* ... and reduce the offset to its in-page part (offset %= PAGE_SIZE) */
		offset &= ~PAGE_MASK;
		prev_offset = offset;

		/* Drop the reference taken by find_get_page() */
		put_page(page);

		/*
		 * Account the copied bytes; the counter is called `written`
		 * because a read writes into the caller's buffer.
		 */
		written += ret;

		/* The iterator is full (its count is consumed by copy_page_to_iter) */
		if (!iov_iter_count(iter))
			goto out;
		/* Short copy although the iterator still had room: report -EFAULT */
		if (ret < nr) {
			error = -EFAULT;
			goto out;
		}
		continue;

page_not_up_to_date:
		// Page is not up to date: take the page lock (async or killable)
		if (iocb->ki_flags & IOCB_WAITQ) {
			if (written) {
				put_page(page);
				goto out;
			}
			error = lock_page_async(page, iocb->ki_waitq);
		} else {
			error = lock_page_killable(page);
		}
		if (unlikely(error))
			goto readpage_error;

page_not_up_to_date_locked:
		// Page no longer belongs to a mapping: drop it and look it up again
		if (!page->mapping) {
			unlock_page(page);
			put_page(page);
			continue;
		}

		// Someone else brought it up to date meanwhile: proceed to the copy
		if (PageUptodate(page)) {
			unlock_page(page);
			goto page_ok;
		}

readpage:
		// Reading from the backing store is needed, but the caller
		// forbids I/O or waiting: give up with -EAGAIN
		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
			unlock_page(page);
			put_page(page);
			goto would_block;
		}
		// Clear any stale error flag on the page
		ClearPageError(page);
		// Ask the filesystem to read the page
		error = mapping->a_ops->readpage(filp, page);

		// ->readpage reported a non-zero result
		if (unlikely(error)) {

			// AOP_TRUNCATED_PAGE is not treated as a failure: drop the
			// reference and restart the lookup
			// NOTE(review): exact meaning of this code is defined
			// outside this excerpt -- confirm
			if (error == AOP_TRUNCATED_PAGE) {
				put_page(page);
				error = 0;
				goto find_page;
			}
			goto readpage_error;
		}

		// Read was started but the page is not up to date yet: wait for it
		if (!PageUptodate(page)) {

			// Acquire the page lock (async or killable)
			if (iocb->ki_flags & IOCB_WAITQ) {
				if (written) {
					put_page(page);
					goto out;
				}
				error = lock_page_async(page, iocb->ki_waitq);
			} else {
				error = lock_page_killable(page);
			}
			// Locking failed
			if (unlikely(error))
				goto readpage_error;
			
			// Re-check under the lock; acquiring it may have slept
			if (!PageUptodate(page)) {
				
				// Page was detached from its mapping meanwhile:
				// drop it and restart the lookup
				if (page->mapping == NULL) {
					unlock_page(page);
					put_page(page);
					goto find_page;
				}

				unlock_page(page);
				
				// Still not up to date: shrink the readahead state
				// (author's note: to a quarter) and fail with -EIO
				shrink_readahead_size_eio(ra);
				error = -EIO;
				goto readpage_error;
			}
			unlock_page(page);
		}

		// Page was read successfully: go copy it out
		goto page_ok;

readpage_error:
		/* UHHUH! A synchronous read error occurred. Report it */
		put_page(page);
		goto out;

no_cached_page:
		
		// Allocate a fresh page for the page cache
		page = page_cache_alloc(mapping);
		if (!page) {
			error = -ENOMEM;
			goto out;
		}

		// Insert the new page into the page cache and the LRU
		error = add_to_page_cache_lru(page, mapping, index,
				mapping_gfp_constraint(mapping, GFP_KERNEL));

		// Insertion failed: release the page; -EEXIST means another
		// page appeared at this index, so simply look it up again
		if (error) {
			put_page(page);
			if (error == -EEXIST) {
				error = 0;
				goto find_page;
			}
			goto out;
		}

		// The new page is in the cache (add_to_page_cache_lru sets its
		// locked flag); jump back to have the filesystem fill it
		goto readpage;
	}

would_block:
	error = -EAGAIN;
out:
	// Save the last read position (page index and offset) in the readahead state
	ra->prev_pos = prev_index;
	ra->prev_pos <<= PAGE_SHIFT;
	ra->prev_pos |= prev_offset;

	// Store the new file position; ppos points at iocb->ki_pos
	*ppos = ((loff_t)index << PAGE_SHIFT) + offset;
	// Record the file access (atime handling)
	file_accessed(filp);
	// Bytes transferred if any, otherwise the error code
	return written ? written : error;
}
// Author's note: find_get_page() simply calls
// pagecache_get_page(mapping, offset, 0, 0).
//
// Look up the page at @index in @mapping's page cache; depending on
// @fgp_flags also lock it, mark it accessed, or create it when absent.
// @gfp_mask is used only for allocation on the FGP_CREAT path.
// Returns the page, or NULL when it is absent and not created, when a
// FGP_NOWAIT trylock fails, or when allocation/insertion fails.
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
		int fgp_flags, gfp_t gfp_mask)
{
	struct page *page;

repeat:
	// Look up the entry stored at index (author's note: this goes through
	// the XArray interface)
	page = find_get_entry(mapping, index);
	// xa_is_value() distinguishes value entries from page pointers;
	// a value entry counts as "no page" here
	if (xa_is_value(page))
		page = NULL;

	if (!page)
		goto no_page;

	// From here on a page was found

	// Lock the page if requested
	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			// Non-blocking: give up when the lock is contended
			if (!trylock_page(page)) {
				put_page(page);
				return NULL;
			}
		} else {
			lock_page(page);
		}

		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(!thp_contains(page, index), page);
	}

	// Honour the accessed / write flags
	if (fgp_flags & FGP_ACCESSED)
		mark_page_accessed(page);
	else if (fgp_flags & FGP_WRITE) {
		/* Clear idle flag for buffer write */
		if (page_is_idle(page))
			clear_page_idle(page);
	}
	// Unless FGP_HEAD was requested, return the subpage for index
	// NOTE(review): presumably only differs from `page` for compound
	// pages -- find_subpage() is not shown here
	if (!(fgp_flags & FGP_HEAD))
		page = find_subpage(page, index);

no_page:
	// No page found and the caller asked for one to be created
	if (!page && (fgp_flags & FGP_CREAT)) {
		int err;
		if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
			gfp_mask |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp_mask &= ~__GFP_FS;

		// Allocate one page
		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;

		// Creating without FGP_LOCK or FGP_FOR_MMAP is unexpected: warn
		// once and behave as if FGP_LOCK had been passed
		if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
			fgp_flags |= FGP_LOCK;

		/* Init accessed so avoid atomic mark_page_accessed later */
		if (fgp_flags & FGP_ACCESSED)
			__SetPageReferenced(page);

		// Insert into the page cache and the LRU
		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
		if (unlikely(err)) {
			put_page(page);
			page = NULL;
			// Lost a race with another insertion: look the page up again
			if (err == -EEXIST)
				goto repeat;
		}

		/*
		 * add_to_page_cache_lru locks the page, and for mmap we expect
		 * an unlocked page.
		 */
		if (page && (fgp_flags & FGP_FOR_MMAP))
			unlock_page(page);
	}

	return page;
}

/*
 * Insert @page into @mapping's page cache at @offset and queue it for the
 * LRU lists.
 *
 * The page's locked flag is set before insertion; it stays set on success
 * and is cleared again on failure.
 *
 * Returns 0 on success or the error from __add_to_page_cache_locked().
 */
int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
				pgoff_t offset, gfp_t gfp_mask)
{
	void *shadow = NULL;
	int err;

	/* The page must be locked while it is being inserted. */
	__SetPageLocked(page);

	err = __add_to_page_cache_locked(page, mapping, offset,
					 gfp_mask, &shadow);
	if (unlikely(err)) {
		/* Insertion failed: undo the lock flag set above. */
		__ClearPageLocked(page);
		return err;
	}

	WARN_ON_ONCE(PageActive(page));

	/*
	 * `shadow` is set when the slot previously held a value entry.
	 * For non-write allocations it is handed to workingset_refault()
	 * (presumably refault accounting -- helper not shown here).
	 */
	if (shadow && !(gfp_mask & __GFP_WRITE))
		workingset_refault(page, shadow);

	/* Queue the page for addition to an LRU list. */
	lru_cache_add(page);
	return 0;
}

/*
 * Store an already-locked @page in @mapping's i_pages XArray at @offset.
 *
 * @gfp:     allocation flags for the memcg charge and XArray node allocation
 * @shadowp: if non-NULL, receives the value entry that previously occupied
 *           the slot (when there was one)
 *
 * Returns 0 on success. On failure the page's mapping is reset to NULL, the
 * reference taken here is dropped and a negative errno is returned
 * (-EEXIST when a real page already occupies the slot).
 */
noinline int __add_to_page_cache_locked(struct page *page,
					struct address_space *mapping,
					pgoff_t offset, gfp_t gfp,
					void **shadowp)
{
	// Declare an XArray operation state on mapping->i_pages at `offset`
	XA_STATE(xas, &mapping->i_pages, offset);

	// PageHuge() result; when set, memcg charging and the NR_FILE_PAGES
	// statistic below are skipped
	int huge = PageHuge(page);
	int error;
	bool charged = false;

	VM_BUG_ON_PAGE(!PageLocked(page), page);
	VM_BUG_ON_PAGE(PageSwapBacked(page), page);
	mapping_set_update(&xas, mapping);

	// Take a reference for the page cache and record mapping and index
	get_page(page);
	page->mapping = mapping;
	page->index = offset;

	if (!huge) {
		// Charge the page to the memory cgroup of current->mm
		error = mem_cgroup_charge(page, current->mm, gfp);
		if (error)
			goto error;
		charged = true;
	}

	gfp &= GFP_RECLAIM_MASK;

	// Insert the page into the address_space's XArray.
	// The loop repeats for as long as xas_nomem() returns true
	// (presumably after allocating memory the store needed -- confirm).
	do {
		unsigned int order = xa_get_order(xas.xa, xas.xa_index);
		void *entry, *old = NULL;

		// Existing entry has a larger order than the page: prepare to
		// split it (the allocation is done before taking the lock)
		if (order > thp_order(page))
			xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
					order, gfp);
		xas_lock_irq(&xas);

		// Walk the entries that conflict with the slot; a conflicting
		// entry that is not a value entry means a page is already
		// there -> -EEXIST
		xas_for_each_conflict(&xas, entry) {
			old = entry;
			if (!xa_is_value(entry)) {
				xas_set_err(&xas, -EEXIST);
				goto unlock;
			}
		}

		if (old) {
			// Report the replaced value entry to the caller
			if (shadowp)
				*shadowp = old;
			/* entry may have been split before we acquired lock */
			order = xa_get_order(xas.xa, xas.xa_index);
			if (order > thp_order(page)) {
				xas_split(&xas, old, order);
				xas_reset(&xas);
			}
		}

		// Store the page in the slot
		xas_store(&xas, page);
		if (xas_error(&xas))
			goto unlock;

		// A value entry was replaced: one exceptional entry fewer
		if (old)
			mapping->nrexceptional--;
		// One more page in this mapping
		mapping->nrpages++;

		if (!huge)
			__inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
		xas_unlock_irq(&xas);
	} while (xas_nomem(&xas, gfp));

	if (xas_error(&xas)) {
		error = xas_error(&xas);
		// Undo the memcg charge taken above
		if (charged)
			mem_cgroup_uncharge(page);
		goto error;
	}

	trace_mm_filemap_add_to_page_cache(page);
	return 0;
error:
	page->mapping = NULL;
	/* Leave page->index set: truncation relies upon it */
	put_page(page);
	return error;
}

/*
 * Queue @page on this CPU's lru_add pagevec; the pagevec is flushed to the
 * real LRU lists when it becomes full or when the page is a compound page.
 */
void lru_cache_add(struct page *page)
{
	// Author's note: a pagevec can hold up to 15 pages
	// (capacity is defined outside this excerpt -- confirm)
	struct pagevec *pvec;

	VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
	VM_BUG_ON_PAGE(PageLRU(page), page);

	// Reference held on behalf of the pagevec
	get_page(page);

	/*
	lru_pvecs holds the per-CPU page vectors (field notes are the
	original author's annotations):
	struct lru_pvecs {
		local_lock_t lock;
		struct pagevec lru_add; // pages waiting to be added to an LRU list
		struct pagevec lru_deactivate_file; // file pages to deactivate
		struct pagevec lru_deactivate; // pages to deactivate
		struct pagevec lru_lazyfree; // lazily freed pages
	#ifdef CONFIG_SMP
		struct pagevec activate_page; // pages to activate
	#endif
	};
	*/
	local_lock(&lru_pvecs.lock);
	pvec = this_cpu_ptr(&lru_pvecs.lru_add);

	// pagevec_add() appends the page and returns the remaining capacity
	if (!pagevec_add(pvec, page) || PageCompound(page))
		// No room left (or a compound page): flush the pagevec now
		__pagevec_lru_add(pvec);
	local_unlock(&lru_pvecs.lock);
}

/*
 * Flush @pvec: run __pagevec_lru_add_fn on every page it holds via
 * pagevec_lru_move_fn(), with no extra argument.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
	pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}

/*
 * Apply @move_fn to every page in @pvec while holding the lru_lock of the
 * page's node, then release the pages and reset the pagevec.
 *
 * @pvec:    pages to process
 * @move_fn: callback invoked as move_fn(page, lruvec, arg) under lru_lock
 * @arg:     opaque argument passed through to @move_fn
 */
static void pagevec_lru_move_fn(struct pagevec *pvec,
	void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
	void *arg)
{
	int i;
	struct pglist_data *pgdat = NULL;
	struct lruvec *lruvec;
	unsigned long flags = 0;

	// Walk every page in the pagevec
	for (i = 0; i < pagevec_count(pvec); i++) {
		struct page *page = pvec->pages[i];

		// Node data (pglist_data) the page belongs to
		struct pglist_data *pagepgdat = page_pgdat(page);

		// Switch locks only when the node changes, so consecutive pages
		// of the same node are handled under a single lock acquisition
		if (pagepgdat != pgdat) {
			if (pgdat)
				spin_unlock_irqrestore(&pgdat->lru_lock, flags);
			pgdat = pagepgdat;
			spin_lock_irqsave(&pgdat->lru_lock, flags);
		}

		lruvec = mem_cgroup_page_lruvec(page, pgdat);

		// Let the callback move the page
		(*move_fn)(page, lruvec, arg);
	}
	// Drop the lock of the last node touched, if any
	if (pgdat)
		spin_unlock_irqrestore(&pgdat->lru_lock, flags);
	// Release the pagevec's pages and mark the pagevec empty
	release_pages(pvec->pages, pvec->nr);
	pagevec_reinit(pvec);
}

/*
 * pagevec_lru_move_fn() callback: put @page on the appropriate list of
 * @lruvec. @arg is unused.
 */
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
				 void *arg)
{
	/*
	For reference, the LRU list identifiers:
	enum lru_list {
		LRU_INACTIVE_ANON = LRU_BASE,
		LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
		LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
		LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
		LRU_UNEVICTABLE,
		NR_LRU_LISTS
	};

	*/
	enum lru_list lru;
	/* Remember, and clear, a previously set Unevictable flag */
	int was_unevictable = TestClearPageUnevictable(page);
	int nr_pages = thp_nr_pages(page);

	VM_BUG_ON_PAGE(PageLRU(page), page);

	// Set the page's LRU flag; the barrier orders it before the
	// evictability check below
	// NOTE(review): the exact race this guards against is documented
	// upstream, not in this excerpt
	SetPageLRU(page);
	smp_mb__after_atomic();

	// Choose the target list depending on whether the page is evictable
	if (page_evictable(page)) {
		lru = page_lru(page);
		// Was unevictable before, is evictable now: count it as rescued
		if (was_unevictable)
			__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
	} else {
		lru = LRU_UNEVICTABLE;
		ClearPageActive(page);
		SetPageUnevictable(page);
		// Newly unevictable: count it as culled
		if (!was_unevictable)
			__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
	}

	// Link the page into the chosen list of the lruvec
	add_page_to_lru_list(page, lruvec, lru);
	trace_mm_lru_insertion(page, lru);
}

/*
 * Copy @bytes bytes starting at @offset inside @page into the iterator @i.
 *
 * Returns the number of bytes copied, which may be fewer than @bytes;
 * 0 when page_copy_sane() rejects the arguments.
 */
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
			 struct iov_iter *i)
{
	/* Reject argument combinations that page_copy_sane() refuses. */
	if (unlikely(!page_copy_sane(page, offset, bytes)))
		return 0;

	/* bvec/kvec iterators: map the page and copy via copy_to_iter(). */
	if (i->type & (ITER_BVEC|ITER_KVEC)) {
		void *kaddr = kmap_atomic(page);
		size_t copied = copy_to_iter(kaddr + offset, bytes, i);

		kunmap_atomic(kaddr);
		return copied;
	}

	/* Discard iterator: no data is moved, only the count is consumed. */
	if (unlikely(iov_iter_is_discard(i))) {
		if (unlikely(i->count < bytes))
			bytes = i->count;
		i->count -= bytes;
		return bytes;
	}

	/* Pipe iterators have their own copy routine. */
	if (unlikely(iov_iter_is_pipe(i)))
		return copy_page_to_iter_pipe(page, offset, bytes, i);

	/*
	 * The ordinary case, a user iovec: the helper copies into the iovec
	 * buffers (the author's note says it also updates the iterator's
	 * count and related fields).
	 */
	return copy_page_to_iter_iovec(page, offset, bytes, i);
}

预读(readahead)部分我还没有完全搞明白,下面先列出相关源码并做简单注释,后面再详细分析……

/*
 * Synchronous readahead entry point: builds a readahead_control for
 * (@file, @mapping, @index) and passes it to page_cache_sync_ra().
 *
 * @index:     page index the caller needs now
 * @req_count: number of pages requested (generic_file_buffered_read()
 *             passes last_index - index)
 */
static inline
void page_cache_sync_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *file, pgoff_t index,
		unsigned long req_count)
{
	// Define a readahead control structure named ractl for this request
	DEFINE_READAHEAD(ractl, file, mapping, index);

	// Perform the synchronous readahead
	page_cache_sync_ra(&ractl, ra, req_count);
}

/*
 * Synchronous readahead: choose between the forced ("dumb") path and the
 * on-demand readahead algorithm.
 *
 * @ractl:     readahead control (file, mapping, start index)
 * @ra:        per-file readahead state
 * @req_count: number of pages requested
 */
void page_cache_sync_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, unsigned long req_count)
{
	/* Files flagged FMODE_RANDOM always take the forced path. */
	bool forced = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

	/*
	 * The readahead window is zero (ra_pages == 0) or
	 * blk_cgroup_congested() reports congestion: without a file there
	 * is nothing to do; otherwise read a single page via the forced path.
	 */
	if (!ra->ra_pages || blk_cgroup_congested()) {
		if (!ractl->file)
			return;
		req_count = 1;
		forced = true;
	}

	if (forced)
		/* be dumb: read exactly what was asked for */
		force_page_cache_ra(ractl, ra, req_count);
	else
		/* normal case: adaptive on-demand readahead */
		ondemand_readahead(ractl, ra, false, req_count);
}

/*
 * Forced readahead: read @nr_to_read pages starting at the index stored in
 * @ractl, in chunks of at most 2 MiB, without consulting the adaptive
 * readahead state.
 *
 * The request is capped at the larger of bdi->io_pages and ra->ra_pages.
 */
void force_page_cache_ra(struct readahead_control *ractl,
		struct file_ra_state *ra, unsigned long nr_to_read)
{
	struct address_space *mapping = ractl->mapping;
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long limit, next;

	/* Nothing can be read if the mapping provides no read method at all. */
	if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
			!mapping->a_ops->readahead))
		return;

	next = readahead_index(ractl);

	/* Upper bound: whichever is larger, the device's io_pages or ra_pages. */
	limit = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
	nr_to_read = min_t(unsigned long, nr_to_read, limit);

	while (nr_to_read) {
		/* One chunk is 2 MiB expressed in pages, or what is left. */
		unsigned long batch = (2 * 1024 * 1024) / PAGE_SIZE;

		if (nr_to_read < batch)
			batch = nr_to_read;

		/* Point the control structure at this chunk and read it. */
		ractl->_index = next;
		do_page_cache_ra(ractl, batch, 0);

		nr_to_read -= batch;
		next += batch;
	}
}


/*
 * Read ahead up to @nr_to_read pages starting at the index in @ractl,
 * clamped so that nothing beyond the page containing the last byte of the
 * file is requested.
 *
 * @lookahead_size is passed through unchanged to page_cache_ra_unbounded().
 */
void do_page_cache_ra(struct readahead_control *ractl,
		unsigned long nr_to_read, unsigned long lookahead_size)
{
	struct inode *inode = ractl->mapping->host;
	unsigned long index = readahead_index(ractl);
	/* Current file size */
	loff_t isize = i_size_read(inode);
	pgoff_t end_index;	/* The last page we want to read */
	unsigned long pages_after_index;

	/* Empty file: nothing to read. */
	if (!isize)
		return;

	/* Index of the page that holds the last byte of the file. */
	end_index = (isize - 1) >> PAGE_SHIFT;

	/* Starting beyond EOF: nothing to read. */
	if (index > end_index)
		return;

	/* Clamp the request to the pages from index through end_index. */
	pages_after_index = end_index - index;
	if (nr_to_read > pages_after_index)
		nr_to_read = pages_after_index + 1;

	/*
	 * Hand over to the unbounded variant (the author's note: it mainly
	 * calls the filesystem's readahead method).
	 */
	page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * On-demand readahead: decide the next readahead window (ra->start,
 * ra->size, ra->async_size) from the access pattern and submit it via
 * do_page_cache_ra().
 *
 * @ractl:                readahead control; its index is the page being
 *                        requested
 * @ra:                   per-file readahead state, updated in place
 * @hit_readahead_marker: true when the caller hit a page carrying the
 *                        readahead marker (page_cache_sync_ra() passes
 *                        false)
 * @req_size:             number of pages requested
 */
static void ondemand_readahead(struct readahead_control *ractl,
		struct file_ra_state *ra, bool hit_readahead_marker,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	unsigned long index = readahead_index(ractl);
	pgoff_t prev_index;

	// Reading from the very start of the file (index 0): set up an
	// initial window
	if (!index)
		goto initial_readahead;

	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_miss(ractl->mapping, index + 1,
				max_pages);
		rcu_read_unlock();

		/* No usable gap found within max_pages: do nothing */
		if (!start || start - index > max_pages)
			return;

		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(ractl->mapping, ra, index, req_size,
			max_pages))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	do_page_cache_ra(ractl, req_size, 0);
	return;

initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);

	// Part of the window treated as asynchronous: what lies beyond the
	// requested pages, or the whole window when it is not larger than
	// the request.
	// Author's note: the reader can return as soon as the pages it needs
	// have arrived, without waiting for the rest of the window.
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulted next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}

	/* Submit the window: start at ra->start, ra->size pages in total */
	ractl->_index = ra->start;
	do_page_cache_ra(ractl, ra->size, ra->async_size);
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值