linux-5.14.2-内核文件读取page缓存

内核读取页缓存

内核read文件一路跟踪最终会到vfs_read函数

ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	if (!(file->f_mode & FMODE_READ))
		return -EBADF;
	if (!(file->f_mode & FMODE_CAN_READ))
		return -EINVAL;
	if (unlikely(!access_ok(buf, count)))
		return -EFAULT;

	ret = rw_verify_area(READ, file, pos, count);
	if (ret)
		return ret;
	if (count > MAX_RW_COUNT)
		count =  MAX_RW_COUNT;

	if (file->f_op->read)
		ret = file->f_op->read(file, buf, count, pos);
	else if (file->f_op->read_iter)
		ret = new_sync_read(file, buf, count, pos);
	else
		ret = -EINVAL;
	if (ret > 0) {
		fsnotify_access(file);
		add_rchar(current, ret);
	}
	inc_syscr(current);
	return ret;
}


static ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct kiocb kiocb;
	struct iov_iter iter;
	ssize_t ret;

	init_sync_kiocb(&kiocb, filp);
	kiocb.ki_pos = (ppos ? *ppos : 0);
	iov_iter_init(&iter, READ, &iov, 1, len);

	ret = call_read_iter(filp, &kiocb, &iter);	//跳转到文件系统的操作函数如ext2的ext2_file_read_iter
	BUG_ON(ret == -EIOCBQUEUED);
	if (ppos)
		*ppos = kiocb.ki_pos;
	return ret;
}

以ext2文件系统的回调函数ext2_file_read_iter为例

static ssize_t ext2_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
#ifdef CONFIG_FS_DAX
	if (IS_DAX(iocb->ki_filp->f_mapping->host))
		return ext2_dax_read_iter(iocb, to);
#endif
	return generic_file_read_iter(iocb, to);
}
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
	size_t count = iov_iter_count(iter);
	ssize_t retval = 0;

	if (!count)
		return 0; /* skip atime */

	if (iocb->ki_flags & IOCB_DIRECT) {//直接从磁盘读
		struct file *file = iocb->ki_filp;
		struct address_space *mapping = file->f_mapping;
		struct inode *inode = mapping->host;
		loff_t size;

		size = i_size_read(inode);
		if (iocb->ki_flags & IOCB_NOWAIT) {
			if (filemap_range_needs_writeback(mapping, iocb->ki_pos,
						iocb->ki_pos + count - 1))
				return -EAGAIN;
		} else {
			retval = filemap_write_and_wait_range(mapping,
						iocb->ki_pos,
					        iocb->ki_pos + count - 1);
			if (retval < 0)
				return retval;
		}

		file_accessed(file);

		retval = mapping->a_ops->direct_IO(iocb, iter);
		if (retval >= 0) {
			iocb->ki_pos += retval;
			count -= retval;
		}
		if (retval != -EIOCBQUEUED)
			iov_iter_revert(iter, count - iov_iter_count(iter));

		/*
		 * Btrfs can have a short DIO read if we encounter
		 * compressed extents, so if there was an error, or if
		 * we've already read everything we wanted to, or if
		 * there was a short read because we hit EOF, go ahead
		 * and return.  Otherwise fallthrough to buffered io for
		 * the rest of the read.  Buffered reads will not work for
		 * DAX files, so don't bother trying.
		 */
		if (retval < 0 || !count || iocb->ki_pos >= size ||
		    IS_DAX(inode))
			return retval;
	}

	return filemap_read(iocb, iter, retval);
}

ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
		ssize_t already_read)
{
	struct file *filp = iocb->ki_filp;
	struct file_ra_state *ra = &filp->f_ra;
	struct address_space *mapping = filp->f_mapping;
	struct inode *inode = mapping->host;
	struct pagevec pvec;
	int i, error = 0;
	bool writably_mapped;
	loff_t isize, end_offset;

	if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
		return 0;
	if (unlikely(!iov_iter_count(iter)))
		return 0;

	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
	pagevec_init(&pvec);

	do {
		cond_resched();

		/*
		 * If we've already successfully copied some data, then we
		 * can no longer safely return -EIOCBQUEUED. Hence mark
		 * an async read NOWAIT at that point.
		 */
		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
			iocb->ki_flags |= IOCB_NOWAIT;

		error = filemap_get_pages(iocb, iter, &pvec);	//读取一批页缓存到pvec结构体中
		if (error < 0)
			break;

		/*
		 * i_size must be checked after we know the pages are Uptodate.
		 *
		 * Checking i_size after the check allows us to calculate
		 * the correct value for "nr", which means the zero-filled
		 * part of the page is not copied back to userspace (unless
		 * another truncate extends the file - this is desired though).
		 */
		isize = i_size_read(inode);	//文件大小
		if (unlikely(iocb->ki_pos >= isize))
			goto put_pages;
		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);	//停止读的最终位置,相对于文件的起始位置

		/*
		 * Once we start copying data, we don't want to be touching any
		 * cachelines that might be contended:
		 */
		writably_mapped = mapping_writably_mapped(mapping);

		/*
		 * When a sequential read accesses a page several times, only
		 * mark it as accessed the first time.
		 */
		if (iocb->ki_pos >> PAGE_SHIFT !=
		    ra->prev_pos >> PAGE_SHIFT)
			mark_page_accessed(pvec.pages[0]);

		for (i = 0; i < pagevec_count(&pvec); i++) {
			struct page *page = pvec.pages[i];
			size_t page_size = thp_size(page);	//一个页的大小一般为4k
			size_t offset = iocb->ki_pos & (page_size - 1);//偏移位置
			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
					     page_size - offset);
			size_t copied;

			if (end_offset < page_offset(page))	//结束位置小于页在文件中的位置,复制结束
				break;
			if (i > 0)
				mark_page_accessed(page);
			/*
			 * If users can be writing to this page using arbitrary
			 * virtual addresses, take care about potential aliasing
			 * before reading the page on the kernel side.
			 */
			if (writably_mapped) {
				int j;

				for (j = 0; j < thp_nr_pages(page); j++)
					flush_dcache_page(page + j);
			}

			copied = copy_page_to_iter(page, offset, bytes, iter);	//已复制的字节数

			already_read += copied;//表示已读取的字节数
			iocb->ki_pos += copied;	//文件偏移位置加上已读取文件的字节数
			ra->prev_pos = iocb->ki_pos;

			if (copied < bytes) {
				error = -EFAULT;
				break;
			}
		}
put_pages:
		for (i = 0; i < pagevec_count(&pvec); i++)
			put_page(pvec.pages[i]);
		pagevec_reinit(&pvec);
	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);

	file_accessed(filp);

	return already_read ? already_read : error;
}

其中关键函数为filemap_get_pages函数从inode指向的地址空间复制一批页缓存读取出来。

static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
		struct pagevec *pvec)
{
	struct file *filp = iocb->ki_filp;
	struct address_space *mapping = filp->f_mapping;
	struct file_ra_state *ra = &filp->f_ra;
	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;	//页的起始位置
	pgoff_t last_index;	//页的结束位置
	struct page *page;
	int err = 0;

	last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);	//最终位置的页编号last_index
retry:
	if (fatal_signal_pending(current))
		return -EINTR;

	filemap_get_read_batch(mapping, index, last_index, pvec);	//读取一批页缓存到pvec结构体中
	if (!pagevec_count(pvec)) {
		if (iocb->ki_flags & IOCB_NOIO)
			return -EAGAIN;
		page_cache_sync_readahead(mapping, ra, filp, index,
				last_index - index);
		filemap_get_read_batch(mapping, index, last_index, pvec);
	}
	if (!pagevec_count(pvec)) {
		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
			return -EAGAIN;
		err = filemap_create_page(filp, mapping,
				iocb->ki_pos >> PAGE_SHIFT, pvec);
		if (err == AOP_TRUNCATED_PAGE)
			goto retry;
		return err;
	}

	page = pvec->pages[pagevec_count(pvec) - 1];
	if (PageReadahead(page)) {
		err = filemap_readahead(iocb, filp, mapping, page, last_index);
		if (err)
			goto err;
	}
	if (!PageUptodate(page)) {
		if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
			iocb->ki_flags |= IOCB_NOWAIT;
		err = filemap_update_page(iocb, mapping, iter, page);
		if (err)
			goto err;
	}

	return 0;
err:
	if (err < 0)
		put_page(page);
	if (likely(--pvec->nr))
		return 0;
	if (err == AOP_TRUNCATED_PAGE)
		goto retry;
	return err;
}


static void filemap_get_read_batch(struct address_space *mapping,
		pgoff_t index, pgoff_t max, struct pagevec *pvec)
{
	XA_STATE(xas, &mapping->i_pages, index);
	struct page *head;

	rcu_read_lock();
	for (head = xas_load(&xas); head; head = xas_next(&xas)) {
		if (xas_retry(&xas, head))
			continue;
		if (xas.xa_index > max || xa_is_value(head))	//已读取的页位置大于需要读取的页的位置结束
			break;
		if (!page_cache_get_speculative(head))
			goto retry;

		/* Has the page moved or been split? */
		if (unlikely(head != xas_reload(&xas)))
			goto put_page;

		if (!pagevec_add(pvec, head))	//页加入到pvec数组中
			break;
		if (!PageUptodate(head))	//页标记置位
			break;
		if (PageReadahead(head))
			break;
		xas.xa_index = head->index + thp_nr_pages(head) - 1;
		xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;
		continue;
put_page:
		put_page(head);
retry:
		xas_reset(&xas);
	}
	rcu_read_unlock();
}

其中for (head = xas_load(&xas); head; head = xas_next(&xas))表示从地址空间指向的基数树中读取页即从struct xarray结构体中读取出来。index表示其实页的标号,max表示最大页的标号。例如从一个文件中偏移位置为8k+5的位置开始读,总共读11k字节的内容,假设一个页大小为4k,则index为2,max为5。

基数树结构体

#ifndef XA_CHUNK_SHIFT
#define XA_CHUNK_SHIFT		(CONFIG_BASE_SMALL ? 4 : 6)
#endif
#define XA_CHUNK_SIZE		(1UL << XA_CHUNK_SHIFT)
#define XA_CHUNK_MASK		(XA_CHUNK_SIZE - 1)
#define XA_MAX_MARKS		3
#define XA_MARK_LONGS		DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG)


struct xarray {
	spinlock_t	xa_lock;
/* private: The rest of the data structure is not to be used directly. */
	gfp_t		xa_flags;
	void __rcu *	xa_head; //指向 struct xa_node *
};




struct xa_node {
	unsigned char	shift;		/* Bits remaining in each slot 每个槽中剩余的位*/ 	//当前节点在整个要查找的index的bit位中的位置。当RADIX_TREE_MAP_SHIFT为6时,每6个bit位为一层。shift表示当前层在第几个6bit位的位置
	unsigned char	offset;		/* Slot offset in parent 父节点槽位偏移*/	//当前节点在父节点中array的偏移位置
	unsigned char	count;		/* Total entry count 总条目数*/
	unsigned char	nr_values;	/* Value entry count 值条目数*/
	struct xa_node __rcu *parent;	/* NULL at top of tree */ //树顶
	struct xarray	*array;		/* The array we belong to */ //我们所属的数组
	union {
		struct list_head private_list;	/* For tree user */
		struct rcu_head	rcu_head;	/* Used when freeing node */
	};
	void __rcu	*slots[XA_CHUNK_SIZE];
	union {
		unsigned long	tags[XA_MAX_MARKS][XA_MARK_LONGS];
		unsigned long	marks[XA_MAX_MARKS][XA_MARK_LONGS];
	};
};

其中XA_CHUNK_SHIFT表示一层节点有多少位,例如XA_CHUNK_SHIFT表示4时。index表示的64位数字时总共有16层,一层表示4位。



static unsigned int get_offset(unsigned long index, struct xa_node *node)
{
	return (index >> node->shift) & XA_CHUNK_MASK;
}

static inline void *xa_entry(const struct xarray *xa,
				const struct xa_node *node, unsigned int offset)
{
	XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE);
	return rcu_dereference_check(node->slots[offset],
						lockdep_is_held(&xa->xa_lock));	//此一层的偏移
}

static void *xas_descend(struct xa_state *xas, struct xa_node *node)
{
	unsigned int offset = get_offset(xas->xa_index, node);
	void *entry = xa_entry(xas->xa, node, offset);

	xas->xa_node = node;
	if (xa_is_sibling(entry)) {
		offset = xa_to_sibling(entry);
		entry = xa_entry(xas->xa, node, offset);
	}

	xas->xa_offset = offset;
	return entry;
}

void *xas_load(struct xa_state *xas)
{
	void *entry = xas_start(xas);

	while (xa_is_node(entry)) {	//从最高位开始依次读取,每次移动XA_CHUNK_SHIFT位
		struct xa_node *node = xa_to_node(entry);

		if (xas->xa_shift > node->shift)
			break;
		entry = xas_descend(xas, node);
		if (node->shift == 0)	//当前node的偏移量为0表示最后一个节点即xa_index的最低XA_CHUNK_SHIFT位时
			break;
	}
	return entry;
}
static inline void *xas_next(struct xa_state *xas)
{
	struct xa_node *node = xas->xa_node;

	if (unlikely(xas_not_node(node) || node->shift ||
				xas->xa_offset == XA_CHUNK_MASK))
		return __xas_next(xas);

	xas->xa_index++;	//节点数+1
	xas->xa_offset++;	//此一层的偏移量+1
	return xa_entry(xas->xa, node, xas->xa_offset);
}

struct xarray的本质思想为,先将一个需要查找的页标号index化成二进制表示,struct xarray的每一层表示表示long index其中的XA_CHUNK_SHIFT位一层一层查找用安慰与的方式算出在每一层的偏移。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值