Page cache issues with filesystems and raw block devices

When the filesystem reads a regular file, it normally goes through do_generic_file_read(), where mapping points to the regular file's address_space. If some block of the file is not yet in the page cache, find_get_page() comes back empty; the read path then allocates a new page and inserts it, keyed by index, into the file's address_space, either during readahead or in the no_cached_page fallback (an abridged sketch of that fallback follows the listing below).

static ssize_t do_generic_file_read(struct file *filp, loff_t *ppos,
        struct iov_iter *iter, ssize_t written)
{
    struct address_space *mapping = filp->f_mapping;
    struct inode *inode = mapping->host;
    struct file_ra_state *ra = &filp->f_ra;
    pgoff_t index;
    pgoff_t last_index;
    pgoff_t prev_index;
    unsigned long offset;      /* offset into pagecache page */
    unsigned int prev_offset;
    int error = 0;
 
    index = *ppos >> PAGE_CACHE_SHIFT;
    prev_index = ra->prev_pos >> PAGE_CACHE_SHIFT;
    prev_offset = ra->prev_pos & (PAGE_CACHE_SIZE-1);
    last_index = (*ppos + iter->count + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT;
    offset = *ppos & ~PAGE_CACHE_MASK;
 
    for (;;) {
        struct page *page;
        pgoff_t end_index;
        loff_t isize;
        unsigned long nr, ret;
 
        cond_resched();
find_page:
        page = find_get_page(mapping, index);
        if (!page) {
            page_cache_sync_readahead(mapping,
                    ra, filp,
                    index, last_index - index);
            page = find_get_page(mapping, index);
            if (unlikely(page == NULL))
                goto no_cached_page;
        }
       ......//roughly 200 more lines omitted here
}
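
The allocation itself happens inside the part omitted above. As a heavily abridged sketch (not the verbatim kernel source; the exact GFP flags and error handling differ between versions), the no_cached_page fallback in kernels of this era does roughly the following, and this is where the new page gets inserted into the file's address_space at the given index:

no_cached_page:
        /* Not cached: allocate a fresh page for this file offset. */
        page = page_cache_alloc_cold(mapping);
        if (!page) {
            error = -ENOMEM;
            goto out;
        }
        /* Insert it into the file's address_space at 'index'; the data
         * is then read in via mapping->a_ops->readpage(). */
        error = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
        if (error) {
            page_cache_release(page);
            if (error == -EEXIST)
                goto find_page;   /* someone else added it first */
            goto out;
        }
        goto readpage;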

The block device's address_space

When the filesystem reads its own metadata, however, the corresponding pages are added to the address_space of the underlying raw block device. In the code below, bd_mapping points to the block device's address_space; __find_get_block_slow() merely looks the page up with find_get_page_flags(). If the metadata block is not cached yet, the new page is allocated and inserted into the block device's address_space further along the buffer-cache path (see the sketch after the listing).

static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
    struct inode *bd_inode = bdev->bd_inode;
    struct address_space *bd_mapping = bd_inode->i_mapping;
    struct buffer_head *ret = NULL;
    pgoff_t index;
    struct buffer_head *bh;
    struct buffer_head *head;
    struct page *page;
    int all_mapped = 1;
 
    index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
    page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
    if (!page)
        goto out;
    ......//a few dozen lines omitted here
}
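
__find_get_block_slow() itself only does the lookup. When the metadata block is not cached, the allocation happens one level up in the buffer cache: __getblk() falls through to __getblk_slow() -> grow_buffers() -> grow_dev_page(), and it is grow_dev_page() that creates the page. Abridged (details vary across kernel versions), the key point is that the mapping it uses is the block device's own address_space, bdev->bd_inode->i_mapping:

/* Abridged from grow_dev_page() in fs/buffer.c (error handling and
 * version-specific details omitted) */
struct inode *inode = bdev->bd_inode;
struct page *page;

/* Allocate the page, if necessary, and add it to the BLOCK DEVICE's
 * page cache -- not to any regular file's address_space. */
page = find_or_create_page(inode->i_mapping, index, gfp_mask);
/* ... then attach buffer_heads to the page and map them to the
 * corresponding on-disk blocks ... */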

find_get_page_flags

static inline struct page *find_get_page_flags(struct address_space *mapping,
					pgoff_t offset, int fgp_flags)
{
	return pagecache_get_page(mapping, offset, fgp_flags, 0);
}

pagecache_get_page

/**
 * pagecache_get_page - find and get a page reference
 * @mapping: the address_space to search
 * @offset: the page index
 * @fgp_flags: PCG flags
 * @gfp_mask: gfp mask to use for the page cache data page allocation
 *
 * Looks up the page cache slot at @mapping & @offset.
 *
 * PCG flags modify how the page is returned.
 *
 * @fgp_flags can be:
 *
 * - FGP_ACCESSED: the page will be marked accessed
 * - FGP_LOCK: Page is return locked
 * - FGP_CREAT: If page is not present then a new page is allocated using
 *   @gfp_mask and added to the page cache and the VM's LRU
 *   list. The page is returned locked and with an increased
 *   refcount. Otherwise, NULL is returned.
 *
 * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
 * if the GFP flags specified for FGP_CREAT are atomic.
 *
 * If there is a page cache page, it is returned with an increased refcount.
 */
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
	int fgp_flags, gfp_t gfp_mask)
{
	struct page *page;
repeat:
	page = find_get_entry(mapping, offset);
	if (radix_tree_exceptional_entry(page))
		page = NULL;
	if (!page)
		goto no_page;
	if (fgp_flags & FGP_LOCK) {
		if (fgp_flags & FGP_NOWAIT) {
			if (!trylock_page(page)) {
				put_page(page);
				return NULL;
			}
		} else {
			lock_page(page);
		}
		/* Has the page been truncated? */
		if (unlikely(page->mapping != mapping)) {
			unlock_page(page);
			put_page(page);
			goto repeat;
		}
		VM_BUG_ON_PAGE(page->index != offset, page);
	}
	if (page && (fgp_flags & FGP_ACCESSED))
		mark_page_accessed(page);
no_page:
	if (!page && (fgp_flags & FGP_CREAT)) {
		int err;
		if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
			gfp_mask |= __GFP_WRITE;
		if (fgp_flags & FGP_NOFS)
			gfp_mask &= ~__GFP_FS;
		page = __page_cache_alloc(gfp_mask);
		if (!page)
			return NULL;
		if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
			fgp_flags |= FGP_LOCK;
		/* Init accessed so avoid atomic mark_page_accessed later */
		if (fgp_flags & FGP_ACCESSED)
			__SetPageReferenced(page);
		err = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
		if (unlikely(err)) {
			put_page(page);
			page = NULL;
			if (err == -EEXIST)
				goto repeat;
		}
	}
	return page;
}
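
For completeness: the FGP_LOCK|FGP_ACCESSED|FGP_CREAT combination used by the buffer-cache path described above is packaged in the find_or_create_page() wrapper (include/linux/pagemap.h), so a miss on a metadata block ends up in the no_page branch of pagecache_get_page(), where the page is allocated and added to the block device's address_space via add_to_page_cache_lru():

static inline struct page *find_or_create_page(struct address_space *mapping,
					pgoff_t offset, gfp_t gfp_mask)
{
	return pagecache_get_page(mapping, offset,
					FGP_LOCK|FGP_ACCESSED|FGP_CREAT,
					gfp_mask);
}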

find_get_entry

/**
 * find_get_entry - find and get a page cache entry
 * @mapping: the address_space to search
 * @offset: the page cache index
 *
 * Looks up the page cache slot at @mapping & @offset.  If there is a
 * page cache page, it is returned with an increased refcount.
 *
 * If the slot holds a shadow entry of a previously evicted page, or a
 * swap entry from shmem/tmpfs, it is returned.
 *
 * Otherwise, %NULL is returned.
 */
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
	void **pagep;
	struct page *head, *page;
	rcu_read_lock();
repeat:
	page = NULL;
	pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
	if (pagep) {
		page = radix_tree_deref_slot(pagep);
		if (unlikely(!page))
			goto out;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page))
				goto repeat;
			/*
			 * A shadow entry of a recently evicted page,
			 * or a swap entry from shmem/tmpfs.  Return
			 * it without attempting to raise page count.
			 */
			goto out;
		}
		head = compound_head(page);
		if (!page_cache_get_speculative(head))
			goto repeat;
		/* The page was split under us? */
		if (compound_head(page) != head) {
			put_page(head);
			goto repeat;
		}
		/*
		 * Has the page moved?
		 * This is part of the lockless pagecache protocol. See
		 * include/linux/pagemap.h for details.
		 */
		if (unlikely(page != *pagep)) {
			put_page(head);
			goto repeat;
		}
	}
out:
	rcu_read_unlock();
	return page;
}

Two cached copies

Everything described so far is the normal flow: pages of a regular file live in the file's address_space, pages holding filesystem metadata live in the block device's address_space, and the two coexist without getting in each other's way. But not everyone plays by the rules. With a filesystem running happily on a block device, what happens if someone bypasses the filesystem and writes directly to a data block on the device that belongs to a file? If that block is already cached in the regular file's address_space, is the direct modification reflected in the file's cache right away?
The answer is no. Modifying the block through the device node creates a page for that block and adds it to the block device's address_space. The same data block then has one page in the owning file's address_space and another page in the block device's address_space, and all of the direct modifications land on the latter. Unless the block is re-read from disk, the regular file's cached copy never notices the change.
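
A quick way to see the two independent copies from user space is to locate a file's first data block with the FIBMAP ioctl and read it once through the file and once through the device node; each read populates its own address_space. The program below is only an illustration: the file and device arguments are whatever you pass in, FIBMAP needs root (CAP_SYS_RAWIO), the file is assumed to start with plain text, and it should be fully written back (fsync'ed) before its block mapping is queried.

#include <fcntl.h>
#include <linux/fs.h>     /* FIBMAP, FIGETBSZ */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    if (argc != 3) {
        fprintf(stderr, "usage: %s <file> <block-device>\n", argv[0]);
        return 1;
    }

    int ffd = open(argv[1], O_RDONLY);
    int dfd = open(argv[2], O_RDONLY);
    if (ffd < 0 || dfd < 0) {
        perror("open");
        return 1;
    }

    /* Ask the filesystem where the file's first logical block lives
     * on the device (FIBMAP maps logical block 0 to a physical block). */
    int blksz = 0, blk = 0;
    if (ioctl(ffd, FIGETBSZ, &blksz) < 0 || ioctl(ffd, FIBMAP, &blk) < 0) {
        perror("ioctl");
        return 1;
    }
    off_t dev_off = (off_t)blk * blksz;

    /* Copy 1: reading through the file puts the page into the file's
     * address_space. */
    char fbuf[32] = { 0 }, dbuf[32] = { 0 };
    pread(ffd, fbuf, sizeof(fbuf) - 1, 0);

    /* Copy 2: reading the same bytes through the device node puts a
     * second, independent page into the block device's address_space. */
    pread(dfd, dbuf, sizeof(dbuf) - 1, dev_off);

    printf("file cache copy: %s\n", fbuf);
    printf("bdev cache copy: %s\n", dbuf);

    /* A write through the device node at dev_off would now only update
     * the second copy; the file's cached page stays stale until it is
     * dropped and re-read from disk. */
    close(ffd);
    close(dfd);
    return 0;
}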

Summary

A regular file's data can be cached in the file's own address_space; accessing the same file's blocks directly through the block device caches that data a second time in the block device's address_space. The two copies are completely independent, and the kernel does not synchronize them for this kind of out-of-band access, which spares it the cost of keeping them coherent.
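
If such an out-of-band write has happened and the file really must see it, the only recourse is to make the kernel re-read the block from disk. A minimal sketch of forcing that from user space, assuming the device-side copy has already been written back (for example via fdatasync() on the device fd or blockdev --flushbufs), is to drop the file's clean cached pages with posix_fadvise(POSIX_FADV_DONTNEED) so the next read goes to disk again; note this is best effort, not a guarantee. The path below is a placeholder.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Illustrative helper: drop this file's clean pages from the page cache
 * so the next read() is served from disk rather than from the stale copy
 * in the file's address_space. Best effort only. */
static int drop_file_cache(int fd)
{
    if (fsync(fd) < 0)                 /* make sure nothing of ours is dirty */
        return -1;
    return posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED);
}

int main(void)
{
    int fd = open("/tmp/testfile", O_RDONLY);   /* placeholder path */
    if (fd < 0) {
        perror("open");
        return 1;
    }
    int err = drop_file_cache(fd);
    if (err > 0)                       /* posix_fadvise returns an errno value */
        fprintf(stderr, "posix_fadvise: %s\n", strerror(err));
    else if (err < 0)
        perror("fsync");
    /* Subsequent reads of fd now go back to disk and pick up whatever was
     * written through the block device (once that copy is on disk). */
    close(fd);
    return 0;
}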
