文件的预读操作分析

最新推荐文章于 2022-10-27 15:28:50 发布

Jack-changtao

最新推荐文章于 2022-10-27 15:28:50 发布

阅读量3.5k

点赞数

文章标签： cache struct file random list access

本文链接：https://blog.csdn.net/changtao381/article/details/5006233

版权

文件的预读操作分析

预读的两个优势：

1 ）每个 IO 一般涉及一组较大相邻的扇区，预读使磁盘控制器处理较少的命令。

2 ）提高系统的响应能力，顺序读取的文件一般不需要等待请求的数据

文件的顺序访问：如果请求的第一页紧跟上次访问所请求的最后一页，就认为是顺序的。

1. 主要的数据结构

/* Track a single file's readahead state */

/*
 * Readahead bookkeeping kept for one open file.
 *
 * Two windows are tracked: the "current" window (start/size) and the
 * "ahead" window (ahead_start/ahead_size).
 */
struct file_ra_state {
	unsigned long start, size;	/* Current window: start and size */
	unsigned long flags;		/* RA_FLAG_xxx bits */
	unsigned long cache_hit;	/* Cache hit count */
	unsigned long prev_page;	/* Cache last read() position */
	unsigned long ahead_start, ahead_size;	/* Ahead window: start and size */
	unsigned long ra_pages;		/* Maximum readahead window */
	unsigned long mmap_hit;		/* Cache hit stat for mmap accesses */
	unsigned long mmap_miss;	/* Cache miss stat for mmap accesses */
};

/* Bits stored in file_ra_state.flags */
#define RA_FLAG_MISS 0x01 /* a cache miss occurred against this file */

#define RA_FLAG_INCACHE 0x02 /* file is already in cache */

2. 主要流程

2.1. 系统调用：设置预取策略

/*
 * sys_fadvise64 - fadvise entry point whose length argument is a size_t.
 *
 * Widens @len to loff_t and forwards all arguments unchanged to
 * sys_fadvise64_64(), whose return value is passed straight back.
 */
asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
{
	loff_t wide_len = len;

	return sys_fadvise64_64(fd, offset, wide_len, advice);
}

/*
 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
 * deactivate the pages and clear PG_Referenced.
 */

/*
 * sys_fadvise64_64 - apply a POSIX_FADV_* hint to a byte range of an open file.
 * @fd:     file descriptor the advice applies to
 * @offset: first byte of the range
 * @len:    length of the range in bytes; 0 means "as much as possible"
 * @advice: one of the POSIX_FADV_* constants
 *
 * Returns 0 on success (also when the mapping provides ->get_xip_page, in
 * which case the advice is ignored), -EBADF if @fd cannot be resolved,
 * -ESPIPE if the file is a FIFO, and -EINVAL for a missing mapping, a
 * negative @len, an unknown @advice, or POSIX_FADV_WILLNEED on a mapping
 * without ->readpage.  A negative result of force_page_cache_readahead()
 * is passed through unchanged.
 */
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
	struct file *file = fget(fd);
	struct address_space *mapping;
	struct backing_dev_info *bdi;
	loff_t endbyte;			/* inclusive */
	pgoff_t start_index;
	pgoff_t end_index;
	unsigned long nrpages;
	int ret = 0;

	if (!file)
		return -EBADF;

	if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
		ret = -ESPIPE;
		goto out;
	}

	mapping = file->f_mapping;
	if (!mapping || len < 0) {
		ret = -EINVAL;
		goto out;
	}

	if (mapping->a_ops->get_xip_page)
		/* no bad return value, but ignore advice */
		goto out;

	/*
	 * Careful about overflows. Len == 0 means "as much as possible".
	 * The sum is formed in unsigned arithmetic because overflowing a
	 * signed loff_t is undefined behaviour; the unsigned sum wraps, and
	 * once converted back the wrap is caught by the endbyte < len test.
	 */
	endbyte = (loff_t)((unsigned long long)offset + (unsigned long long)len);
	if (!len || endbyte < len)
		endbyte = -1;
	else
		endbyte--;		/* inclusive */

	bdi = mapping->backing_dev_info;

	switch (advice) {
	case POSIX_FADV_NORMAL:
		/* back to the device's own readahead limit */
		file->f_ra.ra_pages = bdi->ra_pages;
		break;
	case POSIX_FADV_RANDOM:
		/* a zero limit leaves no room for readahead */
		file->f_ra.ra_pages = 0;
		break;
	case POSIX_FADV_SEQUENTIAL:
		/* twice the device's readahead limit */
		file->f_ra.ra_pages = bdi->ra_pages * 2;
		break;
	case POSIX_FADV_WILLNEED:
		if (!mapping->a_ops->readpage) {
			ret = -EINVAL;
			break;
		}

		/* First and last PARTIAL page! */
		start_index = offset >> PAGE_CACHE_SHIFT;
		end_index = endbyte >> PAGE_CACHE_SHIFT;

		/* Careful about overflow on the "+1" */
		nrpages = end_index - start_index + 1;
		if (!nrpages)
			nrpages = ~0UL;

		/* force readahead of the requested page range */
		ret = force_page_cache_readahead(mapping, file,
				start_index,
				max_sane_readahead(nrpages));
		/* a positive result is not an error; report plain success */
		if (ret > 0)
			ret = 0;
		break;
	case POSIX_FADV_NOREUSE:
		break;
	case POSIX_FADV_DONTNEED:
		if (!bdi_write_congested(mapping->backing_dev_info))
			filemap_flush(mapping);

		/* First and last FULL page! */
		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
		end_index = (endbyte >> PAGE_CACHE_SHIFT);

		if (end_index >= start_index)
			invalidate_mapping_pages(mapping, start_index,
						end_index);
		break;
	default:
		ret = -EINVAL;
	}
out:
	/* drop the reference taken by fget() */
	fput(file);
	return ret;
}

2.2. 初始化

当一个文件被打开时，在它的 file_ra_state 描述符中，仅设置两个参数：

prev_page 为进程上一次读操作中的最后一页索引，初值为 -1

ra_pages 为当前窗的最大页数，即对该文件允许的最大预读量。其初值取自该文件所在设备的 backing_dev_info->ra_pages，应用程序可以通过系统调用 fadvise 来调整预读策略：

POSIX_FADV_NORMAL 最大预读量为缺省值 32

POSIX_FADV_SEQUENTIAL 最大预读量为缺省值的两倍

POSIX_FADV_RANDOM 最大预读量为 0

/*
 * Initialise a struct file's readahead state. Assumes that the caller has
 * memset *ra to zero.
 */

/*
 * file_ra_state_init - give a readahead state its starting values.
 * @ra:      state to set up; only prev_page and ra_pages are written here
 * @mapping: address_space whose backing_dev_info supplies the readahead limit
 */
void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	/* -1 is the "nothing has been read yet" marker */
	ra->prev_page = -1;
	/* start with the backing device's readahead limit */
	ra->ra_pages = mapping->backing_dev_info->ra_pages;
}

2.3. page_cache_readahead

本函数是预读的主要流程，主要设置当前窗和预读窗两个窗口。

/**

* page_cache_readahead - generic adaptive readahead

* @mapping: address_space which holds the pagecache and I/O vectors

* @ra: file_ra_state which holds the readahead state

* @filp: passed on to ->readpage() and ->readpages()

* @offset: start offset into @mapping, in PAGE_CACHE_SIZE units

* @req_size: hint: total size of the read which the caller is performing in

* PAGE_CACHE_SIZE units

* page_cache_readahead() is the main function. If performs the adaptive

* readahead window size management and submits the readahead I/O.

* Note that @filp is purely used for passing on to the ->readpage[s]()

* handler: it may refer to a different file from @mapping (so we may not use

* @filp->f_mapping or @filp->f_dentry->d_inode here).

* Also, @ra may not be equal to &@filp->f_ra.

unsigned long

page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,

struct file *filp, pgoff_t offset, unsigned long req_size)

{

unsigned long max, newsize;

int sequential;

* We avoid doing extra work and bogusly perturbing the readahead

* window expansion logic.

if (offset == ra->prev_page && --req_size)

++offset;

/* Note that prev_page == -1 if it is a first read */

// 如果要读的页 offset 正好等于上次进程请求的最后一页，标记为顺序

// 初始化化时 ra->prev_page 等于 -1 ，也就是开始 offset 等于 0 时才有预读？

sequential = (offset == ra->prev_page + 1);

ra->prev_page = offset;

max = get_max_readahead(ra);

newsize = min(req_size, max);

/* No readahead or sub-page sized read or file already in cache */

// 没有文件预读或者文件已经在高速缓存内

if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))

goto out;

ra->prev_page += newsize - 1;

* Special case - first read at start of file. We'll assume it's

* a whole-file read and grow the window fast. Or detect first

* sequential access

/* 顺序读且当前窗为空（第一次读） */

if (sequential && ra->size == 0) {

ra->size = get_init_ra_size(newsize, max); // 获得初始化的当前窗的大小

ra->start = offset; // 设置当前窗的开始

// 对当前窗进行 I/O 操作

if (!blockable_page_cache_readahead(mapping, filp, offset,

ra->size, ra, 1))

goto out;

* If the request size is larger than our max readahead, we

* at least want to be sure that we get 2 IOs in flight and

* we know that we will definitly need the new I/O.

* once we do this, subsequent calls should be able to overlap

* IOs,* thus preventing stalls. so issue the ahead window

* immediately.

/* 如果请求的页数大于当前窗的最大值，创建一个预读窗，并对其进行 IO 操作 */

if (req_size >= max)

make_ahead_window(mapping, filp, ra, 1);

goto out;

}

* Now handle the random case:

* partial page reads and first access were handled above,

* so this must be the next page otherwise it is random

如果不是顺序的，则关闭预读

if (!sequential) {

ra_off(ra);

blockable_page_cache_readahead(mapping, filp, offset,

newsize, ra, 1);

goto out;

}

* If we get here we are doing sequential IO and this was not the first

* occurence (ie we have an existing window)

顺序读且不是一个首次读，没有预读窗

if (ra->ahead_start == 0) { /* no ahead window yet */

if (!make_ahead_window( mapping, filp, ra, 0))

goto recheck;

}

* Already have an ahead window, check if we crossed into it.

* If so, shift windows and issue a new ahead window.

* Only return the #pages that are in the current window, so that

* we get called back on the first page of the ahead window which

* will allow us to submit more IO.

顺序读且不是一个首次读，有预读窗，且当前窗和预读窗有重叠

则预读窗为当前窗，重新构建预读窗

if (ra->prev_page >= ra->ahead_start) {

ra->start = ra->ahead_start;

ra->size = ra->ahead_size;

make_ahead_window(mapping, filp, ra, 0);

recheck:

/* prev_page shouldn't overrun the ahead window */

// 进程最后一次访问的页号必须小于预读窗

ra->prev_page = min(ra->prev_page,

ra->ahead_start + ra->ahead_size - 1);

}

out:

return ra->prev_page + 1;

}

预读操作的主要流程图如下：

get_init_ra_size

/*
 * Set the initial window size, round to next power of 2 and square
 * for small size, x 4 for medium, and x 2 for large
 * for 128k (32 page) max ra
 * 1-8 page = 32k initial, > 8 page = 128k initial
 *
 * 设置初始窗口大小：首先向上取整到 2 的幂；
 * 若不超过最大值的 1/32，则放大为 4 倍；
 * 否则若不超过最大值的 1/4，则放大为 2 倍；
 * 否则直接设为最大值。
 */

/*
 * get_init_ra_size - choose the size of the first readahead window.
 * @size: size of the read being served
 * @max:  upper bound for the window
 *
 * @size is first rounded up to a power of two.  A rounded value of at
 * most @max / 32 is quadrupled, one of at most @max / 4 is doubled, and
 * anything larger is replaced by @max itself.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long rounded = roundup_pow_of_two(size);

	if (rounded <= max / 32)
		return rounded * 4;
	if (rounded <= max / 4)
		return rounded * 2;
	return max;
}

make_ahead_window

/*
 * make_ahead_window - set up the ahead window and submit I/O for it.
 * @mapping: address_space to read from
 * @filp:    passed through to blockable_page_cache_readahead()
 * @ra:      readahead state; ahead_start and ahead_size are rewritten
 * @force:   non-zero to submit in blocking mode unconditionally
 *
 * Returns the result of blockable_page_cache_readahead() for the new
 * ahead window.  If that result is 0 and @force is 0, the ahead window is
 * reset before returning.
 */
static int make_ahead_window(struct address_space *mapping, struct file *filp,
			     struct file_ra_state *ra, int force)
{
	int block, ret;

	/* size of the next window */
	ra->ahead_size = get_next_ra_size(ra);
	/* the ahead window starts where the current window ends */
	ra->ahead_start = ra->start + ra->size;

	/*
	 * Block if forced, or if the last page of the request already lies
	 * at or beyond the start of the ahead window.
	 */
	block = force || (ra->prev_page >= ra->ahead_start);
	ret = blockable_page_cache_readahead(mapping, filp,
			ra->ahead_start, ra->ahead_size, ra, block);

	/* not successful and not forced: reset the ahead window */
	if (!ret && !force) {
		/* A read failure in blocking mode, implies pages are
		 * all cached. So we can safely assume we have taken
		 * care of all the pages requested in this call.
		 * A read failure in non-blocking mode, implies we are
		 * reading more pages than requested in this call.  So
		 * we safely assume we have taken care of all the pages
		 * requested in this call.
		 *
		 * Just reset the ahead window in case we failed due to
		 * congestion.  The ahead window will any way be closed
		 * in case we failed due to excessive page cache hits.
		 */
		reset_ahead_window(ra);
	}

	return ret;
}

get_next_ra_size

/*
 * Set the new window size, this is called only when I/O is to be submitted,
 * not for each call to readahead.  If a cache miss occurred, reduce next I/O
 * size, else increase depending on how close to max we are.
 */

/*
 * get_next_ra_size - compute the size of the next readahead window.
 * @ra: readahead state; a pending RA_FLAG_MISS is consumed (cleared) here
 *
 * After a miss the window shrinks by two pages but never below the
 * minimum; otherwise it grows fourfold while under max / 16 and twofold
 * after that.  The result never exceeds the maximum.
 */
static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
{
	unsigned long max = get_max_readahead(ra);	/* upper bound */
	unsigned long min = get_min_readahead(ra);	/* lower bound after a miss */
	unsigned long cur = ra->size;
	unsigned long newsize;

	if (ra->flags & RA_FLAG_MISS) {
		ra->flags &= ~RA_FLAG_MISS;
		/*
		 * cur is unsigned: only subtract when the result cannot wrap.
		 * For a window of two pages or fewer the shrunken size is
		 * simply the minimum.
		 */
		newsize = (cur > 2) ? max(cur - 2, min) : min;
	} else if (cur < max / 16) {
		newsize = 4 * cur;
	} else {
		newsize = 2 * cur;
	}
	return min(newsize, max);
}

handle_ra_miss

/*
 * handle_ra_miss() is called when it is known that a page which should have
 * been present in the pagecache (we just did some readahead there) was in fact
 * not found.  This will happen if it was evicted by the VM (readahead
 * thrashing)
 *
 * Turn on the cache miss flag in the RA struct, this will cause the RA code
 * to reduce the RA size on the next read.
 */

/*
 * handle_ra_miss - record a readahead miss in @ra.
 * @mapping: not used by this function
 * @ra:      readahead state to update
 * @offset:  not used by this function
 *
 * Sets RA_FLAG_MISS, clears RA_FLAG_INCACHE and zeroes the cache-hit count.
 */
void handle_ra_miss(struct address_space *mapping,
		    struct file_ra_state *ra, pgoff_t offset)
{
	ra->cache_hit = 0;
	ra->flags = (ra->flags | RA_FLAG_MISS) & ~RA_FLAG_INCACHE;
}

blockable_page_cache_readahead

/*
 * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
 * is set wait till the read completes.  Otherwise attempt to read without
 * blocking.
 * Returns 1 meaning 'success' if read is successful without switching off
 * readahead mode. Otherwise return failure.
 */

/*
 * blockable_page_cache_readahead - submit readahead, optionally refusing
 * to do so while the device is congested.
 * @mapping:    address_space to read from
 * @filp:       passed through to __do_page_cache_readahead()
 * @offset:     first page to read
 * @nr_to_read: number of pages to read
 * @ra:         readahead state, updated by check_ra_success()
 * @block:      when 0, return 0 immediately if the backing device reports
 *              read congestion
 *
 * Returns 0 when skipped because of congestion, otherwise the verdict of
 * check_ra_success().
 */
static int
blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
			       pgoff_t offset, unsigned long nr_to_read,
			       struct file_ra_state *ra, int block)
{
	if (!block) {
		if (bdi_read_congested(mapping->backing_dev_info))
			return 0;
	}

	return check_ra_success(ra, nr_to_read,
			__do_page_cache_readahead(mapping, filp, offset,
						  nr_to_read));
}

check_ra_success

/*
 * Check how effective readahead is being.  If the amount of started IO is
 * less than expected then the file is partly or fully in pagecache and
 * readahead isn't helping.
 */

/*
 * check_ra_success - update the cache-hit accounting after a readahead.
 * @ra:         readahead state
 * @nr_to_read: number of pages the readahead was asked for
 * @actual:     number of pages for which I/O was actually started
 *
 * Returns 0 if readahead has just been switched off because the running
 * cache-hit count reached VM_MAX_CACHE_HIT, 1 otherwise.
 */
static inline int check_ra_success(struct file_ra_state *ra,
			unsigned long nr_to_read, unsigned long actual)
{
	/* some I/O was started: the hit streak is over */
	if (actual != 0) {
		ra->cache_hit = 0;
		return 1;
	}

	/*
	 * No I/O was started for this request, so count its pages as hits.
	 * Below the threshold nothing else changes.
	 */
	ra->cache_hit += nr_to_read;
	if (ra->cache_hit < VM_MAX_CACHE_HIT)
		return 1;

	/* threshold reached: turn readahead off and flag the file as cached */
	ra_off(ra);
	ra->flags |= RA_FLAG_INCACHE;
	return 0;
}

__do_page_cache_readahead

* Readahead design.

* The fields in struct file_ra_state represent the most-recently-executed

* readahead attempt:

* start: Page index at which we started the readahead

* size: Number of pages in that read

* Together, these form the "current window".

* Together, start and size represent the `readahead window'.

* prev_page: The page which the readahead algorithm most-recently inspected.

* It is mainly used to detect sequential file reading.

* If page_cache_readahead sees that it is again being called for

* a page which it just looked at, it can return immediately without

* making any state changes.

* ahead_start,

* ahead_size: Together, these form the "ahead window".

* ra_pages: The externally controlled max readahead for this fd.

* When readahead is in the off state (size == 0), readahead is disabled.

* In this state, prev_page is used to detect the resumption of sequential I/O.

* The readahead code manages two windows - the "current" and the "ahead"

* windows. The intent is that while the application is walking the pages

* in the current window, I/O is underway on the ahead window. When the

* current window is fully traversed, it is replaced by the ahead window

* and the ahead window is invalidated. When this copying happens, the

* new current window's pages are probably still locked. So

* we submit a new batch of I/O immediately, creating a new ahead window.

* So:

* ----|----------------|----------------|-----

* ^start ^start+size

* ^ahead_start ^ahead_start+ahead_size

* ^ When this page is read, we submit I/O for the

* ahead window.

* A `readahead hit' occurs when a read request is made against a page which is

* the next sequential page. Ahead window calculations are done only when it

* is time to submit a new IO. The code ramps up the size aggressively at first,

* but slows down as it approaches max_readahead.

* Any seek/random IO will result in readahead being turned off. It will resume

* at the first sequential access.

* There is a special-case: if the first page which the application tries to

* read happens to be the first page of the file, it is assumed that a linear

* read is about to happen and the window is immediately set to the initial size

* based on I/O request size and the max_readahead.

* This function is to be called for every read request, rather than when

* it is time to perform readahead. It is called only once for the entire I/O

* regardless of size unless readahead is unable to start enough I/O to satisfy

* the request (I/O request > max_readahead).

* do_page_cache_readahead actually reads a chunk of disk. It allocates all

* the pages first, then submits them all for I/O. This avoids the very bad

* behaviour which would occur if page allocations are causing VM writeback.

* We really don't want to intermingle reads and writes like that.

* Returns the number of pages requested, or the maximum amount of I/O allowed.

* do_page_cache_readahead() returns -1 if it encountered request queue

* congestion.

static int

__do_page_cache_readahead(struct address_space *mapping, struct file *filp,

pgoff_t offset, unsigned long nr_to_read)

{

struct inode *inode = mapping->host;

struct page *page;

unsigned long end_index; /* The last page we want to read */

LIST_HEAD(page_pool);

int page_idx;

int ret = 0;

loff_t isize = i_size_read(inode);

if (isize == 0)

goto out;

end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

* Preallocate as many pages as we will need.

read_lock_irq(&mapping->tree_lock);

for (page_idx = 0; page_idx < nr_to_read; page_idx++) {

pgoff_t page_offset = offset + page_idx;

if (page_offset > end_index)

break;

// 如果已在缓存中，跳过

page = radix_tree_lookup(&mapping->page_tree, page_offset);

if (page)

continue;

read_unlock_irq(&mapping->tree_lock);

// 分配新的页框

page = page_cache_alloc_cold(mapping);

read_lock_irq(&mapping->tree_lock);

if (!page)

break;

page->index = page_offset;

list_add(&page->lru, &page_pool);

ret++;

}

read_unlock_irq(&mapping->tree_lock);

* Now start the IO. We ignore I/O errors - if the page is not

* uptodate then the caller will launch readpage again, and

* will then handle the error.

if (ret)

read_pages(mapping, filp, &page_pool, ret);

BUG_ON(!list_empty(&page_pool));

out:

return ret;

}

read_pages

/*
 * read_pages - start reads for a list of freshly allocated pages.
 * @mapping:  address_space the pages belong to
 * @filp:     passed on to ->readpages() / ->readpage()
 * @pages:    list of pages, linked through page->lru
 * @nr_pages: number of pages on @pages
 *
 * If the mapping provides ->readpages(), the whole list is handed to it
 * and its return value is returned.  Otherwise the pages are taken off
 * the list one at a time, inserted into the page cache and read with
 * ->readpage(); in that case the function returns 0.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
		      struct list_head *pages, unsigned nr_pages)
{
	struct pagevec lru_pvec;
	unsigned remaining;

	if (mapping->a_ops->readpages)
		return mapping->a_ops->readpages(filp, mapping, pages, nr_pages);

	pagevec_init(&lru_pvec, 0);
	for (remaining = nr_pages; remaining != 0; remaining--) {
		struct page *page = list_to_page(pages);

		list_del(&page->lru);
		/* non-zero result: the page is not read; release it */
		if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
			page_cache_release(page);
			continue;
		}

		mapping->a_ops->readpage(filp, page);
		/* presumably pagevec_add() returns 0 once the vector is full */
		if (!pagevec_add(&lru_pvec, page))
			__pagevec_lru_add(&lru_pvec);
	}
	/* hand over whatever is still batched in the pagevec */
	pagevec_lru_add(&lru_pvec);

	return 0;
}

Jack-changtao

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫