文件的预读操作分析

 

文件的预读操作分析

预读的两个优势:

1 )每个 IO 一般涉及一组较大相邻的扇区,预读使磁盘控制器处理较少的命令。

2 )提高系统的响应能力,顺序读取的文件一般不需要等待请求的数据

 

文件的顺序访问:如果请求的第一页紧跟上次访问所请求的最后一页,就认为是顺序的。

 

1.   主要的数据结构

/*
 * Track a single file's readahead state.
 *
 * start/size describe the "current window"; ahead_start/ahead_size
 * describe the "ahead window" whose I/O is submitted in the background
 * while the application walks the current window.
 */
struct file_ra_state {
	unsigned long start;		/* first page of the current window */
	unsigned long size;		/* number of pages in the current window */
	unsigned long flags;		/* ra flags RA_FLAG_xxx */
	unsigned long cache_hit;	/* consecutive page-cache hit count; a large
					   value means readahead is not helping */
	unsigned long prev_page;	/* last page of the previous read(); used
					   to detect sequential access */
	unsigned long ahead_start;	/* first page of the ahead window */
	unsigned long ahead_size;	/* number of pages in the ahead window */
	unsigned long ra_pages;		/* maximum readahead window, in pages */
	unsigned long mmap_hit;		/* cache hit stat for mmap accesses */
	unsigned long mmap_miss;	/* cache miss stat for mmap accesses */
};

#define RA_FLAG_MISS 0x01	/* a cache miss occurred against this file */
#define RA_FLAG_INCACHE 0x02	/* file is already in cache */

2.   主要流程

2.1.  系统调用:设置预取策略

/*
 * fadvise64(): compatibility entry point with a size_t length;
 * simply forwards to the 64-bit variant.
 */
asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
{
	return sys_fadvise64_64(fd, offset, len, advice);
}

 

/*
 * fadvise64_64(): let the application declare its expected access pattern
 * for a file range so the kernel can tune readahead / page cache policy.
 *
 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
 * deactivate the pages and clear PG_Referenced.
 */
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
{
	struct file *file = fget(fd);
	struct address_space *mapping;
	struct backing_dev_info *bdi;
	loff_t endbyte;			/* inclusive */
	pgoff_t start_index;
	pgoff_t end_index;
	unsigned long nrpages;
	int ret = 0;

	if (!file)
		return -EBADF;

	/* advice is meaningless for pipes/FIFOs */
	if (S_ISFIFO(file->f_dentry->d_inode->i_mode)) {
		ret = -ESPIPE;
		goto out;
	}

	mapping = file->f_mapping;
	if (!mapping || len < 0) {
		ret = -EINVAL;
		goto out;
	}

	/* execute-in-place files bypass the page cache entirely */
	if (mapping->a_ops->get_xip_page)
		/* no bad return value, but ignore advice */
		goto out;

	/* Careful about overflows. Len == 0 means "as much as possible" */
	endbyte = offset + len;
	if (!len || endbyte < len)
		endbyte = -1;
	else
		endbyte--;		/* inclusive */

	bdi = mapping->backing_dev_info;

	switch (advice) {
	case POSIX_FADV_NORMAL:
		/* restore the backing device's default max readahead */
		file->f_ra.ra_pages = bdi->ra_pages;
		break;
	case POSIX_FADV_RANDOM:
		/* random access: disable readahead altogether */
		file->f_ra.ra_pages = 0;
		break;
	case POSIX_FADV_SEQUENTIAL:
		/* sequential access: allow twice the default readahead */
		file->f_ra.ra_pages = bdi->ra_pages * 2;
		break;
	case POSIX_FADV_WILLNEED:
		if (!mapping->a_ops->readpage) {
			ret = -EINVAL;
			break;
		}

		/* First and last PARTIAL page! */
		start_index = offset >> PAGE_CACHE_SHIFT;
		end_index = endbyte >> PAGE_CACHE_SHIFT;

		/* Careful about overflow on the "+1" */
		nrpages = end_index - start_index + 1;
		if (!nrpages)
			nrpages = ~0UL;
		/* force readahead of the requested page range */
		ret = force_page_cache_readahead(mapping, file,
				start_index,
				max_sane_readahead(nrpages));
		if (ret > 0)
			ret = 0;
		break;
	case POSIX_FADV_NOREUSE:
		break;
	case POSIX_FADV_DONTNEED:
		/* start writeback first so dirty pages become discardable */
		if (!bdi_write_congested(mapping->backing_dev_info))
			filemap_flush(mapping);

		/* First and last FULL page! */
		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
		end_index = (endbyte >> PAGE_CACHE_SHIFT);

		if (end_index >= start_index)
			invalidate_mapping_pages(mapping, start_index,
						end_index);
		break;
	default:
		ret = -EINVAL;
	}
out:
	fput(file);
	return ret;
}

2.2.  初始化

当一个文件被打开时,在它的 file_ra_state 描述符中,仅设置两个参数:

prev_page 为进程上一次读操作中的最后一页索引,初值为 -1

ra_pages 为当前窗的最大页数,即对文件允许的最大预读量。存放在该文件所在设备的 backing_dev_info->ra_pages 中,应用程序通过系统调用 fadvise 中来设置策略

POSIX_FADV_NORMAL                  最大预读量为缺省值 32

POSIX_FADV_SEQUENTIAL      最大预读量为缺省值的两倍

POSIX_FADV_RANDOM                 最大预读量为 0

/*
 * Initialise a struct file's readahead state.  Assumes that the caller has
 * memset *ra to zero.
 */
void file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
	/* default max readahead comes from the backing device */
	ra->ra_pages = mapping->backing_dev_info->ra_pages;
	/* -1 so a first read at offset 0 is detected as sequential */
	ra->prev_page = -1;
}

 

2.3.  page_cache_read_ahead

本函数是预读的主要流程,主要设置当前窗和预读窗两个窗口。

/**
 * page_cache_readahead - generic adaptive readahead
 * @mapping: address_space which holds the pagecache and I/O vectors
 * @ra: file_ra_state which holds the readahead state
 * @filp: passed on to ->readpage() and ->readpages()
 * @offset: start offset into @mapping, in PAGE_CACHE_SIZE units
 * @req_size: hint: total size of the read which the caller is performing in
 *            PAGE_CACHE_SIZE units
 *
 * page_cache_readahead() is the main function.  It performs the adaptive
 * readahead window size management and submits the readahead I/O.
 *
 * Note that @filp is purely used for passing on to the ->readpage[s]()
 * handler: it may refer to a different file from @mapping (so we may not use
 * @filp->f_mapping or @filp->f_dentry->d_inode here).
 * Also, @ra may not be equal to &@filp->f_ra.
 */
unsigned long
page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
			struct file *filp, pgoff_t offset, unsigned long req_size)
{
	unsigned long max, newsize;
	int sequential;

	/*
	 * We avoid doing extra work and bogusly perturbing the readahead
	 * window expansion logic.
	 */
	if (offset == ra->prev_page && --req_size)
		++offset;

	/* Note that prev_page == -1 if it is a first read */
	/*
	 * Sequential iff the requested page immediately follows the last
	 * page of the previous request (prev_page starts at -1, so a first
	 * read at offset 0 also counts as sequential).
	 */
	sequential = (offset == ra->prev_page + 1);
	ra->prev_page = offset;

	max = get_max_readahead(ra);
	newsize = min(req_size, max);

	/* No readahead or sub-page sized read or file already in cache */
	if (newsize == 0 || (ra->flags & RA_FLAG_INCACHE))
		goto out;

	ra->prev_page += newsize - 1;

	/*
	 * Special case - first read at start of file. We'll assume it's
	 * a whole-file read and grow the window fast.  Or detect first
	 * sequential access
	 */
	/* sequential read with no current window yet (first read) */
	if (sequential && ra->size == 0) {
		ra->size = get_init_ra_size(newsize, max);	/* initial current window */
		ra->start = offset;				/* current window start */
		/* submit I/O for the current window */
		if (!blockable_page_cache_readahead(mapping, filp, offset,
							ra->size, ra, 1))
			goto out;

		/*
		 * If the request size is larger than our max readahead, we
		 * at least want to be sure that we get 2 IOs in flight and
		 * we know that we will definitely need the new I/O.
		 * once we do this, subsequent calls should be able to overlap
		 * IOs, thus preventing stalls. so issue the ahead window
		 * immediately.
		 */
		if (req_size >= max)
			make_ahead_window(mapping, filp, ra, 1);

		goto out;
	}

	/*
	 * Now handle the random case:
	 * partial page reads and first access were handled above,
	 * so this must be the next page otherwise it is random
	 */
	/* not sequential: switch readahead off and read only what was asked */
	if (!sequential) {
		ra_off(ra);
		blockable_page_cache_readahead(mapping, filp, offset,
				newsize, ra, 1);
		goto out;
	}

	/*
	 * If we get here we are doing sequential IO and this was not the first
	 * occurrence (ie we have an existing window)
	 */
	/* sequential, existing current window, but no ahead window yet */
	if (ra->ahead_start == 0) {		/* no ahead window yet */
		if (!make_ahead_window(mapping, filp, ra, 0))
			goto recheck;
	}

	/*
	 * Already have an ahead window, check if we crossed into it.
	 * If so, shift windows and issue a new ahead window.
	 * Only return the #pages that are in the current window, so that
	 * we get called back on the first page of the ahead window which
	 * will allow us to submit more IO.
	 */
	/*
	 * The read has reached the ahead window: promote it to be the
	 * current window and build a fresh ahead window.
	 */
	if (ra->prev_page >= ra->ahead_start) {
		ra->start = ra->ahead_start;
		ra->size = ra->ahead_size;
		make_ahead_window(mapping, filp, ra, 0);
recheck:
		/* prev_page shouldn't overrun the ahead window */
		ra->prev_page = min(ra->prev_page,
			ra->ahead_start + ra->ahead_size - 1);
	}

out:
	return ra->prev_page + 1;
}

 

预读操作的主要流程图如下:

get_init_ra_size

/*
 * Set the initial window size, round to next power of 2 and square
 * for small size, x 4 for medium, and x 2 for large
 * for 128k (32 page) max ra
 * 1-8 page = 32k initial, > 8 page = 128k initial
 */
/*
 * In other words: round the request up to a power of two, then
 *   - a size up to max/32 is quadrupled,
 *   - a size up to max/4 is doubled,
 *   - anything larger starts straight at the maximum.
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow_of_two(size);

	if (newsize <= max / 32)
		newsize = newsize * 4;
	else if (newsize <= max / 4)
		newsize = newsize * 2;
	else
		newsize = max;
	return newsize;
}

 

make_ahead_window

/*
 * Build a new ahead window directly after the current window and submit
 * its I/O.  Returns the result of the readahead attempt.
 */
static int make_ahead_window(struct address_space *mapping, struct file *filp,
				struct file_ra_state *ra, int force)
{
	int block, ret;

	/* size of the next ahead window */
	ra->ahead_size = get_next_ra_size(ra);
	/* the ahead window starts right after the current window */
	ra->ahead_start = ra->start + ra->size;

	/* block if forced, or if the reader already reached the ahead window */
	block = force || (ra->prev_page >= ra->ahead_start);
	ret = blockable_page_cache_readahead(mapping, filp,
			ra->ahead_start, ra->ahead_size, ra, block);
	/* on failure (and not forced), drop the ahead window again */
	if (!ret && !force) {
		/* A read failure in blocking mode, implies pages are
		 * all cached. So we can safely assume we have taken
		 * care of all the pages requested in this call.
		 * A read failure in non-blocking mode, implies we are
		 * reading more pages than requested in this call.  So
		 * we safely assume we have taken care of all the pages
		 * requested in this call.
		 *
		 * Just reset the ahead window in case we failed due to
		 * congestion.  The ahead window will any way be closed
		 * in case we failed due to excessive page cache hits.
		 */
		reset_ahead_window(ra);
	}

	return ret;
}

 

get_next_ra_size

/*
 * Set the new window size, this is called only when I/O is to be submitted,
 * not for each call to readahead.  If a cache miss occurred, reduce next I/O
 * size, else increase depending on how close to max we are.
 */
static inline unsigned long get_next_ra_size(struct file_ra_state *ra)
{
	unsigned long max = get_max_readahead(ra);	/* typically 32 pages */
	unsigned long min = get_min_readahead(ra);	/* typically 4 pages */
	unsigned long cur = ra->size;
	unsigned long newsize;

	if (ra->flags & RA_FLAG_MISS) {
		/* readahead thrashed: clear the flag and shrink the window */
		ra->flags &= ~RA_FLAG_MISS;
		newsize = max((cur - 2), min);
	} else if (cur < max / 16) {
		/* far below the limit: grow aggressively (x4) */
		newsize = 4 * cur;
	} else {
		/* approaching the limit: grow gently (x2) */
		newsize = 2 * cur;
	}
	/* never exceed the per-file maximum */
	return min(newsize, max);
}

 

handle_ra_miss

/*
 * handle_ra_miss() is called when it is known that a page which should have
 * been present in the pagecache (we just did some readahead there) was in fact
 * not found.  This will happen if it was evicted by the VM (readahead
 * thrashing)
 *
 * Turn on the cache miss flag in the RA struct, this will cause the RA code
 * to reduce the RA size on the next read.
 */
void handle_ra_miss(struct address_space *mapping,
		struct file_ra_state *ra, pgoff_t offset)
{
	ra->flags |= RA_FLAG_MISS;
	ra->flags &= ~RA_FLAG_INCACHE;	/* evidently not fully cached */
	ra->cache_hit = 0;		/* restart the hit streak */
}

 

blockable_page_cache_readahead

 

/*
 * Read 'nr_to_read' pages starting at page 'offset'. If the flag 'block'
 * is set wait till the read completes.  Otherwise attempt to read without
 * blocking.
 * Returns 1 meaning 'success' if read is successful without switching off
 * readahead mode. Otherwise return failure.
 */
static int
blockable_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read,
			struct file_ra_state *ra, int block)
{
	int actual;

	/* in non-blocking mode, give up early if the device is congested */
	if (!block && bdi_read_congested(mapping->backing_dev_info))
		return 0;

	actual = __do_page_cache_readahead(mapping, filp, offset, nr_to_read);

	/* judge how useful this readahead actually was */
	return check_ra_success(ra, nr_to_read, actual);
}

 

check_ra_success

/*
 * Check how effective readahead is being.  If the amount of started IO is
 * less than expected then the file is partly or fully in pagecache and
 * readahead isn't helping.
 *
 */
static inline int check_ra_success(struct file_ra_state *ra,
			unsigned long nr_to_read, unsigned long actual)
{
	/*
	 * actual == 0 means no I/O was started, i.e. every requested page
	 * was already cached.  Accumulate such hits; once they reach
	 * VM_MAX_CACHE_HIT the file is evidently in cache, so switch
	 * readahead off and mark the file as in-cache.
	 */
	if (actual == 0) {
		ra->cache_hit += nr_to_read;
		if (ra->cache_hit >= VM_MAX_CACHE_HIT) {
			ra_off(ra);
			ra->flags |= RA_FLAG_INCACHE;
			return 0;
		}
	} else {
		ra->cache_hit=0;	/* real I/O happened: reset the streak */
	}
	return 1;
}

__do_page_cache_readahead

/*

  * Readahead design.

  *

  * The fields in struct file_ra_state represent the most-recently-executed

  * readahead attempt:

  *

  * start:   Page index at which we started the readahead

  * size:     Number of pages in that read

  *              Together, these form the "current window".

  *              Together, start and size represent the `readahead window'.

  * prev_page:   The page which the readahead algorithm most-recently inspected.

  *              It is mainly used to detect sequential file reading.

  *              If page_cache_readahead sees that it is again being called for

  *              a page which it just looked at, it can return immediately without

  *              making any state changes.

  * ahead_start,

  * ahead_size:  Together, these form the "ahead window".

  * ra_pages:    The externally controlled max readahead for this fd.

  *

  * When readahead is in the off state (size == 0), readahead is disabled.

  * In this state, prev_page is used to detect the resumption of sequential I/O.

  *

  * The readahead code manages two windows - the "current" and the "ahead"

  * windows.  The intent is that while the application is walking the pages

  * in the current window, I/O is underway on the ahead window.  When the

  * current window is fully traversed, it is replaced by the ahead window

  * and the ahead window is invalidated.  When this copying happens, the

  * new current window's pages are probably still locked.  So

  * we submit a new batch of I/O immediately, creating a new ahead window.

  *

  * So:

  *

  *   ----|----------------|----------------|-----

  *       ^start           ^start+size

  *                        ^ahead_start     ^ahead_start+ahead_size

  *

  *         ^ When this page is read, we submit I/O for the

  *           ahead window.

  *

  * A `readahead hit' occurs when a read request is made against a page which is

  * the next sequential page. Ahead window calculations are done only when it

  * is time to submit a new IO.  The code ramps up the size aggressively at first,

  * but slows down as it approaches max_readahead.

  *

  * Any seek/random IO will result in readahead being turned off.  It will resume

  * at the first sequential access.

  *

  * There is a special-case: if the first page which the application tries to

  * read happens to be the first page of the file, it is assumed that a linear

  * read is about to happen and the window is immediately set to the initial size

  * based on I/O request size and the max_readahead.

  *

  * This function is to be called for every read request, rather than when

  * it is time to perform readahead.  It is called only once for the entire I/O

  * regardless of size unless readahead is unable to start enough I/O to satisfy

  * the request (I/O request > max_readahead).

  */

 

/*
 * do_page_cache_readahead actually reads a chunk of disk.  It allocates all
 * the pages first, then submits them all for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 *
 * Returns the number of pages requested, or the maximum amount of I/O allowed.
 *
 * do_page_cache_readahead() returns -1 if it encountered request queue
 * congestion.
 */
static int
__do_page_cache_readahead(struct address_space *mapping, struct file *filp,
			pgoff_t offset, unsigned long nr_to_read)
{
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long end_index;	/* The last page we want to read */
	LIST_HEAD(page_pool);
	int page_idx;
	int ret = 0;
	loff_t isize = i_size_read(inode);

	/* nothing to read from an empty file */
	if (isize == 0)
		goto out;

	end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);

	/*
	 * Preallocate as many pages as we will need.
	 */
	read_lock_irq(&mapping->tree_lock);
	for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
		pgoff_t page_offset = offset + page_idx;

		/* never read beyond end of file */
		if (page_offset > end_index)
			break;

		/* skip pages that are already in the page cache */
		page = radix_tree_lookup(&mapping->page_tree, page_offset);
		if (page)
			continue;

		/* drop the tree lock while allocating (may sleep) */
		read_unlock_irq(&mapping->tree_lock);
		page = page_cache_alloc_cold(mapping);
		read_lock_irq(&mapping->tree_lock);
		if (!page)
			break;
		page->index = page_offset;
		list_add(&page->lru, &page_pool);
		ret++;
	}
	read_unlock_irq(&mapping->tree_lock);

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	if (ret)
		read_pages(mapping, filp, &page_pool, ret);
	BUG_ON(!list_empty(&page_pool));
out:
	return ret;
}

 

read_pages

/*
 * Insert the preallocated pages into the page cache and submit their
 * reads, preferring the filesystem's batched ->readpages() when present.
 */
static int read_pages(struct address_space *mapping, struct file *filp,
		struct list_head *pages, unsigned nr_pages)
{
	unsigned page_idx;
	struct pagevec lru_pvec;
	int ret;

	/* let the filesystem batch the reads if it knows how */
	if (mapping->a_ops->readpages) {
		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
		goto out;
	}

	/* otherwise fall back to one ->readpage() call per page */
	pagevec_init(&lru_pvec, 0);
	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
		struct page *page = list_to_page(pages);
		list_del(&page->lru);
		if (!add_to_page_cache(page, mapping,
					page->index, GFP_KERNEL)) {
			mapping->a_ops->readpage(filp, page);
			/* queue the page for the LRU; flush the vec when full */
			if (!pagevec_add(&lru_pvec, page))
				__pagevec_lru_add(&lru_pvec);
		} else
			/* page appeared in the cache meanwhile: drop our copy */
			page_cache_release(page);
	}
	pagevec_lru_add(&lru_pvec);	/* flush any remaining pages */
	ret = 0;
out:
	return ret;
}

 

 

展开阅读全文
©️2020 CSDN 皮肤主题: 大白 设计师: CSDN官方博客 返回首页
实付0元
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。

余额充值