Linux 读文件 - readahead预读算法

顺序读场景

/*
 * Sequential-read demo: read news.txt from the beginning in 4096-byte
 * requests and print, for every request, its running index and the number
 * of bytes read() actually returned.
 *
 * Returns 0 on success, 1 if the file cannot be opened or a read fails.
 */
int main(void)
{
	char c[4096];
	int in;
	int index = 0;
	ssize_t n;

	in = open("news.txt", O_RDONLY);
	if (in < 0)
		return 1;

	/*
	 * read() does not NUL-terminate the buffer, so the length reported is
	 * its return value rather than strlen(c). Looping while n > 0 also
	 * reports the final short read instead of silently dropping it.
	 */
	while ((n = read(in, c, sizeof(c))) > 0) {
		printf("index: %d,len: %ld.\n", index, (long)n);
		index++;
	}

	close(in);
	return n < 0 ? 1 : 0;
}

数据结构

/*
 * Track a single file's readahead state
 *
 * start/size describe the current readahead window in pages; async_size
 * is the number of pages at the tail of that window that are still ahead
 * when the next readahead should be kicked off.
 */
struct file_ra_state {
	pgoff_t start;			/* page index where readahead started */
	unsigned int size;		/* # of readahead pages */
	unsigned int async_size;	/* do asynchronous readahead when
					   there are only # of pages ahead */

	unsigned int ra_pages;		/* Maximum readahead window, in pages */
	unsigned int mmap_miss;		/* Cache miss stat for mmap accesses */
	loff_t prev_pos;		/* Cache last read() position (byte
					   offset, not a page index) */
};

start: 开始预读的数据页索引,是指相对文件内的index。

size: 一共要预读多少个页面。

async_size: 如果当前预读的“存货”只剩async_size个页面时,就会触发async readahead,这个值很重要,控制着async readahead的时机。异步读触发时类似网络协议栈的滑动窗口,窗口会移动,也就是file_ra_state字段会更新,具体如何更新见下文。这里async readahead就是指generic_file_buffered_read函数中的page_cache_async_readahead函数调用。

Linux read的核心函数generic_file_buffered_read_nginux的博客-CSDN博客

ra_pages: readahead窗口的最大值(单位是page),类似网络协议栈滑动窗口,这个窗口有最大限制。

prev_pos: 上次读的position,即文件内偏移,注意单位是bytes。

注意:PageReadahead的page非常重要,因为一旦应用程序读取数据到这个page,就要触发异步预读(page_cache_async_readahead)

数据结构初始化:

代码:mm/readahead.c: ondemand_readahead(预读算法的实现函数)


/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 *
 * Abridged excerpt: only the start-of-file path is shown here; the lines
 * containing "..." stand for code that has been left out.
 */
static void ondemand_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		bool hit_readahead_marker, pgoff_t index,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

    ...

initial_readahead:
	ra->start = index;
	/* req_size is the size of the read request, counted in pages */
	ra->size = get_init_ra_size(req_size, max_pages);
	/* everything in the window beyond the request itself is look-ahead */
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

    ...
	/* submit the window described by ra, which starts the readahead I/O */
	ra_submit(ra, mapping, filp);
}

/*
 * Pick the size of the first readahead window of a stream.
 *
 * @size: number of pages the caller asked for (1 for a one-page read)
 * @max:  upper bound for the window, in pages
 *
 * The request is rounded up to a power of two and then scaled according
 * to how it compares with @max:
 *
 *   rounded <= max / 32   ->  rounded * 4
 *   rounded <= max / 4    ->  rounded * 2
 *   otherwise             ->  max
 *
 * Worked example for max = 32 pages:
 *   size = 1   ->  1 * 4 = 4 pages
 *   size = 4   ->  4 * 2 = 8 pages
 *   size = 16  ->  32 pages (clamped to max)
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long rounded = roundup_pow_of_two(size);

	/* small request relative to the maximum window: quadruple it */
	if (rounded <= max / 32)
		return rounded * 4;

	/* medium request: double it */
	if (rounded <= max / 4)
		return rounded * 2;

	/* large request: go straight to the maximum window */
	return max;
}

根据《Linux read的核心函数generic_file_buffered_read_nginux的博客-CSDN博客》文章我们知道,顺序读取大概调用逻辑:

1. 触发同步预读(sync readahead)

2. 触发异步预读(async readahead)

首次同步预读 - sync readahead

调用路径:generic_file_buffered_read

                         -->page_cache_sync_readahead

                                -->ondemand_readahead : initial_readahead

                                      --->ra_submit 向block layer请求io预读

首次文件头读进入ondemand_readahead的initial_readahead逻辑,计算file_ra_state值:

ra->start = 0; 因为是从文件头开始读取

ra->size = 4;

ra->async_size = 3;

ra->ra_pages = 32;

ra->prev_pos = -1;

根据ondemand_readahead中initial_readahead label处逻辑看,ra->size是由get_init_ra_size函数计算,该函数第一个参数是应用read的数据页(每个数据页4K)的数量,该场景每次读取4K bytes,相当于调用get_init_ra_size(1,32)返回4。

ra_submit向block layer请求io预读

/*
 * Submit IO for the read-ahead request in file_ra_state.
 *
 * Passes the window stored in @ra on to __do_page_cache_readahead():
 * ra->start is the first page index, ra->size the number of pages to read
 * and ra->async_size the lookahead size.
 */
static inline void ra_submit(struct file_ra_state *ra,
		struct address_space *mapping, struct file *filp)
{
	__do_page_cache_readahead(mapping, filp,
			ra->start, ra->size, ra->async_size);
}

/*
 * __do_page_cache_readahead() - readahead bounded by the file size.
 *
 * Clamps the request [@index, @index + @nr_to_read) so that it does not
 * extend past the page holding the last byte of the file, then hands it to
 * page_cache_readahead_unbounded(). Nothing is done for an empty file or
 * when @index already lies beyond the last page.
 *
 * @lookahead_size is passed through unchanged.
 */
void __do_page_cache_readahead(struct address_space *mapping,
		struct file *file, pgoff_t index, unsigned long nr_to_read,
		unsigned long lookahead_size)
{
	loff_t file_size = i_size_read(mapping->host);
	pgoff_t last_index;	/* index of the page holding the last byte */

	/* an empty file has nothing to read ahead */
	if (file_size == 0)
		return;

	last_index = (file_size - 1) >> PAGE_SHIFT;

	/* the request starts beyond the end of the file */
	if (index > last_index)
		return;

	/* trim the request so that it ends on the last page of the file */
	if (nr_to_read > last_index - index)
		nr_to_read = last_index - index + 1;

	page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
			lookahead_size);
}

/*
 * Allocation mask used for readahead pages: the mapping's own gfp mask
 * with __GFP_NORETRY and __GFP_NOWARN added.
 */
static inline gfp_t readahead_gfp_mask(struct address_space *x)
{
	gfp_t mask = mapping_gfp_mask(x);

	mask |= __GFP_NORETRY | __GFP_NOWARN;
	return mask;
}


/**
 * page_cache_readahead_unbounded - Start unchecked readahead.
 * @mapping: File address space.
 * @file: This instance of the open file; used for authentication.
 * @index: First page index to read.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size.  This is almost certainly
 * not the function you want to call.  Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * The page at offset @nr_to_read - @lookahead_size within the request is
 * flagged with SetPageReadahead() when it is newly allocated here.
 *
 * Context: File is referenced by caller.  Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_readahead_unbounded(struct address_space *mapping,
		struct file *file, pgoff_t index, unsigned long nr_to_read,
		unsigned long lookahead_size)
{
	LIST_HEAD(page_pool);

	/*
	 * readahead_gfp_mask() adds __GFP_NORETRY | __GFP_NOWARN to the
	 * mapping's mask.
	 * NOTE(review): presumably so that a failing readahead allocation
	 * gives up quietly instead of retrying hard -- confirm against the
	 * GFP flag documentation.
	 */
	gfp_t gfp_mask = readahead_gfp_mask(mapping);
	struct readahead_control rac = {
		.mapping = mapping,
		.file = file,
		._index = index,
	};
	unsigned long i;

	/*
	 * Partway through the readahead operation, we will have added
	 * locked pages to the page cache, but will not yet have submitted
	 * them for I/O.  Adding another page may need to allocate memory,
	 * which can trigger memory reclaim.  Telling the VM we're in
	 * the middle of a filesystem operation will cause it to not
	 * touch file-backed pages, preventing a deadlock.  Most (all?)
	 * filesystems already specify __GFP_NOFS in their mapping's
	 * gfp_mask, but let's be explicit here.
	 */
	unsigned int nofs = memalloc_nofs_save();

	/*
	 * Preallocate as many pages as we will need.
	 */
	for (i = 0; i < nr_to_read; i++) {
		struct page *page = xa_load(&mapping->i_pages, index + i);

		BUG_ON(index + i != rac._index + rac._nr_pages);

		if (page && !xa_is_value(page)) {
			/*
			 * Page already present?  Kick off the current batch
			 * of contiguous pages before continuing with the
			 * next batch.  This page may be the one we would
			 * have intended to mark as Readahead, but we don't
			 * have a stable reference to this page, and it's
			 * not worth getting one just for that.
			 */
			read_pages(&rac, &page_pool, true);
			continue;
		}

		page = __page_cache_alloc(gfp_mask);
		if (!page)
			break;
		if (mapping->a_ops->readpages) {
			page->index = index + i;
			list_add(&page->lru, &page_pool);
		} else if (add_to_page_cache_lru(page, mapping, index + i,
					gfp_mask) < 0) {
			put_page(page);
			read_pages(&rac, &page_pool, true);
			continue;
		}
		/*
		 * Flag the page that starts the lookahead part of the request
		 * with PageReadahead. This flag is what the rest of the
		 * readahead logic keys on, so it matters a great deal.
		 */
		if (i == nr_to_read - lookahead_size)
			SetPageReadahead(page);
		rac._nr_pages++;
	}

	/*
	 * Now start the IO.  We ignore I/O errors - if the page is not
	 * uptodate then the caller will launch readpage again, and
	 * will then handle the error.
	 */
	read_pages(&rac, &page_pool, false);
	memalloc_nofs_restore(nofs);
}

上面由于预读了4个页面,所以下次generic_file_buffered_read for(;;)循环中find_get_page会找到cache page(因为被预读进cache中了),循环继续,每次循环增加index值,由于index增加,就会触发PageReadahead(page),进而调用page_cache_async_readahead:

这种设计是因为不能一直预读,因为预读失败会受到惩罚(浪费内存),而是要根据应用顺序读取到一定程度才进行新的预读,这个时机就是应用读取到PageReadahead(page)对应的page。page_cache_async_readahead-->ondemand_readahead触发新的“异步”预读:


/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 *
 * Chooses a readahead window from @index, @req_size and the state in @ra,
 * stores it in @ra->start/size/async_size and submits it with ra_submit().
 * The small random read path (@3) bypasses ra_submit() and calls
 * __do_page_cache_readahead() directly with a lookahead size of 0.
 *
 * @hit_readahead_marker: supplied by the caller.
 *	NOTE(review): presumably true when the accessed page carried the
 *	PageReadahead flag -- confirm at the call sites, which are not
 *	shown here.
 * @index: page index being accessed.
 * @req_size: size of the request, in pages.
 */
static void ondemand_readahead(struct address_space *mapping,
		struct file_ra_state *ra, struct file *filp,
		bool hit_readahead_marker, pgoff_t index,
		unsigned long req_size)
{
	struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
	unsigned long max_pages = ra->ra_pages;
	unsigned long add_pages;
	pgoff_t prev_index;

	/*
	 * If the request exceeds the readahead window, allow the read to
	 * be up to the optimal hardware IO size
	 */
	if (req_size > max_pages && bdi->io_pages > max_pages)
		max_pages = min(req_size, bdi->io_pages);

	/*
	 * start of file
	 */
	if (!index)
		goto initial_readahead;

	/*
	 * It's the expected callback index, assume sequential access.
	 * Ramp up sizes, and push forward the readahead window.
	 */
	/*
	 * @1: index is either the first page of the lookahead part of the
	 * current window (start + size - async_size) or the first page right
	 * after the window (start + size). The new window begins where the
	 * old one ended and is entirely lookahead (async_size == size).
	 */
	if ((index == (ra->start + ra->size - ra->async_size) ||
	     index == (ra->start + ra->size))) {
		ra->start += ra->size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * Hit a marked page without valid readahead state.
	 * E.g. interleaved reads.
	 * Query the pagecache for async_size, which normally equals to
	 * readahead size. Ramp it up and use it as the new readahead size.
	 */
	/*
	 * @2: @1 did not match, e.g. the reads were not strictly sequential
	 * and happened to skip the indexes tested in @1, but the caller
	 * reported a readahead marker hit.
	 */
	if (hit_readahead_marker) {
		pgoff_t start;

		rcu_read_lock();
		start = page_cache_next_miss(mapping, index + 1, max_pages);
		rcu_read_unlock();

		if (!start || start - index > max_pages)
			return;

		ra->start = start;
		ra->size = start - index;	/* old async_size */
		ra->size += req_size;
		ra->size = get_next_ra_size(ra, max_pages);
		ra->async_size = ra->size;
		goto readit;
	}

	/*
	 * oversize read
	 */
	if (req_size > max_pages)
		goto initial_readahead;

	/*
	 * sequential cache miss
	 * trivial case: (index - prev_index) == 1
	 * unaligned reads: (index - prev_index) == 0
	 */
	prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
	if (index - prev_index <= 1UL)
		goto initial_readahead;

	/*
	 * Query the page cache and look for the traces(cached history pages)
	 * that a sequential stream would leave behind.
	 */
	if (try_context_readahead(mapping, ra, index, req_size, max_pages))
		goto readit;

	/*
	 * standalone, small random read
	 * Read as is, and do not pollute the readahead state.
	 */
	/*
	 * @3: random read. Only the req_size pages the caller asked for are
	 * read, and the lookahead size is 0, so no page of this request is
	 * flagged as a readahead marker.
	 */
	__do_page_cache_readahead(mapping, filp, index, req_size, 0);
	return;

initial_readahead:
	ra->start = index;
	ra->size = get_init_ra_size(req_size, max_pages);
	ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
	/*
	 * Will this read hit the readahead marker made by itself?
	 * If so, trigger the readahead marker hit now, and merge
	 * the resulted next readahead window into the current one.
	 * Take care of maximum IO pages as above.
	 */
	if (index == ra->start && ra->size == ra->async_size) {
		add_pages = get_next_ra_size(ra, max_pages);
		if (ra->size + add_pages <= max_pages) {
			ra->async_size = add_pages;
			ra->size += add_pages;
		} else {
			ra->size = max_pages;
			ra->async_size = max_pages >> 1;
		}
	}

	ra_submit(ra, mapping, filp);
}

@1对应顺序读取命中预读,比如我们当前场景预读了4个page,当读取到index = 1第二个page的时候就会命中该逻辑,会将“预读窗口”向前移动。

满足index == (ra->start + ra->size - ra->async_size)条件,所以向前移动预读窗口:

// start = 0 + 4 = 4: the new window begins right after the old one
ra->start += ra->size; 
// get_next_ra_size() sees cur = ra->size = 4 and max = 32:
// 4 < 32 / 16 is false, 4 <= 32 / 2 is true, so it returns 2 * cur = 8.
// The window doubles here; it does not jump straight to the 32-page maximum.
ra->size = get_next_ra_size(ra, max_pages);
// async_size = ra->size = 8: the whole new window is lookahead
ra->async_size = ra->size;


/*
 * Grow the readahead window for the next round.
 *
 * Takes the current window size from @ra->size and returns the size of
 * the next window, which never exceeds @max:
 *
 *   cur <  max / 16   ->  4 * cur
 *   cur <= max / 2    ->  2 * cur
 *   otherwise         ->  max
 *
 * Example for max = 32: 1 -> 4, 4 -> 8, 8 -> 16, 16 -> 32, 32 -> 32.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
				      unsigned long max)
{
	unsigned long prev = ra->size;
	unsigned long next;

	if (prev < max / 16)
		next = 4 * prev;
	else if (prev <= max / 2)
		next = 2 * prev;
	else
		next = max;

	return next;
}

@2:没有命中@1,但是hit_readahead_marker为真(命中了带预读标记的page),同样会继续新的预读逻辑。这里考虑到了多线程同时读一个文件(交错读)的情况,ra结构体要在多线程之间完成状态切换,可参考文章《Linux文件预读(三)》。

参考文章:

Linux文件系统预读(一) - 知乎

Linux文件系统预读(二) - 知乎

Linux文件预读(三) - 知乎

深入分析Linux内核File cache机制(上篇) - 知乎

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
全志R16平台编译linux系统V1.0.txt 2017/4/11 13:36 (编译请使用编译android的lichee的选项编译生成的.config文件,不然直接编译会报错!!!!) rootroot@cm-System-Product-Name:/home/wwt/linux_r16$ tar zxvf lichee_parrotv1.1_20161202.tar.gz rootroot@cm-System-Product-Name:/home/wwt/linux_r16$ cd lichee/ rootroot@cm-System-Product-Name:/home/wwt/linux_r16/lichee$ ./build.sh config Welcome to mkscript setup progress All available chips: 0. sun8iw5p1 Choice: 0 All available platforms: 0. android 1. dragonboard 2. linux 3. tina Choice: 2 All available kernel: 0. linux-3.4 Choice: 0 All available boards: 0. bell-one 1. evb 2. evb-20 3. evb-30 4. evb-rtl8723bs 5. sc3813r Choice: 3 rootroot@cm-System-Product-Name:/home/wwt/linux_r16/lichee$ ./build.sh 错误1: KCONFIG_AUTOCONFIG=/home/wwt/linux_r16/lichee/out/sun8iw5p1/linux/common/buildroot/build/buildroot-config/auto.conf KCONFIG_AUTOHEADER=/home/wwt/linux_r16/lichee/out/sun8iw5p1/linux/common/buildroot/build/buildroot-config/autoconf.h KCONFIG_TRISTATE=/home/wwt/linux_r16/lichee/out/sun8iw5p1/linux/common/buildroot/build/buildroot-config/tristate.config BUILDROOT_CONFIG=/home/wwt/linux_r16/lichee/out/sun8iw5p1/linux/common/buildroot/.config /home/wwt/linux_r16/lichee/out/sun8iw5p1/linux/common/buildroot/build/buildroot-config/conf --silentoldconfig Config.in # # make dependencies written to .auto.deps # ATTENTION buildroot devels! # See top of this file before playing with this auto-preprequisites! # make[1]:正在离开目录 `/home/wwt/linux_r16/lichee/buildroot' You must install 'makeinfo' on your build machine makeinfo is usually part of the texinfo package in your distribution make: *** [dependencies] 错误 1 make:离开目录“/home/wwt/linux_r16/lichee/buildroot” ERROR: build buildroot Failed rootroot@cm-System-Product-Name:/home/wwt/linux_r16/lichee$ d/buildroot-config/conf.o /home/wwt/linux_r16/lichee/out/sun8iw5p1/linux/common/buildroot/build/buildroot-config/zconf.tab.o -o /home/wwt/linux_r16/lichee/out/sun8iw5p1/linux/common/buil

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值