【page cache】回写机制

当前内容基于 Linux Kernel v5.4.121

writeback 回写

page cache 简介 有过介绍

buffer IO 通过 page cache 进行缓存,减少对底层存储设备的直接读写,同时能够提高整体性能

写入到 page cache 的数据不会立刻写入后端设备,而是标记为“脏”,并被加入到脏页链表,后续由内核中的回写进程周期性的将脏页写回到底层存储设备

下面主要分析 page cache 回写机制的策略和实现

相关结构体

底层设备信息

include/linux/backing-dev-defs.h 中定义了 backing_dev_info 结构体,主要用与记录底层的设备信息

struct backing_dev_info {
	u64 id;
	struct rb_node rb_node; /* keyed by ->id */
	struct list_head bdi_list;
	unsigned long ra_pages;	/* max readahead in PAGE_SIZE units */
	unsigned long io_pages;	/* max allowed IO size */
	congested_fn *congested_fn; /* Function pointer if device is md/dm */
	void *congested_data;	/* Pointer to aux data for congested func */

	// 通常为 "block"
	const char *name;

	struct kref refcnt;	/* Reference counter for the structure */
	unsigned int capabilities; /* Device capabilities */
	unsigned int min_ratio;
	unsigned int max_ratio, max_prop_frac;

	/*
	 * Sum of avg_write_bw of wbs with dirty inodes.  > 0 if there are
	 * any dirty wbs, which is depended upon by bdi_has_dirty().
	 */
	atomic_long_t tot_write_bandwidth;

	struct bdi_writeback wb;  /* the root writeback info for this bdi */
	struct list_head wb_list; /* list of all wbs */
#ifdef CONFIG_CGROUP_WRITEBACK
	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
	struct rb_root cgwb_congested_tree; /* their congested states */
	struct mutex cgwb_release_mutex;  /* protect shutdown of wb structs */
	struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
#else
	struct bdi_writeback_congested *wb_congested;
#endif
	wait_queue_head_t wb_waitq;

	// bdi_class 设备
	struct device *dev;
	// 主设备号:次设备号
	char dev_name[64];
	// 实际的底层设备
	struct device *owner;

	struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
	struct dentry *debug_dir;
#endif
};
初始化

该结构体只会由 mm/backing-dev.c 中的 bdi_alloc_node 函数来申请内存空间并调用 bdi_init 初始化

部分字段说明
  1. name 字段

    name 字段

    block/blk-core.cblk_alloc_queue_node 会调用 bdi_alloc_node 来初始化该结构体,其中 name 字段赋值为 "block"

  2. dev 字段

    mm/backing-dev.cbdi_register_va 会调用 device_create 创建一个 bdi_class 类型的设备,并赋值给 dev 字段

  3. dev_name 字段

    mm/backing-dev.cbdi_register_va 还会对 dev_name 进行赋值

    dev_name 字段

    根据调用栈溯源可以发现,mm/backing-dev.cbdi_register_ownerfmtargs 传递到 bdi_register_va,最终会将主设备号和次设备号拼接组合后进行赋值

  4. owner 字段

    mm/backing-dev.cbdi_register_owner 还会对 owner 进行赋值

    owner 字段

    实际赋值的为 disk 对应的 dev,通过 disk_to_dev 宏转换得到

设备回写管理

include/linux/backing-dev-defs.h 中定义了 bdi_writeback 结构体,用于管理一个块设备的回写,同时支持 cgroup 进行限制

/*
 * Each wb (bdi_writeback) can perform writeback operations, is measured
 * and throttled, independently.  Without cgroup writeback, each bdi
 * (bdi_writeback) is served by its embedded bdi->wb.
 *
 * On the default hierarchy, blkcg implicitly enables memcg.  This allows
 * using memcg's page ownership for attributing writeback IOs, and every
 * memcg - blkcg combination can be served by its own wb by assigning a
 * dedicated wb to each memcg, which enables isolation across different
 * cgroups and propagation of IO back pressure down from the IO layer upto
 * the tasks which are generating the dirty pages to be written back.
 *
 * A cgroup wb is indexed on its bdi by the ID of the associated memcg,
 * refcounted with the number of inodes attached to it, and pins the memcg
 * and the corresponding blkcg.  As the corresponding blkcg for a memcg may
 * change as blkcg is disabled and enabled higher up in the hierarchy, a wb
 * is tested for blkcg after lookup and removed from index on mismatch so
 * that a new wb for the combination can be created.
 */
struct bdi_writeback {
	struct backing_dev_info *bdi;	/* our parent bdi */

	unsigned long state;		/* Always use atomic bitops on this */
	unsigned long last_old_flush;	/* last old data flush */

	struct list_head b_dirty;	/* dirty inodes */
	struct list_head b_io;		/* parked for writeback */
	struct list_head b_more_io;	/* parked for more writeback */
	struct list_head b_dirty_time;	/* time stamps are dirty */
	spinlock_t list_lock;		/* protects the b_* lists */

	struct percpu_counter stat[NR_WB_STAT_ITEMS];

	struct bdi_writeback_congested *congested;

	unsigned long bw_time_stamp;	/* last time write bw is updated */
	unsigned long dirtied_stamp;
	unsigned long written_stamp;	/* pages written at bw_time_stamp */
	unsigned long write_bandwidth;	/* the estimated write bandwidth */
	unsigned long avg_write_bandwidth; /* further smoothed write bw, > 0 */

	/*
	 * The base dirty throttle rate, re-calculated on every 200ms.
	 * All the bdi tasks' dirty rate will be curbed under it.
	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
	 * in small steps and is much more smooth/stable than the latter.
	 */
	unsigned long dirty_ratelimit;
	unsigned long balanced_dirty_ratelimit;

	struct fprop_local_percpu completions;
	int dirty_exceeded;
	enum wb_reason start_all_reason;

	spinlock_t work_lock;		/* protects work_list & dwork scheduling */
	struct list_head work_list;
	struct delayed_work dwork;	/* work item used for writeback */

	unsigned long dirty_sleep;	/* last wait */

	struct list_head bdi_node;	/* anchored at bdi->wb_list */

#ifdef CONFIG_CGROUP_WRITEBACK
	struct percpu_ref refcnt;	/* used only for !root wb's */
	struct fprop_local_percpu memcg_completions;
	struct cgroup_subsys_state *memcg_css; /* the associated memcg */
	struct cgroup_subsys_state *blkcg_css; /* and blkcg */
	struct list_head memcg_node;	/* anchored at memcg->cgwb_list */
	struct list_head blkcg_node;	/* anchored at blkcg->cgwb_list */

	union {
		struct work_struct release_work;
		struct rcu_head rcu;
	};
#endif
};
初始化

该结构体由 mm/backing-dev.c 中的 wb_init 函数初始化

bdi_init
cgwb_bdi_init
wb_init
部分字段说明
  1. b_dirty 字段

    暂存所有的脏 inode 的链表

  2. b_io 字段

    暂存即将回写的 inode 的链表

  3. b_more_io 字段

    暂存由于一次回写数量限制原因导致的等待下次回写的 inode 链表

  4. b_dirty_time 字段

    暂存仅仅是时间戳更新而被至脏的 inode 的链表

  5. list_lock 字段

    为了保护上述 4 个 b_* 列表的自旋锁

  6. dwork 字段

    用于 page cache 回写机制的 work 关键结构体

  7. work_list 字段

    暂存所有需要回写的任务的链表

  8. work_lock 字段

    为了保护 work_list 以及 dwork 调度的自旋锁

回写任务

fs/fs-writeback.c 中定义了 wb_writeback_work 结构体,用于描述一次回写任务的相关参数

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	// 本次回写的页数限制
	long nr_pages;
	// 回写的文件系统的超级块
	struct super_block *sb;
	// 回写的模式
	enum writeback_sync_modes sync_mode;
	// tag-and-write 机制标记
	unsigned int tagged_writepages:1;
	// 定期回写标记
	unsigned int for_kupdate:1;
	// 继续上次循环回写标记
	unsigned int range_cyclic:1;
	// 阈值回写标记
	unsigned int for_background:1;
	// sync 系统调用标记
	unsigned int for_sync:1;	/* sync(2) WB_SYNC_ALL writeback */
	unsigned int auto_free:1;	/* free on completion */
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct wb_completion *done;	/* set if the caller waits */
};

后续通过 include/linux/writeback.h 中的 writeback_control 结构体封装,传递给底层的 writepages 函数

/*
 * A control structure which tells the writeback code what to do.  These are
 * always on the stack, and hence need no locking.  They are always initialised
 * in a manner such that unspecified fields are set to zero.
 */
struct writeback_control {
	long nr_to_write;		/* Write this many pages, and decrement
					   this for each page written */
	long pages_skipped;		/* Pages which were not written */

	/*
	 * For a_ops->writepages(): if start or end are non-zero then this is
	 * a hint that the filesystem need only write out the pages inside that
	 * byterange.  The byte at `end' is included in the writeout request.
	 */
	loff_t range_start;
	loff_t range_end;

	enum writeback_sync_modes sync_mode;

	unsigned for_kupdate:1;		/* A kupdate writeback */
	unsigned for_background:1;	/* A background writeback */
	unsigned tagged_writepages:1;	/* tag-and-write to avoid livelock */
	unsigned for_reclaim:1;		/* Invoked from the page allocator */
	unsigned range_cyclic:1;	/* range_start is cyclic */
	unsigned for_sync:1;		/* sync(2) WB_SYNC_ALL writeback */

	/*
	 * When writeback IOs are bounced through async layers, only the
	 * initial synchronous phase should be accounted towards inode
	 * cgroup ownership arbitration to avoid confusion.  Later stages
	 * can set the following flag to disable the accounting.
	 */
	unsigned no_cgroup_owner:1;

	unsigned punt_to_cgroup:1;	/* cgrp punting, see __REQ_CGROUP_PUNT */

#ifdef CONFIG_CGROUP_WRITEBACK
	struct bdi_writeback *wb;	/* wb this writeback is issued under */
	struct inode *inode;		/* inode being written out */

	/* foreign inode detection, see wbc_detach_inode() */
	int wb_id;			/* current wb id */
	int wb_lcand_id;		/* last foreign candidate wb id */
	int wb_tcand_id;		/* this foreign candidate wb id */
	size_t wb_bytes;		/* bytes written by current wb */
	size_t wb_lcand_bytes;		/* bytes written by last candidate */
	size_t wb_tcand_bytes;		/* bytes written by this candidate */
#endif
};
部分字段说明
  1. sync_mode 字段

    WB_SYNC_NONE:绝大部分回写任务的配置,不会等待回写真正落盘,下发写命令后就返回

    WB_SYNC_ALLsync 系统调用时配置,必须等待回调函数执行完成,写的数据真正落盘之后才会返回

  2. tagged_writepages 字段

    值为 1 表示开启 tag-and-write 机制用于避免活锁。该机制详情参考后续小节

  3. for_kupdate 字段

    值为 1 表示当前任务是定期回写任务,用于回写已经至脏超过指定时间(内核中当前配置为 30s)的脏页。定期回写详情参考后续小节

  4. range_cyclic 字段

    值为 1 表示当前任务的回写范围为整个 inode,并且从上次完成的位置作为起始位置进行循环回写。值为 0 则根据 struct writeback_control wbcrange_start 以及 range_end 作为回写的范围。range_cyclic 的详情参考后续小节

  5. for_background 字段

    值为 1 表示当前任务是阈值回写任务,当脏页比例超过阈值后才会触发。阈值回写详情参考后续小节

  6. for_sync 字段

    值为 1 表示当前任务是阈值回写任务 sync 系统调用手动触发的回写任务。sync 系统调用详情参考后续小节

回写线程

前面说过 bdi_writeback 结构体中 struct delayed_work dwork 就是关键的负责 page cache 回写工作的结构体

初始化

wb_init 函数会对 wb->dwork 赋值,注册实际的工作函数 wb_workfn

由于这是个 delayed_work,注册的工作函数不会立即执行,需要后续利用 mod_delayed_work 来修改 delayed_work 内置的定时器时间来唤醒

wb_init 调用图

(P.S. 图片中函数开头为 cg 代表和 cgroup 相关,同一层级多个同名函数和宏定义的编译控制有关)

根据函数调用图,大致分析可知,当设备申请 queue 时会初始化 backing_dev_info 结构体和 bdi_writeback 结构体,以及初始化回写线程

立即唤醒

虽然 dwork 是个 delayed_work,但是我们可以在调用 mod_delayed_work 将延时设置为 0,来立即唤醒回写线程

dwork 调用图

wb_wakeup

wb_wakeup 就是修改延时为 0,直接唤醒回写线程

// fs/fs-writeback.c
static void wb_wakeup(struct bdi_writeback *wb)
{
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	spin_unlock_bh(&wb->work_lock);
}
wb_queue_work

wb_queue_work 将一个回写任务插入到队列尾部,然后修改延时为 0,立即唤醒回写线程

// fs/fs-writeback.c
static void wb_queue_work(struct bdi_writeback *wb,
			  struct wb_writeback_work *work)
{
	trace_writeback_queue(wb, work);

	if (work->done)
		atomic_inc(&work->done->cnt);

	spin_lock_bh(&wb->work_lock);

	if (test_bit(WB_registered, &wb->state)) {
		list_add_tail(&work->list, &wb->work_list);
		mod_delayed_work(bdi_wq, &wb->dwork, 0);
	} else
		finish_writeback_work(wb, work);

	spin_unlock_bh(&wb->work_lock);
}

定时唤醒

定时唤醒主要是由 wb_wakeup_delayed 来实现的,而时间间隔在 mm/page-writeback.c 进行了定义

// mm/page-writeback.c
/*
 * The interval between `kupdate'-style writebacks
 */
unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */

EXPORT_SYMBOL_GPL(dirty_writeback_interval);
// mm/backing-dev.c
/*
 * This function is used when the first inode for this wb is marked dirty. It
 * wakes-up the corresponding bdi thread which should then take care of the
 * periodic background write-out of dirty inodes. Since the write-out would
 * starts only 'dirty_writeback_interval' centisecs from now anyway, we just
 * set up a timer which wakes the bdi thread up later.
 *
 * Note, we wouldn't bother setting up the timer, but this function is on the
 * fast-path (used by '__mark_inode_dirty()'), so we save few context switches
 * by delaying the wake-up.
 *
 * We have to be careful not to postpone flush work if it is scheduled for
 * earlier. Thus we use queue_delayed_work().
 */
void wb_wakeup_delayed(struct bdi_writeback *wb)
{
	unsigned long timeout;

	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	spin_lock_bh(&wb->work_lock);
	if (test_bit(WB_registered, &wb->state))
		queue_delayed_work(bdi_wq, &wb->dwork, timeout);
	spin_unlock_bh(&wb->work_lock);
}

wb_wakeup_delayed 使用 5s 作为时间间隔,当调用 wb_wakeup_delayed 后,回写线程会在 5s 后被唤醒

wb_wakeup_delayed 调用图

定时唤醒只会在两种情形下被调用:

  • __mark_inode_dirty:当给一个 inode 标记为脏时,如果脏的不仅仅是时间戳,而且当前的 b_dirty 链表是空的,也就是说第一次将脏页挂在 b_dirty 链表时,开启定时唤醒
  • wb_workfn:当回写线程处理完 work_list 上的所有任务后,如果仍有脏 inodeb_{dirty|io|more_io} 上时,开启定时唤醒

简单的讲,就是只要存在脏 inodeb_{dirty|io|more_io} 上时,内核的回写线程每 5s 内肯定会被唤醒一次

释放销毁

release_bdi 调用图

当需要对整个 backing_dev_info 结构释放时,也会立即唤醒内核回写线程,并且会下刷现有的所有工作

release_bdi
bdi_unregister
wb_shutdown
wb_exit

细节分析

tag-and-write

该机制会在 write_pages 时先快速对下刷范围内的脏页进行标记,后续只对标记过的脏页进行下刷

首先回写任务的参数会通过 fs/fs-writeback.cwriteback_sb_inodes 函数传递给 struct writeback_control wbc

后续在 mm/page-writeback.cwrite_cache_pages 就会根据 wbctagged_writepages 字段进行判断,配置不同的 tag,以及是否需要快速遍历脏页并标记

/**
 * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
 * @mapping: address space structure to write
 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
 * @writepage: function called for each page
 * @data: data passed to writepage function
 *
 * If a page is already under I/O, write_cache_pages() skips it, even
 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
 * and msync() need to guarantee that all the data which was dirty at the time
 * the call was made get new I/O started against them.  If wbc->sync_mode is
 * WB_SYNC_ALL then we were called for data integrity and we must wait for
 * existing IO to complete.
 *
 * To avoid livelocks (when other process dirties new pages), we first tag
 * pages which should be written back with TOWRITE tag and only then start
 * writing them. For data-integrity sync we have to be careful so that we do
 * not miss some pages (e.g., because some other process has cleared TOWRITE
 * tag we set). The rule we follow is that TOWRITE tag can be cleared only
 * by the process clearing the DIRTY tag (and submitting the page for IO).
 *
 * To avoid deadlocks between range_cyclic writeback and callers that hold
 * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
 * we do not loop back to the start of the file. Doing so causes a page
 * lock/page writeback access order inversion - we should only ever lock
 * multiple pages in ascending page->index order, and looping back to the start
 * of the file violates that rule and causes deadlocks.
 *
 * Return: %0 on success, negative error code otherwise
 */
int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	int ret = 0;
	int done = 0;
	int error;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t uninitialized_var(writeback_index);
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	pgoff_t done_index;
	int range_whole = 0;
	xa_mark_t tag;

	pagevec_init(&pvec);
	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
	}
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
		tag_pages_for_writeback(mapping, index, end);
	done_index = index;
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
				tag);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			done_index = page->index;

			lock_page(page);

			/*
			 * Page truncated or invalidated. We can freely skip it
			 * then, even for data integrity operations: the page
			 * has disappeared concurrently, so there could be no
			 * real expectation of this data interity operation
			 * even if there is now a new, dirty page at the same
			 * pagecache address.
			 */
			if (unlikely(page->mapping != mapping)) {
continue_unlock:
				unlock_page(page);
				continue;
			}

			if (!PageDirty(page)) {
				/* someone wrote it for us */
				goto continue_unlock;
			}

			if (PageWriteback(page)) {
				if (wbc->sync_mode != WB_SYNC_NONE)
					wait_on_page_writeback(page);
				else
					goto continue_unlock;
			}

			BUG_ON(PageWriteback(page));
			if (!clear_page_dirty_for_io(page))
				goto continue_unlock;

			trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
			error = (*writepage)(page, wbc, data);
			if (unlikely(error)) {
				/*
				 * Handle errors according to the type of
				 * writeback. There's no need to continue for
				 * background writeback. Just push done_index
				 * past this page so media errors won't choke
				 * writeout for the entire file. For integrity
				 * writeback, we must process the entire dirty
				 * set regardless of errors because the fs may
				 * still have state to clear for each page. In
				 * that case we continue processing and return
				 * the first error.
				 */
				if (error == AOP_WRITEPAGE_ACTIVATE) {
					unlock_page(page);
					error = 0;
				} else if (wbc->sync_mode != WB_SYNC_ALL) {
					ret = error;
					done_index = page->index + 1;
					done = 1;
					break;
				}
				if (!ret)
					ret = error;
			}

			/*
			 * We stop writing back only if we are not doing
			 * integrity sync. In case of integrity sync we have to
			 * keep going until we have written all the pages
			 * we tagged for writeback prior to entering this loop.
			 */
			if (--wbc->nr_to_write <= 0 &&
			    wbc->sync_mode == WB_SYNC_NONE) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
		cond_resched();
	}

	/*
	 * If we hit the last page and there is more work to be done: wrap
	 * back the index back to the start of the file for the next
	 * time we are called.
	 */
	if (wbc->range_cyclic && !done)
		done_index = 0;
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

	return ret;
}
EXPORT_SYMBOL(write_cache_pages);

fs/fs-writeback.cwriteback_chunk_size 里面的注释也对这个流程进行了描述

static long writeback_chunk_size(struct bdi_writeback *wb,
				 struct wb_writeback_work *work)
{
	long pages;

	/*
	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
	 * here avoids calling into writeback_inodes_wb() more than once.
	 *
	 * The intended call sequence for WB_SYNC_ALL writeback is:
	 *
	 *      wb_writeback()
	 *          writeback_sb_inodes()       <== called only once
	 *              write_cache_pages()     <== called once for each inode
	 *                   (quickly) tag currently dirty pages
	 *                   (maybe slowly) sync all tagged pages
	 */
	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
		pages = LONG_MAX;
	else {
		pages = min(wb->avg_write_bandwidth / 2,
			    global_wb_domain.dirty_limit / DIRTY_SCOPE);
		pages = min(pages, work->nr_pages);
		pages = round_down(pages + MIN_WRITEBACK_PAGES,
				   MIN_WRITEBACK_PAGES);
	}

	return pages;
}

range_cyclic

range_cyclic 早在内核的 v2.6.18-rc1 版本就已经实现,可以参考 111ebb6 的提交信息辅助理解

[PATCH] writeback: fix range handling
When a writeback_control's `start' and `end' fields are used to
indicate a one-byte-range starting at file offset zero, the required
values of .start=0,.end=0 mean that the ->writepages() implementation
has no way of telling that it is being asked to perform a range
request.  Because we're currently overloading (start == 0 && end == 0)
to mean "this is not a write-a-range request".

To make all this sane, the patch changes range of writeback_control.

So caller does: If it is calling ->writepages() to write pages, it
sets range (range_start/end or range_cyclic) always.

And if range_cyclic is true, ->writepages() thinks the range is
cyclic, otherwise it just uses range_start and range_end.

This patch does,

    - Add LLONG_MAX, LLONG_MIN, ULLONG_MAX to include/linux/kernel.h
      -1 is usually ok for range_end (type is long long). But, if someone did,

		range_end += val;		range_end is "val - 1"
		u64val = range_end >> bits;	u64val is "~(0ULL)"

      or something, they are wrong. So, this adds LLONG_MAX to avoid nasty
      things, and uses LLONG_MAX for range_end.

    - All callers of ->writepages() sets range_start/end or range_cyclic.

    - Fix updates of ->writeback_index. It seems already bit strange.
      If it starts at 0 and ended by check of nr_to_write, this last
      index may reduce chance to scan end of file.  So, this updates
      ->writeback_index only if range_cyclic is true or whole-file is
      scanned.

Signed-off-by: OGAWA Hirofumi <hirofumi@mail.parknet.co.jp>
Cc: Nathan Scott <nathans@sgi.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: Steven French <sfrench@us.ibm.com>
Cc: "Vladimir V. Saveliev" <vs@namesys.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

range_cyclicrange_start/end 互斥

  • 当开启 range_cyclic 将无视 range_start/end 的值
  • 否则底层 writepages 函数则使用 range_start/end 作为写的范围
do_writepages
mapping->a_ops->writepages()
generic_writepages
write_cache_pages

结合实际代码,在 mm/page-writeback.cwrite_cache_pages 函数中有以下片段

	if (wbc->range_cyclic) {
		writeback_index = mapping->writeback_index; /* prev offset */
		index = writeback_index;
		end = -1;
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
			range_whole = 1;
	}

	// 此处省略部分代码

	/*
	 * If we hit the last page and there is more work to be done: wrap
	 * back the index back to the start of the file for the next
	 * time we are called.
	 */
	if (wbc->range_cyclic && !done)
		done_index = 0;
	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
		mapping->writeback_index = done_index;

range_cyclic 开启后会使用 mapping->writeback_index 作为本次回写的起始地址,并会在完成本次回写流程(回写页数限制或者到达文件末尾)后更新 mapping->writeback_index

定期回写

定期回写的任务声明在 fs/fs-writeback.cwb_check_old_data_flush 函数中,而这个函数则是被 wb_workfn 调用

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	/*
	 * When set to zero, disable periodic writeback
	 */
	if (!dirty_writeback_interval)
		return 0;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = get_nr_dirty_pages();

	if (nr_pages) {
		// 定期回写
		struct wb_writeback_work work = {
			.nr_pages	= nr_pages,
			.sync_mode	= WB_SYNC_NONE,
			.for_kupdate	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_PERIODIC,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}
通常情况
wb_wakeup
wb_workfn
wb_do_writeback
wb_check_old_data_flush

首先要保证当前时间在上次定期回写的 5s (和定期唤醒的时间间隔一致)后,并且当前存在脏页,才会生成一次定期回写的任务,也就是说每 5s 内最多触发一次定期回写

生成的回写任务交给 fs/fs-writeback.cwb_writeback 函数处理

并且定期回写属于一种后台回写,优先级较低,只有在 wb->work_list 为空时才会执行

wb_writeback 执行定期回写时只会选择在至脏时间在当前时间 30s 之前的 inode 的所有脏页进行回写

阈值回写

针对脏页率内核中有两个阈值,10% 和 20%

// mm/page-writeback.c

/*
 * Start background writeback (via writeback threads) at this percentage
 */
int dirty_background_ratio = 10;

/*
 * The generator of dirty data starts writeback at this percentage
 */
int vm_dirty_ratio = 20;
  1. bg_thresh 后台阈值

    当脏页率达到 10% 时会以后台的方式进行回写

    通常情况
    wb_over_bg_thresh
    generic_perform_write
    balance_dirty_pages_ratelimited
    balance_dirty_pages
    wb_start_background_writeback
    wb_wakeup
    wb_workfn
    wb_do_writeback
    wb_check_background_flush

    当用户 write 调用使用 generic_perform_write 来写 page cache 时,会调用 balance_dirty_pages_ratelimited 来检查脏页率,当脏页率超过 10% 就会调用 balance_dirty_pages 来唤醒 wb_workfn 来进行下刷脏页,此时并不会阻塞当前的 write 过程

  2. thresh 前台阈值

    而当脏页率达到 20% 之后则会触发前台回写,此时的函数调用和逻辑和上面基本一致,不同点在于当脏页率超过 20% 后会在 balance_dirty_pages 的循环中无法跳出,因此线程会阻塞,直到脏页率降低至 20% 以下,跳出循环,启用后台回写

手动触发回写

sync

sync 系统调用会同步所有的 page cache

bash 上直接输入 sync 命令就会触发 sync 系统调用

// fs/sync.c
/*
 * Sync everything. We start by waking flusher threads so that most of
 * writeback runs on all devices in parallel. Then we sync all inodes reliably
 * which effectively also waits for all flusher threads to finish doing
 * writeback. At this point all data is on disk so metadata should be stable
 * and we tell filesystems to sync their metadata via ->sync_fs() calls.
 * Finally, we writeout all block devices because some filesystems (e.g. ext2)
 * just write metadata (such as inodes or bitmaps) to block device page cache
 * and do not sync it on their own in ->sync_fs().
 */
void ksys_sync(void)
{
	int nowait = 0, wait = 1;

	// 唤醒所有 bdi 的回写线程
	wakeup_flusher_threads(WB_REASON_SYNC);
	// 下发所有 inode 的回写任务
	iterate_supers(sync_inodes_one_sb, NULL);
	// 调用 sync_fs() 同步文件系统的元数据
	iterate_supers(sync_fs_one_sb, &nowait);
	iterate_supers(sync_fs_one_sb, &wait);
	// 回写块设备的 page cache
	iterate_bdevs(fdatawrite_one_bdev, NULL);
	iterate_bdevs(fdatawait_one_bdev, NULL);
	if (unlikely(laptop_mode))
		laptop_sync_completion();
}

SYSCALL_DEFINE0(sync)
{
	ksys_sync();
	return 0;
}
  • 首先,唤醒所有设备的回写线程线程,这样大部分的回写在所有设备上并行运行
  • 并立刻生成一个下刷设备上所有 inode 的回写任务,并等待完成
  • 然后文件系统通过注册的 sync_fs() 调用来同步他们的元数据
  • 最后,回写所有的块设备的 page cache
    • 因为有些文件系统(例如 ext2)会将元数据(如 inodesbitmaps)写入块设备 page cache,而不是在 sync_fs() 中自行同步

fsyncfdatasync

fsyncfdatasync 系统调用则可以更加细粒度的下刷脏页,他们的下刷对象是一个文件

// fs/sync.c
/**
 * vfs_fsync_range - helper to sync a range of data & metadata to disk
 * @file:		file to sync
 * @start:		offset in bytes of the beginning of data range to sync
 * @end:		offset in bytes of the end of data range (inclusive)
 * @datasync:		perform only datasync
 *
 * Write back data in range @start..@end and metadata for @file to disk.  If
 * @datasync is set only metadata needed to access modified file data is
 * written.
 */
int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = file->f_mapping->host;

	if (!file->f_op->fsync)
		return -EINVAL;
	if (!datasync && (inode->i_state & I_DIRTY_TIME))
		mark_inode_dirty_sync(inode);
	return file->f_op->fsync(file, start, end, datasync);
}
EXPORT_SYMBOL(vfs_fsync_range);

/**
 * vfs_fsync - perform a fsync or fdatasync on a file
 * @file:		file to sync
 * @datasync:		only perform a fdatasync operation
 *
 * Write back data and metadata for @file to disk.  If @datasync is
 * set only metadata needed to access modified file data is written.
 */
int vfs_fsync(struct file *file, int datasync)
{
	return vfs_fsync_range(file, 0, LLONG_MAX, datasync);
}
EXPORT_SYMBOL(vfs_fsync);

static int do_fsync(unsigned int fd, int datasync)
{
	struct fd f = fdget(fd);
	int ret = -EBADF;

	if (f.file) {
		ret = vfs_fsync(f.file, datasync);
		fdput(f);
	}
	return ret;
}

SYSCALL_DEFINE1(fsync, unsigned int, fd)
{
	return do_fsync(fd, 0);
}

SYSCALL_DEFINE1(fdatasync, unsigned int, fd)
{
	return do_fsync(fd, 1);
}

fsyncfdatasync 会调用文件系统注册的 f_op->fsync() 函数进行脏页的下刷。很多文件系统会使用或者参考通用的 generic_file_fsync 来实现,这里针对 __generic_file_fsync 进行分析

// fs/libfs.c
/**
 * __generic_file_fsync - generic fsync implementation for simple filesystems
 *
 * @file:	file to synchronize
 * @start:	start offset in bytes
 * @end:	end offset in bytes (inclusive)
 * @datasync:	only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 */
int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
				 int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;
	int ret;

	err = file_write_and_wait_range(file, start, end);
	if (err)
		return err;

	inode_lock(inode);
	ret = sync_mapping_buffers(inode->i_mapping);
	if (!(inode->i_state & I_DIRTY_ALL))
		goto out;
	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
		goto out;

	err = sync_inode_metadata(inode, 1);
	if (ret == 0)
		ret = err;

out:
	inode_unlock(inode);
	/* check and advance again to catch errors after syncing out buffers */
	err = file_check_and_advance_wb_err(file);
	if (ret == 0)
		ret = err;
	return ret;
}
EXPORT_SYMBOL(__generic_file_fsync);

/**
 * generic_file_fsync - generic fsync implementation for simple filesystems
 *			with flush
 * @file:	file to synchronize
 * @start:	start offset in bytes
 * @end:	end offset in bytes (inclusive)
 * @datasync:	only synchronize essential metadata if true
 *
 */

int generic_file_fsync(struct file *file, loff_t start, loff_t end,
		       int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;

	err = __generic_file_fsync(file, start, end, datasync);
	if (err)
		return err;
	return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
}
EXPORT_SYMBOL(generic_file_fsync);

无论 fsync 还是 fdatasync 都会调用 file_write_and_wait_range 下刷 page cache 中的脏页。而在 indoe 本身元数据只是时间戳是脏时,fdatasync 就会跳过 sync_inode_metadata,不将元数据一起下刷到底层设备上;fsync 则不会跳过元数据的下刷。

因此 fsync 至少需要两次 IO 写操作,开销比 fdatasync 更大

open 时带有 O_SYNC

如果在打开一个文件时带了 O_SYNC 标记,则会在写入 page cache 后,再次调用 vfs_fsync_range 将数据下刷到底层设备上

ext4,f2fs等文件系统
write系统调用
ksys_write
vfs_write
__vfs_write
new_sync_write

ext4f2fs 等文件系统 write 系统调用会使用 new_sync_write 来调用实际文件系统注册的 read_iter 函数

new_sync_write 调用先调用 iocb_flags 将用户配置的 O_SYNC 进行解析,为 struct kiocbki_flags 字段生成标记

new_sync_write
init_sync_kiocb
iocb_flags
iov_iter_init
call_write_iter

和之前一样,大多数文件系统会直接使用或者参考通用的 generic_file_write_iter 来实现 read_iter 函数,这里针对 generic_file_write_iter 进行分析

// mm/filemap.c
/**
 * generic_file_write_iter - write data to a file
 * @iocb:	IO state structure
 * @from:	iov_iter with data to write
 *
 * This is a wrapper around __generic_file_write_iter() to be used by most
 * filesystems. It takes care of syncing the file in case of O_SYNC file
 * and acquires i_mutex as needed.
 * Return:
 * * negative error code if no data has been written at all of
 *   vfs_fsync_range() failed for a synchronous write
 * * number of bytes written, even for truncated writes
 */
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	inode_lock(inode);
	ret = generic_write_checks(iocb, from);
	if (ret > 0)
		ret = __generic_file_write_iter(iocb, from);
	inode_unlock(inode);

	if (ret > 0)
		ret = generic_write_sync(iocb, ret);
	return ret;
}
EXPORT_SYMBOL(generic_file_write_iter);

__generic_file_write_iter 完成之后,实际的数据已经被写入 page cache,之后会调用 generic_write_sync 会将刚刚写入 page cache 的数据通过 vfs_fsync_range 下刷到底层设备上

// include/linux/fs.h
/*
 * Sync the bytes written if this was a synchronous write.  Expect ki_pos
 * to already be updated for the write, and will return either the amount
 * of bytes passed in, or an error if syncing the file failed.
 */
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
{
	if (iocb->ki_flags & IOCB_DSYNC) {
		int ret = vfs_fsync_range(iocb->ki_filp,
				iocb->ki_pos - count, iocb->ki_pos - 1,
				(iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
		if (ret)
			return ret;
	}

	return count;
}

本文作者: ywang_wnlo
本文链接: https://ywang-wnlo.github.io/posts/646202b9/
版权声明: 本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!

  • 2
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
WordPress静态化插件,感觉寻找这样插件的人不在少数,人们都说WordPress非常消耗资源,所以,希望有这样的文件。与 WP-Cache 相同,可以通过缓存来提高大负载下WordPress的性能。WP Super Cache 相比 WP-Cache 最大的改进,在于加入了 Super Cache 这一环节。部分页面会根据一定的规则被缓存为 SuperCache,而这个 SuperCache 实质上就是页面本身。通过 Apache mod_rewrite 重定向直接访问这些静态文件,理论上可以获得更好的性能。 WP-Super-Cache插件安装: 下载WP-Super-Cache插件,然后上传插件到/wp-content/plugins/文件夹。 到wp管理后台插件页面激活插件,激活后插件后,它会出现红色警告,这时需要您在wp-config.php文件里加入如下代码:define('WP_CACHE', true); 在后台的管理页面的Setting-->WP-Super-Cache,对这个插件进行相关的设定,进入这个页面时,在上方它会提示您,让您把wp-content的权限设置为755,设置好后,你就会看到如下图: 上图中三个选项,默认情况下,WP Super Cache 是没有开启的,所以在这里选择下,需按则第一个,就是 Super Cache缓存为静态文件,如果第二种情况呢,则和 WP-Cache 一致,除了图中的这些选项外,页面中还有其它选项,不能一一截图,简单介绍下: Proudly tell the world your server is Digg proof! (places a message in your blog’s footer) 这个是说在你的 footer 显示一条信息告诉读者该博客已经缓存了,一般不要这样做。 Clear all cache files when a post or page is published. (This may significantly slow down saving of posts.) 这个是说发布新日志的时候清理所有缓存,这个肯定不能这么干了。 Cache Contents 这里显示你博客缓存文件和过期文件的数量,你可以手工删除缓存文件和过期文件,因为 WP Super Cache 有垃圾回收机制,所以这里不帮不用手工删除,如果当你测试某个插件,看不到效果的时候,如果确定是缓存的原因,你可以到这里手工删除测试下。 Expiry Time and Garbage Collection 过期时间和垃圾收集,这里主要设置缓存文件过期时间,和多久删除一次过期的缓存文件。个人建议:一般缓存时间设置为1天(86400)左右,缓存删除时间为1个小时一次。 Accepted filenames, rejected URIs 这里设置一些链接不需要缓存,并且可以设置特例。 Rejected User Agents 这里设置某些 User Agents 来访问的时候不会缓存。比如 Google 和百度爬虫等。 Lock Down 一般情况下,某篇日志如果有了新留言,就会更新缓存文件,不过这篇日志是否已经到了缓存时间,但是在某种情况,你预料到你的博客可能会有很大流量和留言,候重复更新缓存文件可能并不是很好的选择,速度反而更慢,这个时候你就可以锁定缓存文件,新留言不再更新缓存日志。 Cache Plugins 缓存插件还支持附加插件,比如作者自己就提供一个插件:No Adverts for Friends,不对朋友显示广告。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值