Linux Page Cache Writeback Mechanism Source Code Analysis, Based on Kernel 3.8: Writeback

Here is another repost, this one based on kernel version 3.8. This version is essentially the same as 3.2, but the write-up is more detailed, so it is reposted here as well for comparison.

Reposted from: Linux 3.8 Writeback Mechanism Source Code Analysis

https://blog.csdn.net/bysun2013/article/details/29243573

Writeback-related data structures


The main data structures related to writeback are:

- backing_dev_info: describes everything about a backing device; a block device's request queue usually embeds a backing_dev_info object.
- bdi_writeback: encapsulates the writeback kernel thread and the inode lists it operates on.
- wb_writeback_work: encapsulates a writeback work item.


Their definitions are as follows:


struct backing_dev_info {
	struct list_head bdi_list;
	unsigned long ra_pages;	/* max readahead in PAGE_CACHE_SIZE units */
	unsigned long state;	/* Always use atomic bitops on this */
	unsigned int capabilities; /* Device capabilities */
	congested_fn *congested_fn; /* Function pointer if device is md/dm */
	void *congested_data;	/* Pointer to aux data for congested func */

	char *name;

	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];

	unsigned long bw_time_stamp;	/* last time write bw is updated */
	unsigned long dirtied_stamp;
	unsigned long written_stamp;	/* pages written at bw_time_stamp */
	unsigned long write_bandwidth;	/* the estimated write bandwidth */
	unsigned long avg_write_bandwidth; /* further smoothed write bw */

	/*
	 * The base dirty throttle rate, re-calculated on every 200ms.
	 * All the bdi tasks' dirty rate will be curbed under it.
	 * @dirty_ratelimit tracks the estimated @balanced_dirty_ratelimit
	 * in small steps and is much more smooth/stable than the latter.
	 */
	unsigned long dirty_ratelimit;
	unsigned long balanced_dirty_ratelimit;

	struct fprop_local_percpu completions;
	int dirty_exceeded;

	unsigned int min_ratio;
	unsigned int max_ratio, max_prop_frac;

	struct bdi_writeback wb;  /* default writeback info for this bdi */
	spinlock_t wb_lock;	  /* protects work_list */

	struct list_head work_list;

	struct device *dev;

	struct timer_list laptop_mode_wb_timer;

#ifdef CONFIG_DEBUG_FS
	struct dentry *debug_dir;
	struct dentry *debug_stats;
#endif
};

struct bdi_writeback {
	struct backing_dev_info *bdi;	/* our parent bdi */
	unsigned int nr;

	unsigned long last_old_flush;	/* last old data flush */
	unsigned long last_active;	/* last time bdi thread was active */

	struct task_struct *task;	/* writeback thread */
	struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
	struct list_head b_dirty;	/* dirty inodes */
	struct list_head b_io;		/* parked for writeback */
	struct list_head b_more_io;	/* parked for more writeback */
	spinlock_t list_lock;		/* protects the b_* lists */
};

/*
 * Passed into wb_writeback(), essentially a subset of writeback_control
 */
struct wb_writeback_work {
	long nr_pages;
	struct super_block *sb;
	unsigned long *older_than_this;
	enum writeback_sync_modes sync_mode;
	unsigned int tagged_writepages:1;
	unsigned int for_kupdate:1;
	unsigned int range_cyclic:1;
	unsigned int for_background:1;
	enum wb_reason reason;		/* why was writeback initiated? */

	struct list_head list;		/* pending work list */
	struct completion *done;	/* set if the caller waits */
};

1. The BDI structure describes a block device. A bdi object is registered on the system-wide bdi list when its block device is added. For ext3, the underlying block device's bdi object is attached to the ext3 root_inode at mount time. The bdi structure contains a work_list, which holds the tasks the writeback kernel thread needs to process. If there is no work on this list, the writeback kernel thread sleeps and waits.

2. The bdi_writeback object encapsulates the kernel thread (task) and the inode lists to be processed. When the page cache/buffer cache needs to flush an inode on the radix tree, the inode can be put on the writeback object's b_dirty list and the writeback thread woken up. During processing, the inode is moved to the b_io list. Splitting inodes across multiple lists reduces contention on shared resources between threads.

3. wb_writeback_work encapsulates a writeback task; different tasks can use different flushing policies. These work items are what the writeback thread processes; when the work list is empty, the kernel thread can go to sleep. A sketch of how a work item gets queued follows.
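
To make the interaction between these three structures concrete, here is a sketch of how a work item is queued to a bdi and how the thread is woken, modeled on the bdi_queue_work()/bdi_wakeup_flusher() helpers in fs/fs-writeback.c of this kernel series (simplified; tracepoints omitted):

static void bdi_queue_work(struct backing_dev_info *bdi,
			   struct wb_writeback_work *work)
{
	spin_lock_bh(&bdi->wb_lock);
	/* Park the work item on the bdi's pending list... */
	list_add_tail(&work->list, &bdi->work_list);
	if (bdi->wb.task)
		/* ...and wake the bdi's own flusher thread... */
		wake_up_process(bdi->wb.task);
	else
		/* ...or the forker thread, which will create one. */
		wake_up_process(default_backing_dev_info.wb.task);
	spin_unlock_bh(&bdi->wb_lock);
}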

Analysis of the main writeback functions

The main functions of the writeback mechanism fall into two areas:

1. Managing bdi objects and forking the corresponding writeback kernel threads to flush cached data.

2. The writeback kernel thread handler itself, which implements the flushing of dirty pages.

 

Writeback thread management


Linux runs a kernel daemon thread that manages the system bdi list and creates writeback threads for block devices. When a bdi has dirty pages but no kernel thread has been allocated for it yet, bdi_forker_thread allocates one; when a writeback thread has been idle for a long time (5 minutes by default), bdi_forker_thread releases its thread resources.


static int bdi_forker_thread(void *ptr)
{
	struct bdi_writeback *me = ptr;

	current->flags |= PF_SWAPWRITE;
	set_freezable();

	/*
	 * Our parent may run at a different priority, just set us to normal
	 */
	set_user_nice(current, 0);

	for (;;) {
		struct task_struct *task = NULL;
		struct backing_dev_info *bdi;
		enum {
			NO_ACTION,   /* Nothing to do */
			FORK_THREAD, /* Fork bdi thread */
			KILL_THREAD, /* Kill inactive bdi thread */
		} action = NO_ACTION;

		/*
		 * Temporary measure, we want to make sure we don't see
		 * dirty data on the default backing_dev_info
		 */
		if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
			del_timer(&me->wakeup_timer);
			wb_do_writeback(me, 0);
		}

		spin_lock_bh(&bdi_lock);
		/*
		 * In the following loop we are going to check whether we have
		 * some work to do without any synchronization with tasks
		 * waking us up to do work for them. Set the task state here
		 * so that we don't miss wakeups after verifying conditions.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		list_for_each_entry(bdi, &bdi_list, bdi_list) {
			bool have_dirty_io;

			if (!bdi_cap_writeback_dirty(bdi) ||
			     bdi_cap_flush_forker(bdi))
				continue;

			WARN(!test_bit(BDI_registered, &bdi->state),
			     "bdi %p/%s is not registered!\n", bdi, bdi->name);

			have_dirty_io = !list_empty(&bdi->work_list) ||
					wb_has_dirty_io(&bdi->wb);

			/*
			 * If the bdi has work to do, but the thread does not
			 * exist - create it.
			 */
			if (!bdi->wb.task && have_dirty_io) {
				/*
				 * Set the pending bit - if someone will try to
				 * unregister this bdi - it'll wait on this bit.
				 */
				set_bit(BDI_pending, &bdi->state);
				action = FORK_THREAD;
				break;
			}

			spin_lock(&bdi->wb_lock);
			/*
			 * If there is no work to do and the bdi thread was
			 * inactive long enough - kill it. The wb_lock is taken
			 * to make sure no-one adds more work to this bdi and
			 * wakes the bdi thread up.
			 */
			if (bdi->wb.task && !have_dirty_io &&
			    time_after(jiffies, bdi->wb.last_active +
						bdi_longest_inactive())) {
				task = bdi->wb.task;
				bdi->wb.task = NULL;
				spin_unlock(&bdi->wb_lock);
				set_bit(BDI_pending, &bdi->state);
				action = KILL_THREAD;
				break;
			}
			spin_unlock(&bdi->wb_lock);
		}
		spin_unlock_bh(&bdi_lock);

		/* Keep working if default bdi still has things to do */
		if (!list_empty(&me->bdi->work_list))
			__set_current_state(TASK_RUNNING);

		switch (action) {
		case FORK_THREAD:
			__set_current_state(TASK_RUNNING);
			task = kthread_create(bdi_writeback_thread, &bdi->wb,
					      "flush-%s", dev_name(bdi->dev));
			if (IS_ERR(task)) {
				/*
				 * If thread creation fails, force writeout of
				 * the bdi from the thread. Hopefully 1024 is
				 * large enough for efficient IO.
				 */
				writeback_inodes_wb(&bdi->wb, 1024,
						    WB_REASON_FORKER_THREAD);
			} else {
				/*
				 * The spinlock makes sure we do not lose
				 * wake-ups when racing with 'bdi_queue_work()'.
				 * And as soon as the bdi thread is visible, we
				 * can start it.
				 */
				spin_lock_bh(&bdi->wb_lock);
				bdi->wb.task = task;
				spin_unlock_bh(&bdi->wb_lock);
				wake_up_process(task);
			}
			bdi_clear_pending(bdi);
			break;

		case KILL_THREAD:
			__set_current_state(TASK_RUNNING);
			kthread_stop(task);
			bdi_clear_pending(bdi);
			break;

		case NO_ACTION:
			if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
				/*
				 * There are no dirty data. The only thing we
				 * should now care about is checking for
				 * inactive bdi threads and killing them. Thus,
				 * let's sleep for longer time, save energy and
				 * be friendly for battery-driven devices.
				 */
				schedule_timeout(bdi_longest_inactive());
			else
				schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
			try_to_freeze();
			break;
		}
	}

	return 0;
}
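
The 5-minute idle limit mentioned above comes from bdi_longest_inactive(), which the forker loop calls twice. For reference, in this kernel series it looks roughly like this (from mm/backing-dev.c; details may differ slightly in 3.8):

/*
 * Calculate the longest interval (jiffies) bdi threads are allowed to be
 * inactive: at least 5 minutes, or one periodic writeback interval if
 * that is longer.
 */
static unsigned long bdi_longest_inactive(void)
{
	unsigned long interval;

	interval = msecs_to_jiffies(dirty_writeback_interval * 10);
	return max(5UL * 60 * HZ, interval);
}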

The writeback worker thread

The writeback thread is created by bdi_forker_thread; its job is to process the pending write-back work. The thread handler is bdi_writeback_thread, implemented as follows:

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_thread(void *data)
{
	struct bdi_writeback *wb = data;
	struct backing_dev_info *bdi = wb->bdi;
	long pages_written;

	current->flags |= PF_SWAPWRITE;
	set_freezable();
	wb->last_active = jiffies;

	/*
	 * Our parent may run at a different priority, just set us to normal
	 */
	set_user_nice(current, 0);

	trace_writeback_thread_start(bdi);

	while (!kthread_freezable_should_stop(NULL)) {
		/*
		 * Remove own delayed wake-up timer, since we are already awake
		 * and we'll take care of the periodic write-back.
		 */
		del_timer(&wb->wakeup_timer);

		pages_written = wb_do_writeback(wb, 0);

		trace_writeback_pages_written(pages_written);

		if (pages_written)
			wb->last_active = jiffies;

		set_current_state(TASK_INTERRUPTIBLE);
		if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
			__set_current_state(TASK_RUNNING);
			continue;
		}

		if (wb_has_dirty_io(wb) && dirty_writeback_interval)
			schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
		else {
			/*
			 * We have nothing to do, so can go sleep without any
			 * timeout and save power. When a work is queued or
			 * something is made dirty - we will be woken up.
			 */
			schedule();
		}
	}

	/* Flush any work that raced with us exiting */
	if (!list_empty(&bdi->work_list))
		wb_do_writeback(wb, 1);

	trace_writeback_thread_stop(bdi);
	return 0;
}

bdi_writeback_thread mainly calls wb_do_writeback():

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct wb_writeback_work *work;
	long wrote = 0;

	set_bit(BDI_writeback_running, &wb->bdi->state);
	while ((work = get_next_work_item(bdi)) != NULL) {
		/*
		 * Override sync mode, in case we must wait for completion
		 * because this thread is exiting now.
		 */
		if (force_wait)
			work->sync_mode = WB_SYNC_ALL;

		trace_writeback_exec(bdi, work);

		wrote += wb_writeback(wb, work);

		/*
		 * Notify the caller of completion if this is a synchronous
		 * work item, otherwise just free it.
		 */
		if (work->done)
			complete(work->done);
		else
			kfree(work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);
	wrote += wb_check_background_flush(wb);
	clear_bit(BDI_writeback_running, &wb->bdi->state);

	return wrote;
}
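
get_next_work_item(), called in the loop above, simply dequeues the next pending item from bdi->work_list under wb_lock. Roughly (as in fs/fs-writeback.c of this series):

/*
 * Remove the next work item from the bdi's pending list, or return NULL
 * if there is none; wb_do_writeback() loops until this returns NULL.
 */
static struct wb_writeback_work *
get_next_work_item(struct backing_dev_info *bdi)
{
	struct wb_writeback_work *work = NULL;

	spin_lock_bh(&bdi->wb_lock);
	if (!list_empty(&bdi->work_list)) {
		work = list_entry(bdi->work_list.next,
				  struct wb_writeback_work, list);
		list_del_init(&work->list);
	}
	spin_unlock_bh(&bdi->wb_lock);
	return work;
}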

The main job of wb_check_old_data_flush is to periodically check for dirty pages and write them back: by default it writes back pages dirtied more than 30 seconds ago, scanning once every 5 seconds.


static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	/*
	 * When set to zero, disable periodic writeback
	 */
	if (!dirty_writeback_interval)
		return 0;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = get_nr_dirty_pages();

	if (nr_pages) {
		struct wb_writeback_work work = {
			.nr_pages	= nr_pages,
			.sync_mode	= WB_SYNC_NONE,
			.for_kupdate	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_PERIODIC,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}

wb_check_background_flush writes back dirty pages once they exceed a certain proportion of memory, and keeps writing until the dirty ratio drops below the threshold.


static long wb_check_background_flush(struct bdi_writeback *wb)
{
	if (over_bground_thresh(wb->bdi)) {
		struct wb_writeback_work work = {
			.nr_pages	= LONG_MAX,
			.sync_mode	= WB_SYNC_NONE,
			.for_background	= 1,
			.range_cyclic	= 1,
			.reason		= WB_REASON_BACKGROUND,
		};

		return wb_writeback(wb, &work);
	}

	return 0;
}
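
The threshold test itself is over_bground_thresh(), which checks both the global dirty page count and this bdi's share against the background threshold. Its implementation in this series is roughly the following (simplified from fs/fs-writeback.c):

static bool over_bground_thresh(struct backing_dev_info *bdi)
{
	unsigned long background_thresh, dirty_thresh;

	global_dirty_limits(&background_thresh, &dirty_thresh);

	/* System-wide: dirty + unstable NFS pages over the threshold? */
	if (global_page_state(NR_FILE_DIRTY) +
	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
		return true;

	/* Or this bdi alone over its proportional share of the threshold? */
	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
	    bdi_dirty_limit(bdi, background_thresh))
		return true;

	return false;
}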

Both wb_check_background_flush and wb_check_old_data_flush merely set up the fields of a wb_writeback_work and then call wb_writeback, the function that actually performs write-back. All disk write-back in the writeback mechanism goes through wb_writeback, which invokes the filesystem-specific write routines to carry out the write-back to disk.


/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb,
			 struct wb_writeback_work *work)
{
	unsigned long wb_start = jiffies;
	long nr_pages = work->nr_pages;
	unsigned long oldest_jif;
	struct inode *inode;
	long progress;

	oldest_jif = jiffies;
	work->older_than_this = &oldest_jif;

	spin_lock(&wb->list_lock);
	for (;;) {
		/*
		 * Stop writeback when nr_pages has been consumed
		 */
		if (work->nr_pages <= 0)
			break;

		/*
		 * Background writeout and kupdate-style writeback may
		 * run forever. Stop them if there is other work to do
		 * so that e.g. sync can proceed. They'll be restarted
		 * after the other works are all done.
		 */
		if ((work->for_background || work->for_kupdate) &&
		    !list_empty(&wb->bdi->work_list))
			break;

		/*
		 * For background writeout, stop when we are below the
		 * background dirty threshold
		 */
		if (work->for_background && !over_bground_thresh(wb->bdi))
			break;

		/*
		 * Kupdate and background works are special and we want to
		 * include all inodes that need writing. Livelock avoidance is
		 * handled by these works yielding to any other work so we are
		 * safe.
		 */
		if (work->for_kupdate) {
			oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
		} else if (work->for_background)
			oldest_jif = jiffies;

		trace_writeback_start(wb->bdi, work);
		if (list_empty(&wb->b_io))
			queue_io(wb, work);
		if (work->sb)
			progress = writeback_sb_inodes(work->sb, wb, work);
		else
			progress = __writeback_inodes_wb(wb, work);
		trace_writeback_written(wb->bdi, work);

		wb_update_bandwidth(wb, wb_start);

		/*
		 * Did we write something? Try for more
		 *
		 * Dirty inodes are moved to b_io for writeback in batches.
		 * The completion of the current batch does not necessarily
		 * mean the overall work is done. So we keep looping as long
		 * as made some progress on cleaning pages or inodes.
		 */
		if (progress)
			continue;

		/*
		 * No more inodes for IO, bail
		 */
		if (list_empty(&wb->b_more_io))
			break;

		/*
		 * Nothing written. Wait for some inode to
		 * become available for writeback. Otherwise
		 * we'll just busyloop.
		 */
		if (!list_empty(&wb->b_more_io)) {
			trace_writeback_wait(wb->bdi, work);
			inode = wb_inode(wb->b_more_io.prev);
			spin_lock(&inode->i_lock);
			spin_unlock(&wb->list_lock);
			/* This function drops i_lock... */
			inode_sleep_on_writeback(inode);
			spin_lock(&wb->list_lock);
		}
	}
	spin_unlock(&wb->list_lock);

	return nr_pages - work->nr_pages;
}
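
The queue_io() call in the loop above is what refills b_io: it first splices back the leftovers parked on b_more_io, then moves inodes older than work->older_than_this from b_dirty onto b_io. Roughly (from fs/fs-writeback.c of this series, simplified):

/*
 * Queue all expired dirty inodes for io, eldest first.
 */
static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
{
	int moved;

	assert_spin_locked(&wb->list_lock);
	/* Retry the inodes left over from the previous pass first. */
	list_splice_init(&wb->b_more_io, &wb->b_io);
	/* Then pull in every inode whose dirty time has expired. */
	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
	trace_writeback_queue_io(wb, work, moved);
}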

Summary

The writeback mechanism is fairly simple: at its core, a resident kernel thread allocates a writeback thread for each BDI object, and these threads flush the dirty pages in the cache back to disk.
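
For completeness, the wakeup side: when an inode is first dirtied, __mark_inode_dirty() does not wake the flusher thread immediately but arms the bdi's wakeup_timer (the timer seen in struct bdi_writeback above), so that freshly dirtied inodes are batched into a single pass. The helper looks roughly like this (modeled on bdi_wakeup_thread_delayed() in mm/backing-dev.c; details may differ in 3.8):

void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
{
	unsigned long timeout;

	/*
	 * Fire one periodic-writeback interval from now (default 5s);
	 * the timer handler then wakes bdi->wb.task, or the forker
	 * thread if no bdi thread exists yet.
	 */
	timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
	mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
}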

 
