Linux SYNC-fs

sync writes the data held in buffers and the page cache back to the disk device.

Usage scenarios

1. Running the sync command from the Linux command line

2. Calling the sync library function (see the sketch after this list)

3. Running reboot or poweroff from the Linux command line also goes through sync; the -n option means "Do not sync"
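
For scenario 2, here is a minimal userspace sketch (the file name is made up for this illustration) showing the sync() library call alongside its narrower relatives fsync() and syncfs(), which flush a single file or a single filesystem instead of the whole system:

#define _GNU_SOURCE		/* for syncfs(), glibc >= 2.14 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical file name, used only for this demo */
	int fd = open("/tmp/sync-demo.txt", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "hello\n", 6) != 6)
		perror("write");

	fsync(fd);	/* flush only this file (data + metadata) */
	syncfs(fd);	/* flush only the filesystem that contains fd */
	sync();		/* scenario 2: flush everything, like the sync command */

	close(fd);
	return 0;
}

Build with gcc; sync() itself always succeeds, while fsync()/syncfs() report I/O errors through their return value.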

Code analysis

The sync system call

SYSCALL_DEFINE0(sync)
{
	int nowait = 0, wait = 1;

	wakeup_flusher_threads(0, WB_REASON_SYNC);	/* wake the flusher threads to start writeback */
	iterate_supers(sync_inodes_one_sb, NULL);
	iterate_supers(sync_fs_one_sb, &nowait);
	iterate_supers(sync_fs_one_sb, &wait);
	iterate_bdevs(fdatawrite_one_bdev, NULL);
	iterate_bdevs(fdatawait_one_bdev, NULL);
	if (unlikely(laptop_mode))
		laptop_sync_completion();
	return 0;
}

Waking the flusher threads

In wakeup_flusher_threads(long nr_pages, enum wb_reason reason), nr_pages == 0 means that all dirty data in the cache should be written back: every dirty page is counted and the work is handed to the writeback threads.

/*
 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
{
	struct backing_dev_info *bdi;

	if (!nr_pages)
		nr_pages = get_nr_dirty_pages();	/* count every dirty page in the page cache */

	rcu_read_lock();
	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
		if (!bdi_has_dirty_io(bdi))
			continue;
		__bdi_start_writeback(bdi, nr_pages, false, reason);	/* queue writeback work for this bdi */
	}
	rcu_read_unlock();
}

The function that queues the writeback work:

static void __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
		      bool range_cyclic, enum wb_reason reason)
{
	struct wb_writeback_work *work;

	/*
	 * This is WB_SYNC_NONE writeback, so if allocation fails just
	 * wakeup the thread for old dirty data writeback
	 */
	work = kzalloc(sizeof(*work), GFP_ATOMIC);
	if (!work) {
		trace_writeback_nowork(bdi);
		bdi_wakeup_thread(bdi);
		return;
	}

	work->sync_mode	= WB_SYNC_NONE;
	work->nr_pages	= nr_pages;
	work->range_cyclic = range_cyclic;
	work->reason	= reason;

	bdi_queue_work(bdi, work);
}

It allocates and initializes a new writeback work item and finally adds it to the bdi's work list.
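
To make the queue-and-wake pattern concrete, here is a minimal userspace analogy, not kernel code (struct wb_work, queue_writeback and flusher_main are names invented for this sketch): a producer allocates a work item, appends it to a list under a lock and wakes a worker thread, which drains the list, the same shape as __bdi_start_writeback feeding the per-bdi flusher. Build with gcc -pthread.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct wb_work {			/* loosely modelled on struct wb_writeback_work */
	long nr_pages;
	struct wb_work *next;
};

static struct wb_work *work_list;	/* plays the role of bdi->work_list */
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  work_added = PTHREAD_COND_INITIALIZER;

/* Like __bdi_start_writeback(): build a work item, append it, wake the flusher. */
static void queue_writeback(long nr_pages)
{
	struct wb_work *w = calloc(1, sizeof(*w)), **pos;

	if (!w)
		return;			/* the kernel would still wake the thread here */
	w->nr_pages = nr_pages;

	pthread_mutex_lock(&list_lock);
	for (pos = &work_list; *pos; pos = &(*pos)->next)
		;			/* append at the tail, like list_add_tail() */
	*pos = w;
	pthread_cond_signal(&work_added);	/* "wake up the flusher thread" */
	pthread_mutex_unlock(&list_lock);
}

/* Plays the role of the per-bdi flusher: sleep until work arrives, then drain it. */
static void *flusher_main(void *arg)
{
	(void)arg;
	for (;;) {
		struct wb_work *w;

		pthread_mutex_lock(&list_lock);
		while (!work_list)
			pthread_cond_wait(&work_added, &list_lock);
		w = work_list;
		work_list = w->next;
		pthread_mutex_unlock(&list_lock);

		if (w->nr_pages == 0) {	/* sentinel used only to end this demo */
			free(w);
			return NULL;
		}
		printf("flusher: writing back %ld pages\n", w->nr_pages);
		free(w);
	}
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, flusher_main, NULL);
	queue_writeback(1024);	/* sync queues one WB_SYNC_NONE work item per bdi */
	queue_writeback(0);	/* demo-only stop marker */
	pthread_join(tid, NULL);
	return 0;
}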

Syncing the files of every filesystem in the system

iterate_supers looks up every filesystem (superblock) in the system and runs the callback f on each one:

void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
{
	struct super_block *sb, *p = NULL;

	spin_lock(&sb_lock);
	list_for_each_entry(sb, &super_blocks, s_list) {
		if (hlist_unhashed(&sb->s_instances))
			continue;
		sb->s_count++;
		spin_unlock(&sb_lock);

		down_read(&sb->s_umount);
		if (sb->s_root && (sb->s_flags & MS_BORN))
			f(sb, arg);
		up_read(&sb->s_umount);

		spin_lock(&sb_lock);
		if (p)
			__put_super(p);
		p = sb;
	}
	if (p)
		__put_super(p);
	spin_unlock(&sb_lock);
}

Normally the superblock is updated in a few situations: the sync command, umount, and mount.

static void sync_inodes_one_sb(struct super_block *sb, void *arg)
{
	if (!(sb->s_flags & MS_RDONLY))
		sync_inodes_sb(sb);
}

sync_inodes_sb builds a work item, adds it to the work list, and then waits until the filesystem's dirty files have been flushed:

/**
 * sync_inodes_sb	-	sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block.
 */
void sync_inodes_sb(struct super_block *sb)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct wb_writeback_work work = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.nr_pages	= LONG_MAX,
		.range_cyclic	= 0,
		.done		= &done,
		.reason		= WB_REASON_SYNC,
		.for_sync	= 1,
	};

	/* Nothing to do? */
	if (sb->s_bdi == &noop_backing_dev_info)
		return;
	WARN_ON(!rwsem_is_locked(&sb->s_umount));

	bdi_queue_work(sb->s_bdi, &work);
	wait_for_completion(&done);

	wait_sb_inodes(sb);
}

sb is the superblock, i.e. one filesystem; an inode represents one file, and the file's (discontiguous) data blocks are reached through its i_mapping:

static void wait_sb_inodes(struct super_block *sb)
{
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {	/* walk every inode of this filesystem */
		struct address_space *mapping = inode->i_mapping;	/* the file's page-cache address space */

		spin_lock(&inode->i_lock);
		if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) ||
		    (mapping->nrpages == 0)) {
			spin_unlock(&inode->i_lock);
			continue;
		}

		filemap_fdatawait(mapping);	/* wait for the cached pages to finish writeback */
			/* -> filemap_fdatawait_range() */

filemap_fdatawait_range checks whether all of one file's cached data has finished writeback: it looks up all of the file's pages through the mapping, then calls wait_on_page_writeback on each page. If the page has already been written back, it moves on to the next page; otherwise the task sleeps in the uninterruptible D state until it is woken.

int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret2, ret = 0;

	if (end_byte < start_byte)
		goto out;

	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);
			if (TestClearPageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
out:
	ret2 = filemap_check_errors(mapping);
	if (!ret)
		ret = ret2;

	return ret;
}

We stop the analysis here; the io_schedule path inside wait_on_page_writeback is not covered in this article.

wait_sb_inodes(sb) goes through every in-use inode of the superblock and checks whether the pages of its mapping have been written back; if writeback has finished the task is woken by the writeback path, otherwise it blocks in the uninterruptible D state and waits.
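
From userspace, the same write-then-wait behaviour can be requested for a single file range with sync_file_range(2). A minimal sketch (Linux-specific; the file name is made up, and unlike fsync this call flushes neither file metadata nor the disk's write cache):

#define _GNU_SOURCE		/* for sync_file_range() */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char buf[] = "dirty data\n";
	int fd = open("/tmp/sfr-demo.img", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, buf, strlen(buf)) < 0)
		perror("write");

	/* Start writeback of the whole file and wait for it to finish:
	 * roughly filemap_fdatawrite() followed by filemap_fdatawait(). */
	if (sync_file_range(fd, 0, 0,
			    SYNC_FILE_RANGE_WAIT_BEFORE |
			    SYNC_FILE_RANGE_WRITE |
			    SYNC_FILE_RANGE_WAIT_AFTER) < 0)
		perror("sync_file_range");

	close(fd);
	return 0;
}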

Multi-process sync test

Start 6 processes, each repeatedly writing a 16 MB file and running sync right after the write. This verifies that when other processes keep writing into the cache while a sync is in progress, the sync can stay blocked for a very long time, and that once one sync is stuck, all the other syncs get stuck as well.

Per-process script (n is the process index, a is an incrementing file-name counter):

n=1;a=1;while true;do dd if=/dev/zero of=$n-$a.img bs=1M count=16 2>/dev/null;sync;echo $n="$a";a=`expr $a + 1`;done &

ps shows the process states:

31118 root      2732 D    sync
31119 root      2732 D    sync
31122 root      2732 D    sync
31123 root      2732 D    sync
31126 root      2732 D    sync
31131 root      2732 D    sync

13298 root      2732 D    sync
13302 root      2732 D    sync
13306 root      2732 D    sync
13314 root      2732 D    sync
13320 root      2732 D    sync
13576 root      2732 D    sync

Test conclusions

As soon as one sync gets stuck, every sync behind it gets stuck as well: the first sync is holding the cache's inodes, and the later syncs block waiting for them to be released before they can check the writeback state themselves.

If every thread or process calls sync right after writing its file, the impact is small in the single-process case; with many processes, however, write throughput drops badly and each sync also blocks the other processes.
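
One way to avoid this in workloads like the test above, offered only as a sketch: flush just the file that was written, with fdatasync (or fsync), instead of a global sync, so one process's flush no longer waits on every other process's dirty pages. A minimal C version of one loop iteration (the file name and size simply mirror the dd test):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define CHUNK (1024 * 1024)	/* write 16 x 1 MB = 16 MB, matching the dd test */

int main(void)
{
	char *buf = calloc(1, CHUNK);
	int fd = open("1-1.img", O_WRONLY | O_CREAT | O_TRUNC, 0644);
	int i;

	if (!buf || fd < 0) {
		perror("setup");
		return 1;
	}
	for (i = 0; i < 16; i++)
		if (write(fd, buf, CHUNK) < 0)
			perror("write");

	/* Flush this file only; other processes' dirty pages are not waited on. */
	if (fdatasync(fd) < 0)
		perror("fdatasync");

	close(fd);
	free(buf);
	return 0;
}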

Syncing filesystem data

	iterate_supers(sync_fs_one_sb, &nowait);
	iterate_supers(sync_fs_one_sb, &wait);
static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
	if (!(sb->s_flags & MS_RDONLY) && sb->s_op->sync_fs)	/* not read-only and the fs implements sync_fs */
		sb->s_op->sync_fs(sb, *(int *)arg);
}

For ext4, sync_fs is wired up as:

static const struct super_operations ext4_sops = {
	.sync_fs	= ext4_sync_fs,
};
static int ext4_sync_fs(struct super_block *sb, int wait)
{
	int ret = 0;
	tid_t target;
	bool needs_barrier = false;
	struct ext4_sb_info *sbi = EXT4_SB(sb);

	trace_ext4_sync_fs(sb, wait);
	flush_workqueue(sbi->rsv_conversion_wq);
	/*
	 * Writeback quota in non-journalled quota case - journalled quota has
	 * no dirty dquots
	 */
	dquot_writeback_dquots(sb, -1);
	/*
	 * Data writeback is possible w/o journal transaction, so barrier must
	 * being sent at the end of the function. But we can skip it if
	 * transaction_commit will do it for us.
	 */
	if (sbi->s_journal) {
		target = jbd2_get_latest_transaction(sbi->s_journal);
		if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
		    !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
			needs_barrier = true;

		if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
			if (wait)
				ret = jbd2_log_wait_commit(sbi->s_journal,
							   target);
		}
	} else if (wait && test_opt(sb, BARRIER))
		needs_barrier = true;
	if (needs_barrier) {
		int err;
		err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
		if (!ret)
			ret = err;
	}

	return ret;
}

The function above checks whether the filesystem has a journal; if it does, jbd2 is asked to commit the latest transaction. The journal records the order in which metadata and data are written and what state things should be in once the write completes, after which the journal record is discarded. If power is lost before the data is fully written, the journal is replayed at the next mount to recover a consistent state.
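
To illustrate the write-ahead idea only (this is not jbd2): a toy redo log makes a description of the change durable before touching the data file, so that after a crash the log can simply be replayed. The file names journal.log and data.img, struct record and the helper functions are all invented for this sketch; real jbd2 logs whole blocks and writes a separate commit record after the log blocks.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

struct record {
	long offset;		/* where the payload belongs in data.img */
	char payload[64];	/* the new contents */
};

/* Step 1: make the intent durable in the journal before touching data.img. */
static int journal_append(int jfd, const struct record *r)
{
	if (write(jfd, r, sizeof(*r)) != sizeof(*r))
		return -1;
	return fdatasync(jfd);	/* like a journal commit plus barrier */
}

/* Step 2: apply the change at its real location.  A crash between step 1
 * and step 2 is safe, because replay() will redo the logged record. */
static int apply(int dfd, const struct record *r)
{
	if (pwrite(dfd, r->payload, strlen(r->payload), r->offset) < 0)
		return -1;
	return fdatasync(dfd);
}

/* Recovery: what a post-crash mount would do - re-apply every logged record. */
static void replay(int jfd, int dfd)
{
	struct record r;

	lseek(jfd, 0, SEEK_SET);
	while (read(jfd, &r, sizeof(r)) == sizeof(r))
		apply(dfd, &r);
}

int main(void)
{
	int jfd = open("journal.log", O_RDWR | O_CREAT, 0644);
	int dfd = open("data.img", O_RDWR | O_CREAT, 0644);
	struct record r = { .offset = 0 };

	if (jfd < 0 || dfd < 0) {
		perror("open");
		return 1;
	}
	snprintf(r.payload, sizeof(r.payload), "hello, journaled write\n");

	if (journal_append(jfd, &r) == 0)
		apply(dfd, &r);
	replay(jfd, dfd);	/* idempotent: replaying a redo log is harmless */

	close(jfd);
	close(dfd);
	return 0;
}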

Writing back cached data

Next, flush the remaining cached data: iterate_bdevs walks the block-device inodes in the system and finds their page-cache pages through i_mapping.

void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
{
	struct inode *inode, *old_inode = NULL;

	spin_lock(&inode_sb_list_lock);
	list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
		struct address_space *mapping = inode->i_mapping;

		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
		    mapping->nrpages == 0) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);
		spin_unlock(&inode->i_lock);
		spin_unlock(&inode_sb_list_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have been
		 * removed from s_inodes list while we dropped the
		 * inode_sb_list_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it under
		 * inode_sb_list_lock. So we keep the reference and iput it
		 * later.
		 */
		iput(old_inode);
		old_inode = inode;

		func(I_BDEV(inode), arg);

		spin_lock(&inode_sb_list_lock);
	}
	spin_unlock(&inode_sb_list_lock);
	iput(old_inode);
}
static void fdatawrite_one_bdev(struct block_device *bdev, void *arg)
{
	filemap_fdatawrite(bdev->bd_inode->i_mapping);
}
int filemap_fdatawrite(struct address_space *mapping)
{
	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
}
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
				loff_t end, int sync_mode)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode = sync_mode,
		.nr_to_write = LONG_MAX,
		.range_start = start,
		.range_end = end,
	};

	if (!mapping_cap_writeback_dirty(mapping))
		return 0;

	ret = do_writepages(mapping, &wbc);
	return ret;
}
int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	int ret;

	if (wbc->nr_to_write <= 0)
		return 0;
	if (mapping->a_ops->writepages)
		ret = mapping->a_ops->writepages(mapping, wbc);
	else
		ret = generic_writepages(mapping, wbc);
	return ret;
}

ext4_da_writepages is called first; later the path also comes back through generic_writepages here, and ultimately __writepage is called.

int generic_writepages(struct address_space *mapping,
		       struct writeback_control *wbc)
{
	struct blk_plug plug;
	int ret;

	/* deal with chardevs and other special file */
	if (!mapping->a_ops->writepage)
		return 0;

	blk_start_plug(&plug);
	ret = write_cache_pages(mapping, wbc, __writepage, mapping);
	blk_finish_plug(&plug);
	return ret;
}

Looking at write_cache_pages:

int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data)
{
	while (!done && (index <= end)) {
		int i;

		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
			      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
		if (nr_pages == 0)
			break;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			ret = (*writepage)(page, wbc, data);

Finally the writepage callback is invoked; for ext4, .writepage = ext4_writepage:

static int __writepage(struct page *page, struct writeback_control *wbc,
		       void *data)
{
	struct address_space *mapping = data;
	int ret = mapping->a_ops->writepage(page, wbc);
	mapping_set_error(mapping, ret);
	return ret;
}

Writing dirty pages to disk

ext4_writepage
	ext4_bio_write_page
		io_submit_add_bh
			io_submit_init
				bio->bi_end_io = ext4_end_bio
	ext4_io_submit
		submit_bio		// submit a bio to the block layer; the total page count can be tallied here: io_bio += (count/8);
			generic_make_request
				q->make_request_fn(q, bio) //blk_queue_bio
					get_request
					init_request_from_bio
					plug = current->plug;
					__blk_run_queue
						q->request_fn(q) == scsi_request_fn	// submit the request to the SCSI layer; pages submitted: io_in += (scsi_bufflen(cmd)/4096);
							scsi_init_cmd_errh
							scsi_dispatch_cmd
								cmd->scsi_done = scsi_done;
								host->hostt->queuecommand(host, cmd);
ext4_da_writepages
	mpage_da_map_and_submit
		mpage_da_submit_io
			ext4_io_submit
				submit_bio
					generic_make_request
						q->make_request_fn(q, bio);
							blk_queue_make_request(q, blk_queue_bio);
							q->make_request_fn = mfn;
								blk_queue_bio
									blk_queue_bounce
									get_request
									init_request_from_bio
									__blk_run_queue(q);
										__blk_run_queue_uncond(q);
											q->request_fn(q) == scsi_request_fn

That completes the analysis of the filesystem layer; next comes the block layer, starting from submit_bio.

Waiting for the cached data

static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
{
	filemap_fdatawait(bdev->bd_inode->i_mapping);
}
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
			    loff_t end_byte)
{
	pgoff_t index = start_byte >> PAGE_CACHE_SHIFT;
	pgoff_t end = end_byte >> PAGE_CACHE_SHIFT;
	struct pagevec pvec;
	int nr_pages;
	int ret2, ret = 0;

	if (end_byte < start_byte)
		goto out;
	printk(KERN_ERR "wait,in\n");
	pagevec_init(&pvec, 0);
	while ((index <= end) &&
			(nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
			PAGECACHE_TAG_WRITEBACK,
			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/* until radix tree lookup accepts end_index */
			if (page->index > end)
				continue;

			wait_on_page_writeback(page);	/* sleep in D state until writeback completes */
			if (TestClearPageError(page))
				ret = -EIO;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
out:
	ret2 = filemap_check_errors(mapping);
	if (!ret)
		ret = ret2;
	printk(KERN_ERR "wait,out\n");

	return ret;
}
static inline void wait_on_page_writeback(struct page *page)
{
	if (PageWriteback(page))
		wait_on_page_bit(page, PG_writeback);
}
void wait_on_page_bit(struct page *page, int bit_nr)
{
	DEFINE_WAIT_BIT(wait, &page->flags, bit_nr);

	if (test_bit(bit_nr, &page->flags))
		__wait_on_bit(page_waitqueue(page), &wait, bit_wait_io,
							TASK_UNINTERRUPTIBLE);
}

The function above is where the task enters the uninterruptible D state.

 
