f2fs write_checkpoint 过程分析

最新推荐文章于 2024-06-07 17:30:11 发布

东大坡居士

最新推荐文章于 2024-06-07 17:30:11 发布

阅读量1.1k

点赞数 3

分类专栏： linux 文件系统 f2fs 文章标签： linux 文件系统 f2fs writecheckpoin

本文链接：https://blog.csdn.net/tianweishuiguo/article/details/102637531

版权

linux 文件系统 f2fs 专栏收录该内容

9 篇文章 4 订阅

订阅专栏

write_checkpoint 主要负责把 cache中dirty的数据写回到磁盘中，在gc, trim, discard或者recovery的时候都会调用到。

int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
	unsigned long long ckpt_ver;
	int err = 0;

	mutex_lock(&sbi->cp_mutex);

	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
		((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) ||
		((cpc->reason & CP_DISCARD) && !sbi->discard_blks)))
		goto out;
	if (unlikely(f2fs_cp_error(sbi))) {
		err = -EIO;
		goto out;
	}
	if (f2fs_readonly(sbi->sb)) {
		err = -EROFS;
		goto out;
	}

先看一下，传入函数的参数有两个， f2fs_sb_info *sbi，与cp_control * cpc:

1) f2fs_sb_info * sbi: f2fs super block

2) struct cp_control *cpc: check point 控制结构体, 里面保存了本次check point操作的参数，其中reason字段的取值可以为CP_RECOVERY, CP_DISCARD, CP_TRIMMED, CP_SYNC, CP_UMOUNT等，表示在何种场景下进行check point操作。

上段代码，首先判断：如果sbi没有被标记为dirty（SBI_IS_DIRTY未置位），并且cpc->reason包含CP_FASTBOOT或CP_SYNC，或者cpc->reason包含CP_DISCARD但discard blocks个数为0，则直接退出，不做任何操作。

接下来判断，check point是否有错误，如果有直接退出，f2fs是否为只读的，如果是直接退出。

	err = block_operations(sbi);
	if (err)
		goto out;

block_operations函数的作用是为了做checkpoint而将所有FS操作都冻结住（Freeze all the FS-operations for checkpoint), 我们看看具体是怎样冻结住的。

static int block_operations(struct f2fs_sb_info *sbi)
{
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_ALL,
		.nr_to_write = LONG_MAX,
		.for_reclaim = 0,
	};
	struct blk_plug plug;
	int err = 0;

	blk_start_plug(&plug);

retry_flush_dents:
	f2fs_lock_all(sbi);
	/* write all the dirty dentry pages */
	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
		f2fs_unlock_all(sbi);
		err = sync_dirty_inodes(sbi, DIR_INODE);
		if (err)
			goto out;
		cond_resched();
		goto retry_flush_dents;
	}

首先将所有dentry相关的dirty pages同步写回，这个写回过程要先进行f2fs_lock_all(sbi)操作，我们发现，此过程结束的条件是不再有F2FS_DIRTY_DENTS类型的脏页, 但是结束时并没有释放锁，即没有调用f2fs_unlock_all(sbi).

/*
	 * POR: we should ensure that there are no dirty node pages
	 * until finishing nat/sit flush. inode->i_blocks can be updated.
	 */
	down_write(&sbi->node_change);

	if (get_pages(sbi, F2FS_DIRTY_IMETA)) {
		up_write(&sbi->node_change);
		f2fs_unlock_all(sbi);
		err = f2fs_sync_inode_meta(sbi);
		if (err)
			goto out;
		cond_resched();
		goto retry_flush_dents;
	}

接下来，又对所有dirty的inode元数据（F2FS_DIRTY_IMETA）进行sync写回操作，过程与上面相同：如果还有脏的inode元数据，就先释放node_change和f2fs_lock_all锁，写回之后再从头重试；最后退出时同样没有进行f2fs_unlock_all(sbi)，执行到这里，还占据着f2fs_lock_all锁（以及node_change信号量）。

retry_flush_nodes:
	down_write(&sbi->node_write);

	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
		up_write(&sbi->node_write);
		err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO);
		if (err) {
			up_write(&sbi->node_change);
			f2fs_unlock_all(sbi);
			goto out;
		}
		cond_resched();
		goto retry_flush_nodes;
	}

最后，对所有的dirty node pages做sync操作，执行到最后，占据着两个锁, 一个是f2fs_lock_all锁，一个是node_write锁。

/*
 * Acquire sbi->cp_rwsem in write mode.
 *
 * NOTE(review): presumably ordinary FS operations take this semaphore for
 * read, so holding it for write stalls them until it is released -- confirm
 * against the read-side callers.
 */
static inline void f2fs_lock_all(struct f2fs_sb_info *sbi)
{
	down_write(&sbi->cp_rwsem);
}

f2fs_lock_all操作的是sbi->cp_rwsem，所有fs相关的操作，都需要先获得这个信号量，对node block的操作也要获得node_write信号量，如果这两个在此时没有被释放，则其它的路径无法进行相关的操作，这就实现了block的功能。

	/* this is the case of multiple fstrims without any changes */
	if (cpc->reason & CP_DISCARD) {
		if (!exist_trim_candidates(sbi, cpc)) {
			unblock_operations(sbi);
			goto out;
		}

		if (NM_I(sbi)->dirty_nat_cnt == 0 &&
				SIT_I(sbi)->dirty_sentries == 0 &&
				prefree_segments(sbi) == 0) {
			flush_sit_entries(sbi, cpc);
			clear_prefree_segments(sbi, cpc);
			unblock_operations(sbi);
			goto out;
		}
	}

回到f2fs write_checkpoint过程，block_operations之后，判断cpc->reason是否包含CP_DISCARD（是否执行trim操作），如果是的话，判断是否有trim candidates，如果没有，则调用unblock_operations，即把f2fs_lock_all以及node_write信号量释放，然后退出。如果有trim candidates, 则判断dirty_nat_cnt，dirty_sentries,prefree_segments是否都为0，如果都为0，则执行flush_sit_entries和clear_prefree_segments，然后释放信号量并退出，后面详细描述flush_sit_entries.

	/*
	 * update checkpoint pack index
	 * Increase the version number so that
	 * SIT entries and seg summaries are written at correct place
	 */
	ckpt_ver = cur_cp_version(ckpt);
	ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver);

check point version ++

	/* write cached NAT/SIT entries to NAT/SIT area */
	flush_nat_entries(sbi, cpc);

接下来是一个重要的函数，flush_nat_entries，将cache中所有dirty的nat entries写入f2fs的nat area（或者journal）中，我们看一下具体流程。

void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
	struct f2fs_journal *journal = curseg->journal;
	struct nat_entry_set *setvec[SETVEC_SIZE];
	struct nat_entry_set *set, *tmp;
	unsigned int found;
	nid_t set_idx = 0;
	LIST_HEAD(sets);

	if (!nm_i->dirty_nat_cnt)
		return;

	down_write(&nm_i->nat_tree_lock);

	/*
	 * if there are no enough space in journal to store dirty nat
	 * entries, remove all entries from journal and merge them
	 * into nat entry set.
	 */
	if (enabled_nat_bits(sbi, cpc) ||
		!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL))
		remove_nats_in_journal(sbi);

flush_nat_entries首先判断，如果journal中没有足够的space来存储dirty nat entries，则将journal中所有的entries删除并将他们merge到nat entry set中。

/*
 * Tell whether @size more entries of kind @type fit into @journal.
 *
 * For NAT_JOURNAL the bound is MAX_NAT_JENTRIES(journal); every other
 * @type value is checked against MAX_SIT_JENTRIES(journal).
 */
static inline bool __has_cursum_space(struct f2fs_journal *journal,
							int size, int type)
{
	bool fits;

	/* keep each comparison in its original form so operand types are untouched */
	if (type != NAT_JOURNAL)
		fits = size <= MAX_SIT_JENTRIES(journal);
	else
		fits = size <= MAX_NAT_JENTRIES(journal);

	return fits;
}

__has_cursum_space函数判断journal中剩余的空闲entry个数是否不小于dirty_nat_cnt（即size <= MAX_NAT_JENTRIES(journal)），如果空间不够（或者enabled_nat_bits条件成立）, 则调用remove_nats_in_journal，将journal中的所有nat entries删除。看下remove_nats_in_journal函数。

/*
 * Pull every NAT entry recorded in the journal of the CURSEG_HOT_DATA
 * current segment into the in-memory NAT cache and mark each one dirty,
 * then hand the negated number of visited entries to
 * update_nats_in_cursum().
 *
 * The whole walk runs with curseg->journal_rwsem held for writing.
 */
static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
{
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
	struct f2fs_journal *journal = curseg->journal;
	int i;

	down_write(&curseg->journal_rwsem);
	/* walk every nat entry currently stored in the journal */
	for (i = 0; i < nats_in_cursum(journal); i++) {
		struct nat_entry *ne;
		struct f2fs_nat_entry raw_ne;
		nid_t nid = le32_to_cpu(nid_in_journal(journal, i));

		/* copy the raw f2fs_nat_entry out of journal slot i */
		raw_ne = nat_in_journal(journal, i);
		/* check whether the nat cache already holds an entry for this nid */
		ne = __lookup_nat_cache(nm_i, nid);
		if (!ne) {	/* not cached yet: allocate a new nat entry */
			ne = __alloc_nat_entry(nid, true);
			/*
			 * Initialise the new entry from raw_ne.
			 * NOTE(review): presumably this also inserts it into
			 * the nat cache (nat_root / nat_entries) -- confirm in
			 * __init_nat_entry().
			 */
			__init_nat_entry(nm_i, ne, &raw_ne, true);
		}

		/*
		 * if a free nat in journal has not been used after last
		 * checkpoint, we should remove it from available nids,
		 * since later we will add it again.
		 */
		if (!get_nat_flag(ne, IS_DIRTY) &&
				le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) {
			spin_lock(&nm_i->nid_list_lock);
			nm_i->available_nids--;
			spin_unlock(&nm_i->nid_list_lock);
		}

		/* mark the cached entry dirty so a later flush picks it up */
		__set_nat_cache_dirty(nm_i, ne);
	}
	/*
	 * i now equals the number of journal entries visited; pass -i so the
	 * journal's nat count is reduced by that many (presumably to zero --
	 * confirm in update_nats_in_cursum()).
	 */
	update_nats_in_cursum(journal, -i);
	up_write(&curseg->journal_rwsem);
}

remove_nats_in_journal()进行删除journal中的nat entries操作，它遍历journal中的每一个nat entry, 对每一个nat entry执行__set_nat_cache_dirty(nm_i, ne)，把它标记为dirty并挂到对应的nat entry set上，最后通过update_nats_in_cursum(journal, -i)把journal中的nat entry计数减掉。下面看__set_nat_cache_dirty做了哪些事情。

/*
 * Mark @ne dirty and queue it on the nat_entry_set it belongs to.
 *
 * The set is keyed by NAT_BLOCK_OFFSET(nid) in nm_i->nat_set_root and is
 * allocated and inserted on first use.  The dirty counters (per nm_i and
 * per set) are bumped only when the entry goes from clean to dirty.
 * Finally the entry is moved to the tail of the set's entry_list, unless
 * its block address is NEW_ADDR, in which case it is only unlinked from
 * the list it is currently on.
 */
static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
						struct nat_entry *ne)
{
	nid_t set_no = NAT_BLOCK_OFFSET(ne->ni.nid);
	struct nat_entry_set *nes;

	/* find the set for this entry, creating it when it does not exist */
	nes = radix_tree_lookup(&nm_i->nat_set_root, set_no);
	if (!nes) {
		nes = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS);
		INIT_LIST_HEAD(&nes->entry_list);
		INIT_LIST_HEAD(&nes->set_list);
		nes->set = set_no;
		nes->entry_cnt = 0;
		f2fs_radix_tree_insert(&nm_i->nat_set_root, set_no, nes);
	}

	/* account for the entry only on the clean -> dirty transition */
	if (!get_nat_flag(ne, IS_DIRTY)) {
		nm_i->dirty_nat_cnt++;
		nes->entry_cnt++;
		set_nat_flag(ne, IS_DIRTY, true);
	}

	/* (re)position the entry: queue it on the set unless it is NEW_ADDR */
	if (nat_get_blkaddr(ne) != NEW_ADDR) {
		list_move_tail(&ne->list, &nes->entry_list);
		return;
	}
	list_del_init(&ne->list);
}

__set_nat_cache_dirty主要做的事情：在nat_set_root tree中查找是否包含相应的set，如果不包含，则新申请一个nat_entry_set，初始化并加入nat_set_root tree中。最后将此entry从原来的链表中删除，并移动到对应的nat_entry_set链表中。经过这个操作后，journal 中所有的nat entries都挂到了nat_set_root 树下的nat_entry_set中，并且属于同一个NAT block（即NAT_BLOCK_OFFSET(nid)相同）的nat entry，链接到同一个nat_entry_set中（这里面有一处，如果nat entry的 block address 地址为NEW_ADDR，则只是将其从原来的list中删除，说明此nat entry没有有效的磁盘存储空间，也就不需要进行后续的flush操作？）。

从journal中删除所有的nat entries后，所有的nat entry都移到了nat set中，接下来有一个排序的过程，按照每个nat set中包含的nat entry数量的多少进行排序，最后都存储到sets链表中。

接下来回到flush_nat_entries中，此函数最后的操作，就是遍历所有的entry set, 把每个entry set中的dirty nat entries flush, 写回磁盘中，具体看一下操作步骤。


static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
		struct nat_entry_set *set, struct cp_control *cpc)
{
	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
	struct f2fs_journal *journal = curseg->journal;
	nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK;
	bool to_journal = true;
	struct f2fs_nat_block *nat_blk;
	struct nat_entry *ne, *cur;
	struct page *page = NULL;

	/*
	 * there are two steps to flush nat entries:
	 * #1, flush nat entries to journal in current hot data summary block.
	 * #2, flush nat entries to nat page.
	 */
	if (enabled_nat_bits(sbi, cpc) ||
		!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL))
		to_journal = false;

注释中描述，flush nat entries有两个步骤:

1) 将nat entries flush 到当前的hot data summary block journal中

2) 将nat entries flush到nat page中。

判断journal中是否有足够的free space，如果有，to_journal=true, 否则，to_journal=false。

	if (to_journal) {
		down_write(&curseg->journal_rwsem);
	} else {
		page = get_next_nat_page(sbi, start_nid);
		nat_blk = page_address(page);
		f2fs_bug_on(sbi, !nat_blk);
	}

如果to_journal=true，则后面会将nat set entries写到journal中，所以此时获取journal_rwsem锁，如果to_journal=false，则需要得到nat cache中的空间，将nat set entries写入到nat cache page中。得到nat cache 中的Page是通过get_next_nat_page得到的，看一下这个函数：


/*
 * Prepare the NAT page that updated entries for @nid will be written into.
 *
 * Reads the NAT block currently used for @nid, copies its contents into the
 * page at the address returned by next_nat_addr(), marks that destination
 * page dirty, calls set_to_next_nat() for @nid and returns the destination
 * page to the caller.
 */
static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
{
	struct page *src_page;
	struct page *dst_page;
	pgoff_t src_off;
	pgoff_t dst_off;
	void *src_addr;
	void *dst_addr;
	struct f2fs_nm_info *nm_i = NM_I(sbi);
	/* address of the NAT block currently used for this nid */
	src_off = current_nat_addr(sbi, nid);
	/* address of the NAT block that will receive the update */
	dst_off = next_nat_addr(sbi, src_off);

	/* get current nat block page with lock */
	src_page = get_meta_page(sbi, src_off);
	/* get the page backing the destination NAT block */
	dst_page = grab_meta_page(sbi, dst_off);
	/* the current copy is expected to be clean at this point */
	f2fs_bug_on(sbi, PageDirty(src_page));

	src_addr = page_address(src_page);
	dst_addr = page_address(dst_page);
	/*
	 * Copy the whole current block into the destination page and mark the
	 * destination dirty, then drop the source page.
	 * NOTE(review): the second argument 1 presumably asks f2fs_put_page()
	 * to unlock the page as well -- confirm.
	 */
	memcpy(dst_addr, src_addr, PAGE_SIZE);
	set_page_dirty(dst_page);
	f2fs_put_page(src_page, 1);

	/*
	 * NOTE(review): presumably flips the NAT bitmap for this nid so that
	 * later lookups resolve to the destination copy -- confirm in
	 * set_to_next_nat().
	 */
	set_to_next_nat(nm_i, nid);

	return dst_page;
}

f2fs为了防止元数据丢失，SIT area及NAT area的数据都包含两份，从f2fs format过程可以看到，两份数据中，只有一份保存的是最新的数据，get_next_nat_page的目的就是得到另一个副本中相应的nat page，作为下一步写入的page，同时会更新nat bitmap。

/* flush dirty nats in nat entry set */
	list_for_each_entry_safe(ne, cur, &set->entry_list, list) {
		struct f2fs_nat_entry *raw_ne;
		nid_t nid = nat_get_nid(ne);
		int offset;

		f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR);

		if (to_journal) {
			offset = lookup_journal_in_cursum(journal,
							NAT_JOURNAL, nid, 1);
			f2fs_bug_on(sbi, offset < 0);
			raw_ne = &nat_in_journal(journal, offset);
			nid_in_journal(journal, offset) = cpu_to_le32(nid);
		} else {
			raw_ne = &nat_blk->entries[nid - start_nid];
		}
		raw_nat_from_node_info(raw_ne, &ne->ni);
		nat_reset_flag(ne);
		__clear_nat_cache_dirty(NM_I(sbi), set, ne);
		if (nat_get_blkaddr(ne) == NULL_ADDR) {
			add_free_nid(sbi, nid, false, true);
		} else {
			spin_lock(&NM_I(sbi)->nid_list_lock);
			update_free_nid_bitmap(sbi, nid, false, false);
			spin_unlock(&NM_I(sbi)->nid_list_lock);
		}
	}

上面这一段代码，如果to_journal=true，则将nat_entry内容写入到journal中，如果to_journal=false，则将nat_entry内容写入前面已得到的nat cache page中，并清除该entry的dirty标记、更新free nid相关信息。

	if (to_journal) {
		up_write(&curseg->journal_rwsem);
	} else {
		__update_nat_bits(sbi, start_nid, page);
		f2fs_put_page(page, 1);
	}

最后，如果to_journal=true，释放journal_rwsem，说明已写完；如果to_journal=false, 则先调用__update_nat_bits，再调用f2fs_put_page(page, 1)释放对该page的引用，由于此page已被标记为dirty，后续会被真正写回到磁盘。

到这里，f2fs flush_nat_entries流程就结束了，它的主要作用就是将nat_set_root中所有的nat_set中的entries执行flush写回操作。

东大坡居士

关注

3
点赞
踩
3

收藏

觉得还不错? 一键收藏
0
评论
f2fs write_checkpoint 过程分析

write_checkpoint主要负责把cache中dirty的数据写回到磁盘中，在gc, trim, discard或者recovery的时候都会调用到。int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc){ struct f2fs_checkpoint *ckpt = F2FS_CKPT(s...
复制链接

扫一扫