In the previous posts we walked through the system-call path of opening, reading and writing a file, but all of that stayed at or above the VFS layer. Today we take F2FS as the concrete file system, follow its write path, and pay particular attention to the role F2FS's hot/cold data separation plays in it. This is a long article.
Overview of the data write flow:
First we need to be clear that after write() is called, the written data does not land on disk right away; it is flushed to disk later by the familiar dirty-page writeback mechanism (see the companion post on writeback). So the call path inside write() never issues a bio by itself.
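As a quick user-space illustration of that point (plain POSIX calls, nothing F2FS-specific; the file path and contents are arbitrary examples): write() only copies data into the page cache, and it is writeback or an explicit fsync() that pushes it to the device.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/f2fs_demo.txt", O_CREAT | O_WRONLY | O_TRUNC, 0644);
	if (fd < 0) { perror("open"); return 1; }

	const char buf[] = "hello f2fs";
	/* After write() returns, the data normally lives only in dirty page cache. */
	if (write(fd, buf, strlen(buf)) < 0) { perror("write"); return 1; }

	/* fsync() blocks until the dirty pages (and required metadata) reach storage. */
	if (fsync(fd) < 0) { perror("fsync"); return 1; }

	close(fd);
	return 0;
}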
The VFS hands a write off to the concrete file system through the write_iter callback. Its main job is to pre-process the request before any data is written:
static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct inode *inode = file_inode(iocb->ki_filp);
const loff_t orig_pos = iocb->ki_pos;
const size_t orig_count = iov_iter_count(from);
loff_t target_size;
bool dio;
bool may_need_sync = true;
int preallocated;
ssize_t ret;
......
/* lock the inode */
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode)) {
ret = -EAGAIN;
goto out;
}
} else {
inode_lock(inode);
}
ret = f2fs_write_checks(iocb, from); // basic sanity/permission checks
if (ret <= 0)
goto out_unlock;
/* dio decides whether this write goes directly to disk or into the page cache; dio == true means a direct write */
/* Determine whether we will do a direct write or a buffered write. */
dio = f2fs_should_use_dio(inode, iocb, from);
/* target_size = current write position + write length, i.e. the expected file position after the write */
/* Possibly preallocate the blocks for the write. */
target_size = iocb->ki_pos + iov_iter_count(from);
/* If necessary, preallocate blocks for this write request. A positive return
 * means blocks were preallocated successfully, 0 means nothing needed to be
 * preallocated, and a negative value means a real error. If all of the inode's
 * requested blocks are allocated, the FI_PREALLOCATED_ALL flag is set.
 */
preallocated = f2fs_preallocate_blocks(iocb, from, dio);
if (preallocated < 0) {
ret = preallocated; // a real error occurred
} else {
......(trace-related code elided)
skip_write_trace:
/* Do the actual write. */
ret = dio ?
f2fs_dio_write_iter(iocb, from, &may_need_sync): // dio == true: write directly to disk
f2fs_buffered_write_iter(iocb, from); // dio == false: write into the page cache
if (trace_f2fs_datawrite_end_enabled())
trace_f2fs_datawrite_end(inode, orig_pos, ret);
}
/* Don't leave any preallocated blocks around past i_size. */
if (preallocated && i_size_read(inode) < target_size) {
f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
filemap_invalidate_lock(inode->i_mapping);
if (!f2fs_truncate(inode))
file_dont_truncate(inode);
filemap_invalidate_unlock(inode->i_mapping);
f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
} else {
file_dont_truncate(inode);
}
clear_inode_flag(inode, FI_PREALLOCATED_ALL);
out_unlock:
inode_unlock(inode);
out:
trace_f2fs_file_write_iter(inode, orig_pos, orig_count, ret);
if (ret > 0 && may_need_sync)
ret = generic_write_sync(iocb, ret);
return ret;
}
At the dio = f2fs_should_use_dio() call above, F2FS decides whether this write should go into the page cache or be submitted directly to disk; the two cases are then handled differently.
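For comparison, a direct write is requested from user space with O_DIRECT, which is one of the things f2fs_should_use_dio() looks at. A minimal sketch — the path, alignment and sizes are illustrative, and O_DIRECT generally requires the buffer, offset and length to be aligned to the device's logical block size:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/f2fs_dio_demo.bin", O_CREAT | O_WRONLY | O_DIRECT, 0644);
	if (fd < 0) { perror("open(O_DIRECT)"); return 1; }

	void *buf;
	/* O_DIRECT buffers usually need block-size alignment; 4096 is typical. */
	if (posix_memalign(&buf, 4096, 4096)) { perror("posix_memalign"); return 1; }
	memset(buf, 'A', 4096);

	/* This write bypasses the page cache and goes to the device directly. */
	if (write(fd, buf, 4096) != 4096)
		perror("write");

	free(buf);
	close(fd);
	return 0;
}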
Next, let's look at the f2fs_preallocate_blocks() call:
static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
bool dio)
{
struct inode *inode = file_inode(iocb->ki_filp); // get the inode
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
const loff_t pos = iocb->ki_pos;
const size_t count = iov_iter_count(iter);
struct f2fs_map_blocks map = {};
int flag;
int ret;
/* If it will be an out-of-place direct write, don't bother. */
if (dio && f2fs_lfs_mode(sbi))
return 0;
/*
* Don't preallocate holes aligned to DIO_SKIP_HOLES which turns into
* buffered IO, if DIO meets any holes.
*/
if (dio && i_size_read(inode) &&
(F2FS_BYTES_TO_BLK(pos) < F2FS_BLK_ALIGN(i_size_read(inode))))
return 0;
/* No-wait I/O can't allocate blocks. */
if (iocb->ki_flags & IOCB_NOWAIT)
return 0;
/* If it will be a short write, don't bother. */
if (fault_in_iov_iter_readable(iter, count))
return 0;
if (f2fs_has_inline_data(inode)) {
/* If the data will fit inline, don't bother. */
if (pos + count <= MAX_INLINE_DATA(inode))
return 0;
ret = f2fs_convert_inline_inode(inode);
if (ret)
return ret;
}
/* Do not preallocate blocks that will be written partially in 4KB. */
map.m_lblk = F2FS_BLK_ALIGN(pos); // first fully-covered block: pos rounded up to a block boundary
map.m_len = F2FS_BYTES_TO_BLK(pos + count); // block index at the end of the write
if (map.m_len > map.m_lblk)
map.m_len -= map.m_lblk; // end - start = number of blocks to preallocate
else
map.m_len = 0;
map.m_may_create = true;
if (dio) {
// for DIO, hot/cold classification happens here via the inode's write hint
map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint);
flag = F2FS_GET_BLOCK_PRE_DIO; // preallocation for direct I/O
} else {
map.m_seg_type = NO_CHECK_TYPE;
flag = F2FS_GET_BLOCK_PRE_AIO; // preallocation for buffered I/O (the flag name is historical)
}
ret = f2fs_map_blocks(inode, &map, 1, flag); // look up / create the logical-to-physical block mapping
/* -ENOSPC|-EDQUOT are fine to report the number of allocated blocks. */
if (ret < 0 && !((ret == -ENOSPC || ret == -EDQUOT) && map.m_len > 0))
return ret;
if (ret == 0)
set_inode_flag(inode, FI_PREALLOCATED_ALL);
return map.m_len;
}
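To make the m_lblk/m_len arithmetic above concrete, here is a tiny stand-alone sketch of the same calculation, assuming a 4 KiB block size; the helper names are mine, not the kernel macros:

#include <stdio.h>

#define BLK_SIZE 4096UL

/* bytes -> block index (round down), and a round-up variant, mirroring
 * F2FS_BYTES_TO_BLK()/F2FS_BLK_ALIGN() in spirit */
static unsigned long bytes_to_blk(unsigned long bytes) { return bytes / BLK_SIZE; }
static unsigned long blk_align(unsigned long bytes) { return (bytes + BLK_SIZE - 1) / BLK_SIZE; }

int main(void)
{
	unsigned long pos = 6000, count = 20000;   /* example write: offset 6000, 20000 bytes */
	unsigned long m_lblk = blk_align(pos);     /* first block fully covered by the write */
	unsigned long m_len = bytes_to_blk(pos + count);
	m_len = (m_len > m_lblk) ? m_len - m_lblk : 0; /* whole blocks worth preallocating */

	printf("preallocate %lu block(s) starting at block %lu\n", m_len, m_lblk);
	return 0;
}

Partially written 4 KB blocks at either end are deliberately left out, matching the "do not preallocate blocks that will be written partially" comment.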
--- Data structure f2fs_map_blocks:
struct f2fs_map_blocks {
struct block_device *m_bdev; /* for multi-device dio */
block_t m_pblk; /* physical address of the first block */
block_t m_lblk; /* logical (file) block index of the first block */
unsigned int m_len; /* number of blocks */
unsigned int m_flags; /* state flags of the mapping */
pgoff_t *m_next_pgofs; /* point to next possible non-hole pgofs */
pgoff_t *m_next_extent; /* point to next possible extent */
int m_seg_type; /* segment temperature */
bool m_may_create; /* indicate it is from write path */
bool m_multidev_dio; /* indicate it allows multi-device dio */
};
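The m_seg_type chosen for DIO above comes from the inode's i_write_hint, which an application can set with fcntl(F_SET_RW_HINT). A minimal sketch of setting such a hint from user space; note that how F2FS maps each RWH_WRITE_LIFE_* value onto a hot/warm/cold data log depends on the kernel version and mount options, so the mapping itself should be treated as version-specific:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#ifndef F_SET_RW_HINT
#define F_SET_RW_HINT 1036           /* F_LINUX_SPECIFIC_BASE + 12 */
#endif
#ifndef RWH_WRITE_LIFE_SHORT
#define RWH_WRITE_LIFE_SHORT 2       /* from include/uapi/linux/fcntl.h */
#endif

int main(void)
{
	int fd = open("/tmp/f2fs_hint_demo.bin", O_CREAT | O_WRONLY, 0644);
	if (fd < 0) { perror("open"); return 1; }

	/* Hint that data written to this file is expected to be short-lived;
	 * file systems such as F2FS may use this when choosing a log/segment. */
	uint64_t hint = RWH_WRITE_LIFE_SHORT;
	if (fcntl(fd, F_SET_RW_HINT, &hint) < 0)
		perror("fcntl(F_SET_RW_HINT)");

	close(fd);
	return 0;
}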
f2fs_preallocate_blocks() then calls f2fs_map_blocks(), which tries to find, or establish, the mapping from the logical address (the file offset) to the physical address (the block number), returning the result in the f2fs_map_blocks structure.
f2fs_map_blocks() is a fairly complex function. Roughly, it first reads out the physical address for the given logical address; if no physical address has been allocated yet, it initialises a new one for the upcoming disk write. Its core logic:
int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag)
{
unsigned int maxblocks = map->m_len;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int mode = create ? ALLOC_NODE : LOOKUP_NODE;
map->m_len = 0;
map->m_flags = 0;
pgofs = (pgoff_t)map->m_lblk; // starting file (page) offset of this mapping
end = pgofs + maxblocks; // end of the block range to map
next_dnode:
set_new_dnode(&dn, inode, NULL, NULL, 0); // init the dnode; a dnode resolves a logical address to a physical one
// find the f2fs_inode or direct_node covering pgofs and record the physical address info in dn
err = f2fs_get_dnode_of_data(&dn, pgofs, mode);
start_pgofs = pgofs;
prealloc = 0;
last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
// get the physical address from dn; ofs_in_node says which data slot of the current node it sits in,
// e.g. for f2fs_inode->i_addr[3], dn.ofs_in_node = 3
blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
...
if (__is_valid_data_blkaddr(blkaddr)) { // old data already exists at this offset
/* use out-of-place update for direct IO under LFS mode */
if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create) {
err = __allocate_data_block(&dn, map->m_seg_type); // allocate a data block according to m_seg_type
if (err)
goto sync_out;
blkaddr = dn.data_blkaddr;
set_inode_flag(inode, FI_APPEND_WRITE); // mark that data was written out of place (append-style)
}
} else { // no old data at this offset
if (create) {
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
goto sync_out;
}
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
if (blkaddr == NULL_ADDR) {
prealloc++; // count how many blocks need preallocation
last_ofs_in_node = dn.ofs_in_node;
}
} else {
WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO &&
flag != F2FS_GET_BLOCK_DIO);
err = __allocate_data_block(&dn,
map->m_seg_type); // for DIO, allocate the block according to m_seg_type
if (!err) {
if (flag == F2FS_GET_BLOCK_PRE_DIO)
file_need_truncate(inode);
set_inode_flag(inode, FI_APPEND_WRITE);
}
}
......
}
......
// move on to the next block
dn.ofs_in_node++;
pgofs++;
......
// we have reached the end of this node page or of the requested range
if (flag == F2FS_GET_BLOCK_PRE_AIO &&
(pgofs == end || dn.ofs_in_node == end_offset)) {
dn.ofs_in_node = ofs_in_node; // rewind to the first block of this batch
err = f2fs_reserve_new_blocks(&dn, prealloc); // reserve the counted blocks, setting their addresses to NEW_ADDR
map->m_len += dn.ofs_in_node - ofs_in_node;
dn.ofs_in_node = end_offset;
}
...
if (pgofs >= end)
goto sync_out; // everything has been handled, leave the function
else if (dn.ofs_in_node < end_offset)
goto next_block; // goto-style loop: one block per iteration until all requested blocks are done
...
sync_out:
...
out:
return err;
}
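Stripped of node-page handling, locking and error paths, the goto-based loop above walks the requested logical blocks and allocates (or reserves) any that are still holes. A toy model of just that control flow, using a plain array instead of node pages (illustration only, not kernel semantics):

#include <stdio.h>

#define NULL_ADDR 0

/* toy "node page": logical block -> physical block address */
static unsigned int addr[16];
static unsigned int next_free_blk = 100; /* pretend allocator cursor */

/* walk [lblk, lblk+len) and make sure every hole gets a block,
 * roughly what the PRE_DIO path does with __allocate_data_block() */
static void map_blocks(unsigned int lblk, unsigned int len)
{
	for (unsigned int b = lblk; b < lblk + len; b++) {
		if (addr[b] == NULL_ADDR)     /* hole: allocate a new block */
			addr[b] = next_free_blk++;
		/* blocks that already have an address are left alone */
	}
}

int main(void)
{
	addr[3] = 55; /* pretend block 3 already holds data */
	map_blocks(2, 4);
	for (unsigned int b = 2; b < 6; b++)
		printf("lblk %u -> pblk %u\n", b, addr[b]);
	return 0;
}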
Depending on whether the write is direct I/O, the handling path then differs: a buffered write is handled by f2fs_buffered_write_iter(), a direct write by f2fs_dio_write_iter().
Let's look at the buffered case first:
Buffered write:
static ssize_t f2fs_buffered_write_iter(struct kiocb *iocb,
struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
ssize_t ret;
if (iocb->ki_flags & IOCB_NOWAIT)
return -EOPNOTSUPP;
current->backing_dev_info = inode_to_bdi(inode); // used during writeback/flush
ret = generic_perform_write(file, from, iocb->ki_pos);
current->backing_dev_info = NULL;
if (ret > 0) {
iocb->ki_pos += ret;
f2fs_update_iostat(F2FS_I_SB(inode), APP_BUFFERED_IO, ret);
}
return ret;
}
--- The function that does the actual work is generic_perform_write:
ssize_t generic_perform_write(struct file *file,
struct iov_iter *i, loff_t pos)
{
struct address_space *mapping = file->f_mapping;
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0; // bytes written so far
unsigned int flags = 0;
do {
struct page *page;
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
void *fsdata;
offset = (pos & (PAGE_SIZE - 1)); // offset within the page
bytes = min_t(unsigned long, PAGE_SIZE - offset,
iov_iter_count(i)); // never write past the end of the current page
again:
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*/
if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
status = -EFAULT;
break;
}
if (fatal_signal_pending(current)) {
status = -EINTR;
break;
}
// prepare to write 'bytes' bytes of 'file' at 'pos' in address space 'mapping';
// the page to copy into is returned through the 'page' pointer
status = a_ops->write_begin(file, mapping, pos, bytes, flags,
&page, &fsdata); // write_begin notifies the concrete file system
if (unlikely(status < 0))
break;
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
// copy the data from user space into the page; this also advances the iov_iter
copied = copy_page_from_iter_atomic(page, offset, bytes, i);
flush_dcache_page(page); // keep the CPU data cache coherent with the new page contents
// copy done: write_end tells the file system the data is in the page, ready to be written back later
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
if (unlikely(status != copied)) { // normally write_end returns 'copied'
iov_iter_revert(i, copied - max(status, 0L));
if (unlikely(status < 0))
break;
}
cond_resched();
if (unlikely(status == 0)) {
/*
* A short copy made ->write_end() reject the
* thing entirely. Might be memory poisoning
* halfway through, might be a race with munmap,
* might be severe memory pressure.
*/
if (copied)
bytes = copied;
goto again;
}
pos += status;
written += status;
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i)); // 分批处理迭代器中的字节,直到处理完毕退出循环
return written ? written : status;
}
Overall this boils down to three steps: call write_begin, copy the data into the page, and call write_end.
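The offset/bytes arithmetic in that loop simply splits an arbitrary (pos, count) write into page-sized chunks. A stand-alone illustration of the same calculation, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned long pos = 5000, count = 10000; /* example: write 10000 bytes at offset 5000 */

	while (count) {
		unsigned long offset = pos & (PAGE_SIZE - 1); /* offset inside the page */
		unsigned long bytes = PAGE_SIZE - offset;     /* room left in this page */
		if (bytes > count)
			bytes = count;

		printf("page %lu: copy %lu bytes at in-page offset %lu\n",
		       pos / PAGE_SIZE, bytes, offset);

		pos += bytes;
		count -= bytes;
	}
	return 0;
}

F2FS's implementation of write_begin follows.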
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct page *page = NULL;
pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
bool need_balance = false, drop_atomic = false;
block_t blkaddr = NULL_ADDR;
int err = 0;
......
repeat:
/*
* Do not use grab_cache_page_write_begin() to avoid deadlock due to
* wait_for_stable_page. Will wait that below with our IO control.
*/
page = f2fs_pagecache_get_page(mapping, index,
FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); // step 1: find or create the page in the page cache
if (!page) {
err = -ENOMEM;
goto fail;
}
/* TODO: cluster can be compressed due to race with .writepage */
*pagep = page;
err = prepare_write_begin(sbi, page, pos, len,
&blkaddr, &need_balance); // look up the physical address for this page offset and store it in blkaddr
if (err)
goto fail;
if (need_balance && !IS_NOQUOTA(inode) &&
has_not_enough_free_secs(sbi, 0, 0)) {
unlock_page(page);
f2fs_balance_fs(sbi, true); // foreground GC entry point: if there are not enough free sections, start garbage collection
lock_page(page);
if (page->mapping != mapping) {
/* The page got truncated from under us */
f2fs_put_page(page, 1);
goto repeat;
}
}
// If PageWriteback is set, the current thread calls
// f2fs_wait_on_page_writeback() and sleeps until the writeback flag is
// cleared, re-checking in a loop before it can go on.
// This has to be done before starting to write any page.
f2fs_wait_on_page_writeback(page, DATA, false, true);
if (len == PAGE_SIZE || PageUptodate(page))
return 0; // the whole page will be overwritten, or it is already up to date
if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) &&
!f2fs_verity_in_progress(inode)) {
zero_user_segment(page, len, PAGE_SIZE);
return 0;
}
if (blkaddr == NEW_ADDR) { // the block was just allocated and has never been written
zero_user_segment(page, 0, PAGE_SIZE); // simply fill the page with zeros
SetPageUptodate(page); // mark the page up to date
} else { // otherwise this may overwrite existing data
if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
DATA_GENERIC_ENHANCE_READ)) { // not a valid block address: corruption
err = -EFSCORRUPTED;
goto fail;
}
err = f2fs_submit_page_read(inode, page, blkaddr, 0, true); // read the existing page contents in from disk
if (err)
goto fail;
lock_page(page);
if (unlikely(page->mapping != mapping)) {
f2fs_put_page(page, 1);
goto repeat;
}
if (unlikely(!PageUptodate(page))) {
err = -EIO;
goto fail;
}
}
return 0;
fail:
f2fs_put_page(page, 1);
f2fs_write_failed(inode, pos + len);
if (drop_atomic)
f2fs_drop_inmem_pages_all(sbi, false);
return err;
}
Once the data has been copied, write_end is called:
static int f2fs_write_end(struct file *file,
struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = page->mapping->host;
/*
* This should be come from len == PAGE_SIZE, and we expect copied
* should be PAGE_SIZE. Otherwise, we treat it with zero copied and
* let generic_perform_write() try to copy data again through copied=0.
*/
if (!PageUptodate(page)) { // is the page already up to date?
if (unlikely(copied != len))
copied = 0;
else
SetPageUptodate(page); // mark the page up to date
}
if (!copied)
goto unlock_out;
set_page_dirty(page); // mark the page dirty; it stays in inode->mapping until writeback picks it up
if (pos + copied > i_size_read(inode) &&
!f2fs_verity_in_progress(inode))
f2fs_i_size_write(inode, pos + copied); // update the file size
unlock_out:
f2fs_put_page(page, 1);
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); // record the time of this request (used for background flush/GC decisions)
return copied;
}
The set_page_dirty() call above eventually goes through the address_space operations' set_page_dirty hook, and for F2FS it is f2fs_set_data_page_dirty() that does the work:
static int f2fs_set_data_page_dirty(struct page *page)
{
struct inode *inode = page_file_mapping(page)->host;
trace_f2fs_set_page_dirty(page, DATA);
if (!PageUptodate(page))
SetPageUptodate(page); // by the time we dirty the page, it should already be up to date
if (PageSwapCache(page))
return __set_page_dirty_nobuffers(page);
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
if (!page_private_atomic(page)) {
f2fs_register_inmem_page(inode, page);
return 1;
}
/*
* Previously, this page has been registered, we just
* return here.
*/
return 0;
}
if (!PageDirty(page)) {
__set_page_dirty_nobuffers(page);
// -> __set_page_dirty_nobuffers -> __set_page_dirty: mark the page and the inode dirty
f2fs_update_dirty_page(inode, page);
// -> inode_inc_dirty_pages: F2FS_I(inode)->dirty_pages is incremented; this count is used later
return 1;
}
return 0;
}
When the write() call returns, the data is actually still in memory; it only gets written to disk when the dirty pages are written back. The writeback mechanism itself is covered in the companion post; here we focus on what F2FS actually does.
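Writeback is normally started by the kernel's flusher threads (dirty thresholds, timeouts) or by fsync(); for experimentation, user space can also nudge it explicitly, for example with the Linux-specific sync_file_range() call (the file path is just an example, and this gives no durability guarantee by itself):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/f2fs_wb_demo.bin", O_CREAT | O_WRONLY | O_TRUNC, 0644);
	if (fd < 0) { perror("open"); return 1; }

	char buf[4096];
	memset(buf, 'B', sizeof(buf));
	if (write(fd, buf, sizeof(buf)) < 0)   /* data is now dirty in the page cache */
		perror("write");

	/* Ask the kernel to start writing back this range now (it does not wait for
	 * completion and does not flush metadata). */
	if (sync_file_range(fd, 0, sizeof(buf), SYNC_FILE_RANGE_WRITE) < 0)
		perror("sync_file_range");

	close(fd);
	return 0;
}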
During writeback, the VFS eventually calls f2fs_write_data_pages() to handle data-page writeback:
static int f2fs_write_data_pages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
return __f2fs_write_data_pages(mapping, wbc,
F2FS_I(inode)->cp_task == current ?
FS_CP_DATA_IO : FS_DATA_IO);
}
---
static int __f2fs_write_data_pages(struct address_space *mapping,
struct writeback_control *wbc,
enum iostat_type io_type)
{
struct inode *inode = mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct blk_plug plug;
int ret;
bool locked = false;
/* deal with chardevs and other special file */
if (!mapping->a_ops->writepage)
return 0;
/* skip writing if there is no dirty page in this inode */
if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
return 0;
/* during POR, we don't need to trigger writepage at all. */
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto skip_write;
if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) &&
wbc->sync_mode == WB_SYNC_NONE &&
get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
f2fs_available_free_memory(sbi, DIRTY_DENTS))
goto skip_write;
/* skip writing in file defragment preparing stage */
if (is_inode_flag_set(inode, FI_SKIP_WRITES))
goto skip_write;
trace_f2fs_writepages(mapping->host, wbc, DATA);
/* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[DATA]);
else if (atomic_read(&sbi->wb_sync_req[DATA])) {
/* to avoid potential deadlock */
if (current->plug)
blk_finish_plug(current->plug);
goto skip_write;
}
if (__should_serialize_io(inode, wbc)) {
mutex_lock(&sbi->writepages);
locked = true;
}
blk_start_plug(&plug);
ret = f2fs_write_cache_pages(mapping, wbc, io_type); // pick up the pages that need writeback and write them out
blk_finish_plug(&plug);
if (locked)
mutex_unlock(&sbi->writepages);
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_dec(&sbi->wb_sync_req[DATA]);
/*
* if some pages were truncated, we cannot guarantee its mapping->host
* to detect pending bios.
*/
f2fs_remove_dirty_inode(inode); // after writing, drop the inode from the dirty list so it no longer needs writeback
return ret;
skip_write:
wbc->pages_skipped += get_dirty_pages(inode);
trace_f2fs_writepages(mapping->host, wbc, DATA);
return 0;
}
After all these checks, the real work is handed to f2fs_write_cache_pages().
--- Input parameter structure writeback_control:
/*
* A control structure which tells the writeback code what to do. These are
* always on the stack, and hence need no locking. They are always initialised
* in a manner such that unspecified fields are set to zero.
*/
struct writeback_control {
long nr_to_write; /* Write this many pages, and decrement
this for each page written */
long pages_skipped; /* Pages which were not written */
/*
* For a_ops->writepages(): if start or end are non-zero then this is
* a hint that the filesystem need only write out the pages inside that
* byterange. The byte at `end' is included in the writeout request.
*/
loff_t range_start;
loff_t range_end;
enum writeback_sync_modes sync_mode;
unsigned for_kupdate:1; /* A kupdate writeback */
unsigned for_background:1; /* A background writeback */
unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */
unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
......
};
---
/* In short: this function is essentially a copy, the main change being the hot/cold split.
 * This function was copied from write_cache_pages from mm/page-writeback.c.
* The major change is making write step of cold data page separately from
* warm/hot data page.
*/
static int f2fs_write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc,
enum iostat_type io_type)
{
int ret = 0;
int done = 0, retry = 0;
struct pagevec pvec;
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
struct bio *bio = NULL;
sector_t last_block;
int nr_pages;
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
int range_whole = 0;
xa_mark_t tag;
int nwritten = 0;
int submitted = 0;
int i;
pagevec_init(&pvec); // an array used to carry pages; it holds up to 15 pages
if (get_dirty_pages(mapping->host) <=
SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
set_inode_flag(mapping->host, FI_HOT_DATA);
// min_hot_blocks defaults to 16: a file with no more than that many dirty pages (one block = one page) is treated as hot
else
clear_inode_flag(mapping->host, FI_HOT_DATA);
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* prev offset */
end = -1; // cyclic flushing
} else {
index = wbc->range_start >> PAGE_SHIFT; // first page index to flush
end = wbc->range_end >> PAGE_SHIFT; // last page index to flush
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1; // the whole range 0~LLONG_MAX
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE; // which page tag to collect
else
tag = PAGECACHE_TAG_DIRTY;
retry:
retry = 0;
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
// re-tag PAGECACHE_TAG_DIRTY pages in [index, end] as PAGECACHE_TAG_TOWRITE; in sync mode they must all reach the disk
done_index = index;
while (!done && !retry && (index <= end)) {
// grab up to 15 pages carrying the given tag from the mapping into pvec
nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
tag);
if (nr_pages == 0)
break;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
bool need_readd;
readd:
need_readd = false;
/* give a priority to WB_SYNC threads */
if (atomic_read(&sbi->wb_sync_req[DATA]) &&
wbc->sync_mode == WB_SYNC_NONE) {
done = 1;
break;
}
done_index = page->index;
retry_write:
lock_page(page);
if (unlikely(page->mapping != mapping)) {
continue_unlock:
unlock_page(page);
continue;
}
if (!PageDirty(page)) {
/* someone wrote it for us; the page is no longer dirty */
goto continue_unlock;
}
if (PageWriteback(page)) { // the page is already under writeback
if (wbc->sync_mode != WB_SYNC_NONE)
f2fs_wait_on_page_writeback(page,
DATA, true, true);
else
goto continue_unlock;
}
if (!clear_page_dirty_for_io(page)) // clear the dirty flag; returns true if the page was dirty, so we do not enter the if body
goto continue_unlock;
// write the page; 'submitted' records whether it was submitted/merged
ret = f2fs_write_single_data_page(page, &submitted,
&bio, &last_block, wbc, io_type,
0, true); // the function that actually performs the write
if (ret == AOP_WRITEPAGE_ACTIVATE)
unlock_page(page);
nwritten += submitted; // submitted is 0 or 1; count submissions
wbc->nr_to_write -= submitted; // pages still to be submitted
if (unlikely(ret)) {
/*
* keep nr_to_write, since vfs uses this to
* get # of written pages.
*/
if (ret == AOP_WRITEPAGE_ACTIVATE) {
ret = 0;
goto next;
} else if (ret == -EAGAIN) {
ret = 0;
if (wbc->sync_mode == WB_SYNC_ALL) {
f2fs_io_schedule_timeout(
DEFAULT_IO_TIMEOUT);
goto retry_write;
}
goto next;
}
done_index = page->index + 1;
done = 1;
break;
}
if (wbc->nr_to_write <= 0 &&
wbc->sync_mode == WB_SYNC_NONE) {
/*
enum writeback_sync_modes {
WB_SYNC_NONE, // Don't wait on anything
WB_SYNC_ALL, // Wait on every mapping: must wait for all flushing to finish
};
with WB_SYNC_NONE we can stop as soon as the write quota is used up */
done = 1;
break;
}
next:
if (need_readd)
goto readd;
}
pagevec_release(&pvec); // release pvec
cond_resched();
}
if (retry) {
index = 0;
end = -1;
goto retry;
}
if (wbc->range_cyclic && !done)
done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
if (nwritten)
// a page ends up in a bio, which is then submitted to disk.
// F2FS does not submit a bio immediately; it waits until the bio has gathered enough pages.
// So if anything was merged here, force the pending bio out so it is synchronised with the disk.
f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host,
NULL, 0, DATA);
/* submit cached bio of IPU write */
if (bio)
f2fs_submit_merged_ipu_write(sbi, &bio, NULL);
return ret;
}
f2fs_write_single_data_page() chooses different write methods according to the type of file being written (directory file, inline file, or regular file).
int f2fs_write_single_data_page(struct page *page, int *submitted,
struct bio **bio,
sector_t *last_block,
struct writeback_control *wbc,
enum iostat_type io_type,
int compr_blocks,
bool allow_balance)
{
struct inode *inode = page->mapping->host;
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long)i_size)
>> PAGE_SHIFT;
loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT;
unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
// this structure records everything about the write; important
struct f2fs_io_info fio = {
.sbi = sbi,
.ino = inode->i_ino,
.type = DATA,
.op = REQ_OP_WRITE, // the request op is defined here
.op_flags = wbc_to_write_flags(wbc),
.old_blkaddr = NULL_ADDR, // records the old block address
.page = page, // the page about to be written
.encrypted_page = NULL,
.submitted = false,
.compr_blocks = compr_blocks,
.need_lock = LOCK_RETRY,
.io_type = io_type,
.io_wbc = wbc,
.bio = bio,
.last_block = last_block,
};
trace_f2fs_writepage(page, DATA);
/* we should bypass data pages to proceed the kworkder jobs */
if (unlikely(f2fs_cp_error(sbi))) {
mapping_set_error(page->mapping, -EIO);
/*
* don't drop any dirty dentry pages for keeping lastest
* directory structure.
*/
if (S_ISDIR(inode->i_mode))
goto redirty_out;
goto out;
}
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
if (page->index < end_index ||
f2fs_verity_in_progress(inode) ||
compr_blocks)
goto write;
/*
* If the offset is out-of-range of file size,
* this page does not have to be written to disk.
*/
offset = i_size & (PAGE_SIZE - 1);
if ((page->index >= end_index + 1) || !offset)
goto out;
zero_user_segment(page, offset, PAGE_SIZE);
write:
if (f2fs_is_drop_cache(inode))
goto out;
/* we should not write 0'th page having journal header */
if (f2fs_is_volatile_file(inode) && (!page->index ||
(!wbc->for_reclaim &&
f2fs_available_free_memory(sbi, BASE_CHECK))))
goto redirty_out;
/* Dentry/quota blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) {
// the path for directory files
/*
* We need to wait for node_write to avoid block allocation during
* checkpoint. This can only happen to quota writes which can cause
* the below discard race condition.
*/
if (IS_NOQUOTA(inode))
f2fs_down_read(&sbi->node_write);
fio.need_lock = LOCK_DONE;
err = f2fs_do_write_data_page(&fio);
if (IS_NOQUOTA(inode))
f2fs_up_read(&sbi->node_write);
goto done;
}
if (!wbc->for_reclaim)
need_balance_fs = true;
else if (has_not_enough_free_secs(sbi, 0, 0))
goto redirty_out;
else
set_inode_flag(inode, FI_HOT_DATA);
err = -EAGAIN;
if (f2fs_has_inline_data(inode)) {
// the path for inline files
err = f2fs_write_inline_data(inode, page);
if (!err)
goto out;
}
if (err == -EAGAIN) {
// the path for regular files
err = f2fs_do_write_data_page(&fio);
if (err == -EAGAIN) {
// retry with a stronger lock requirement
fio.need_lock = LOCK_REQ;
err = f2fs_do_write_data_page(&fio);
}
}
if (err) {
file_set_keep_isize(inode);
} else {
spin_lock(&F2FS_I(inode)->i_size_lock);
if (F2FS_I(inode)->last_disk_size < psize)
F2FS_I(inode)->last_disk_size = psize;
spin_unlock(&F2FS_I(inode)->i_size_lock);
}
done:
if (err && err != -ENOENT)
goto redirty_out;
out:
inode_dec_dirty_pages(inode); // each written page decrements the inode's dirty-page count by 1
if (err) {
ClearPageUptodate(page);
clear_page_private_gcing(page);
}
if (wbc->for_reclaim) {
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA);
clear_inode_flag(inode, FI_HOT_DATA);
f2fs_remove_dirty_inode(inode);
submitted = NULL;
}
unlock_page(page);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
!F2FS_I(inode)->cp_task && allow_balance)
f2fs_balance_fs(sbi, need_balance_fs); // may trigger GC
if (unlikely(f2fs_cp_error(sbi))) {
f2fs_submit_merged_write(sbi, DATA);
f2fs_submit_merged_ipu_write(sbi, bio, NULL);
submitted = NULL;
}
if (submitted)
*submitted = fio.submitted ? 1 : 0;
// if no f2fs_submit_merged_write-style function was called inside this function,
// the caller is expected to call one, which it learns through *submitted = 1
return 0;
redirty_out:
redirty_page_for_writepage(wbc, page);
/*
* pageout() in MM traslates EAGAIN, so calls handle_write_error()
* -> mapping_set_error() -> set_bit(AS_EIO, ...).
* file_write_and_wait_range() will see EIO error, which is critical
* to return value of fsync() followed by atomic_write failure to user.
*/
if (!err || wbc->for_reclaim)
return AOP_WRITEPAGE_ACTIVATE;
unlock_page(page);
return err;
}
We mainly follow the write path of a regular file. f2fs_do_write_data_page() decides, based on the system state, whether to update the data in place (in-place update, IPU) or out of place (out-of-place update, OPU). Normally the system only falls back to in-place updates when the disk is fairly full, to avoid triggering too much GC and hurting performance:
int f2fs_do_write_data_page(struct f2fs_io_info *fio)
{
struct page *page = fio->page;
struct inode *inode = page->mapping->host;
struct dnode_of_data dn;
struct extent_info ei = {0, };
struct node_info ni;
bool ipu_force = false;
int err = 0;
set_new_dnode(&dn, inode, NULL, NULL, 0);
if (need_inplace_update(fio) && // decide whether this fio should be updated in place or out of place
f2fs_lookup_extent_cache(inode, page->index, &ei)) {
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
/* f2fs_lookup_extent_cache() looks up the logical-to-physical mapping covering page->index and stores it in ei.
 * old block address = extent start block + offset within the file = ei.blk - ei.fofs + page->index
 * (presumably) ei comes from the extent rbtree and describes the block range and its offset from the file start;
 * this mapping was established earlier in f2fs_map_blocks
 */
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC_ENHANCE))
return -EFSCORRUPTED;
ipu_force = true;
fio->need_lock = LOCK_DONE;
goto got_it; // jump so the old address is not overwritten below
}
/* Deadlock due to between page->lock and f2fs_lock_op */
if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi))
return -EAGAIN;
// get the dnode for this file offset; it holds the physical address the offset points to
err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
if (err)
goto out;
// for out-of-place update, remember the old physical address the dnode points to
fio->old_blkaddr = dn.data_blkaddr;
// f2fs_file_write_iter already set the address to NEW_ADDR or a concrete block number, so NULL_ADDR here
// means the user truncated this data before it reached the disk and there is nothing left to write
/* This page is already truncated */
if (fio->old_blkaddr == NULL_ADDR) {
ClearPageUptodate(page);
clear_page_private_gcing(page);
goto out_writepage;
}
got_it:
if (__is_valid_data_blkaddr(fio->old_blkaddr) &&
!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
DATA_GENERIC_ENHANCE)) {
err = -EFSCORRUPTED;
goto out_writepage;
}
/*
* If current allocation needs SSR,
* it had better in-place writes for updated data.
*/
if (ipu_force ||
(__is_valid_data_blkaddr(fio->old_blkaddr) &&
need_inplace_update(fio))) { // in-place update path
err = f2fs_encrypt_one_page(fio); // if encryption is enabled, encrypt fio->page first
if (err)
goto out_writepage;
set_page_writeback(page); // mark this page as under writeback; this is the flag checked earlier by PageWriteback()
ClearPageError(page);
f2fs_put_dnode(&dn);
if (fio->need_lock == LOCK_REQ)
f2fs_unlock_op(fio->sbi);
err = f2fs_inplace_write_data(fio); // in-place update
if (err) {
if (fscrypt_inode_uses_fs_layer_crypto(inode))
fscrypt_finalize_bounce_page(&fio->encrypted_page);
if (PageWriteback(page))
end_page_writeback(page); // clear the writeback state
} else {
set_inode_flag(inode, FI_UPDATE_WRITE);
}
trace_f2fs_do_write_data_page(fio->page, IPU);
return err;
}
if (fio->need_lock == LOCK_RETRY) {
if (!f2fs_trylock_op(fio->sbi)) {
err = -EAGAIN;
goto out_writepage;
}
fio->need_lock = LOCK_REQ;
}
err = f2fs_get_node_info(fio->sbi, dn.nid, &ni, false);
if (err)
goto out_writepage;
fio->version = ni.version;
err = f2fs_encrypt_one_page(fio); // encrypt if needed
if (err)
goto out_writepage;
set_page_writeback(page); // mark as under writeback
ClearPageError(page);
if (fio->compr_blocks && fio->old_blkaddr == COMPRESS_ADDR)
f2fs_i_compr_blocks_update(inode, fio->compr_blocks - 1, false);
/* LFS mode write path */
f2fs_outplace_write_data(&dn, fio); // out-of-place update
trace_f2fs_do_write_data_page(page, OPU);
set_inode_flag(inode, FI_APPEND_WRITE);
if (page->index == 0)
set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
out_writepage:
f2fs_put_dnode(&dn);
out:
if (fio->need_lock == LOCK_REQ)
f2fs_unlock_op(fio->sbi);
return err;
}
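Which writes qualify for the in-place route is governed by the IPU policy, which F2FS exposes through sysfs as a bitmask of F2FS_IPU_* policy bits. A small sketch that just reads it back, assuming the usual /sys/fs/f2fs/<device>/ipu_policy attribute (the device name below is an example):

#include <stdio.h>

int main(void)
{
	/* adjust the device name to match your mounted F2FS partition */
	const char *path = "/sys/fs/f2fs/sda1/ipu_policy";
	FILE *f = fopen(path, "r");
	char buf[64];

	if (!f) { perror("fopen"); return 1; }
	if (fgets(buf, sizeof(buf), f))
		/* a bitmask of F2FS_IPU_* policies (FORCE, SSR, UTIL, ...) */
		printf("ipu_policy = %s", buf);
	fclose(f);
	return 0;
}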
So this function splits the path again between out-of-place and in-place update. Since out-of-place update is the dominant mode, we follow it. f2fs_outplace_write_data() first builds a summary (every block has a summary entry, kept in the SSA area, with the working copy held in the CURSEG), then allocates a new physical address, writes the data there, invalidates the old address, and finally updates the logical-to-physical mapping. Note that only the direct node needs to be updated, which is one of F2FS's defining properties:
SSA: https://blog.csdn.net/qq_38232437/article/details/108227856
void f2fs_outplace_write_data(struct dnode_of_data *dn,
struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = fio->sbi;
struct f2fs_summary sum;
f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
set_summary(&sum, dn->nid, dn->ofs_in_node, fio->version); // build the summary
do_write_page(&sum, fio); // allocate the new address, write the data, invalidate the old address
f2fs_update_data_blkaddr(dn, fio->new_blkaddr); // establish the new mapping
f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE);
}
--- Updating the mapping:
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
dn->data_blkaddr = blkaddr; // switch to the new physical address
f2fs_set_data_blkaddr(dn); // write the address into the node page
f2fs_update_extent_cache(dn); // update the mapping in the extent cache
}
---
void f2fs_set_data_blkaddr(struct dnode_of_data *dn)
{
f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); // first wait for any in-flight writeback of the node page
__set_data_blkaddr(dn);
if (set_page_dirty(dn->node_page))
dn->node_changed = true;
}
---
static void __set_data_blkaddr(struct dnode_of_data *dn)
{
struct f2fs_node *rn = F2FS_NODE(dn->node_page);
__le32 *addr_array;
int base = 0;
if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
base = get_extra_isize(dn->inode);
/* Get physical address of data block */
addr_array = blkaddr_in_node(rn);
addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); // store the new address
}
The real work is done by do_write_page():
static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
int type = __get_segment_type(fio); // pick the segment temperature
bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
if (keep_order)
f2fs_down_read(&fio->sbi->io_order_lock);
reallocate:
// allocate a new physical address for this page; it is returned in fio->new_blkaddr
f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio);
if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) {
invalidate_mapping_pages(META_MAPPING(fio->sbi),
fio->old_blkaddr, fio->old_blkaddr);
f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr);
}
/* writeout dirty page into bdev */
f2fs_submit_page_write(fio); // 提交读写bio到设备
if (fio->retry) {
fio->old_blkaddr = fio->new_blkaddr;
goto reallocate;
}
// for multi-device setups, update the device state
f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1);
if (keep_order)
f2fs_up_read(&fio->sbi->io_order_lock);
}
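__get_segment_type() above is where a data page is finally classified as hot, warm or cold. Very roughly, and simplified from __get_segment_type_6() (the exact conditions differ between kernel versions, so treat this as a sketch of the idea rather than the authoritative rules), the decision looks like this:

#include <stdio.h>
#include <stdbool.h>

enum temp { HOT_DATA, WARM_DATA, COLD_DATA };

/* toy per-file state standing in for inode/page flags */
struct file_state {
	bool cold_file;   /* file marked cold, e.g. via the cold-extension list */
	bool gc_page;     /* page being moved by garbage collection */
	bool hot_inode;   /* FI_HOT_DATA: few dirty pages, etc. */
};

/* rough shape of the data-page temperature decision (simplified) */
static enum temp get_data_temp(const struct file_state *st)
{
	if (st->cold_file || st->gc_page)
		return COLD_DATA;   /* cold files and GC'd data go to the cold log */
	if (st->hot_inode)
		return HOT_DATA;    /* hot inodes go to the hot log */
	return WARM_DATA;           /* everything else is warm (or follows the write hint) */
}

int main(void)
{
	struct file_state st = { .cold_file = false, .gc_page = false, .hot_inode = true };
	printf("temperature = %d (0=hot, 1=warm, 2=cold)\n", get_data_temp(&st));
	return 0;
}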
f2fs_allocate_data_block() first picks the CURSEG for the given temperature, allocates a new physical block from it, and then invalidates the old block.
void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
block_t old_blkaddr, block_t *new_blkaddr,
struct f2fs_summary *sum, int type,
struct f2fs_io_info *fio)
{
struct sit_info *sit_i = SIT_I(sbi);
struct curseg_info *curseg = CURSEG_I(sbi, type);
unsigned long long old_mtime;
bool from_gc = (type == CURSEG_ALL_DATA_ATGC);
struct seg_entry *se = NULL;
f2fs_down_read(&SM_I(sbi)->curseg_lock);
mutex_lock(&curseg->curseg_mutex);
down_write(&sit_i->sentry_lock);
if (from_gc) {
f2fs_bug_on(sbi, GET_SEGNO(sbi, old_blkaddr) == NULL_SEGNO);
se = get_seg_entry(sbi, GET_SEGNO(sbi, old_blkaddr));
sanity_check_seg_type(sbi, se->type);
f2fs_bug_on(sbi, IS_NODESEG(se->type));
}
*new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
// the new physical address is the next free block offset in the current segment
f2fs_bug_on(sbi, curseg->next_blkoff >= sbi->blocks_per_seg);
f2fs_wait_discard_bio(sbi, *new_blkaddr);
/*
* __add_sum_entry should be resided under the curseg_mutex
* because, this function updates a summary entry in the
* current summary block.
*/
__add_sum_entry(sbi, type, sum); // write the summary entry into the CURSEG's summary block
__refresh_next_blkoff(sbi, curseg); // advance to the next usable block offset
stat_inc_block_count(sbi, curseg);
if (from_gc) {
old_mtime = get_segment_mtime(sbi, old_blkaddr);
} else {
update_segment_mtime(sbi, old_blkaddr, 0); // update the old segment's modification time
old_mtime = 0;
}
update_segment_mtime(sbi, *new_blkaddr, old_mtime); // update the new segment's modification time
/*
* SIT information should be updated before segment allocation,
* since SSR needs latest valid block information.
*/
update_sit_entry(sbi, *new_blkaddr, 1); // find the SIT entry for the new address and mark the block valid
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
update_sit_entry(sbi, old_blkaddr, -1); // mark the old block invalid; it has been superseded and waits for GC
if (!__has_curseg_space(sbi, curseg)) { // if the current segment has no room left
if (from_gc)
get_atssr_segment(sbi, type, se->type,
AT_SSR, se->mtime);
else
sit_i->s_ops->allocate_segment(sbi, type, false); // allocate a fresh segment of this temperature
}
/*
* segment dirty status should be updated after segment allocation,
* so we just need to update status only one time after previous
* segment being closed.
*/
// mark the segments dirty, to be written back at checkpoint time
locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
up_write(&sit_i->sentry_lock);
if (page && IS_NODESEG(type)) {
fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
f2fs_inode_chksum_set(sbi, page);
}
if (fio) {
struct f2fs_bio_info *io;
if (F2FS_IO_ALIGNED(sbi))
fio->retry = false;
INIT_LIST_HEAD(&fio->list);
fio->in_list = true; // mark the fio as queued on a list
io = sbi->write_io[fio->type] + fio->temp;
spin_lock(&io->io_lock);
list_add_tail(&fio->list, &io->io_list); // queue the fio on the write_io list for this type/temperature
spin_unlock(&io->io_lock);
}
mutex_unlock(&curseg->curseg_mutex);
f2fs_up_read(&SM_I(sbi)->curseg_lock);
}
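The "next free block" above is simple arithmetic on the current segment: the segment's start address plus the in-segment write cursor. A stand-alone sketch of that idea, using F2FS's usual 512 blocks per segment (2 MiB segments with 4 KiB blocks); the names are illustrative, not the kernel macros:

#include <stdio.h>

#define BLKS_PER_SEG 512u   /* 2 MiB segment / 4 KiB blocks */

struct curseg {
	unsigned int segno;       /* which segment this log is currently writing */
	unsigned int next_blkoff; /* next free block offset inside that segment */
};

/* analogous in spirit to NEXT_FREE_BLKADDR(): segment start + in-segment cursor */
static unsigned int next_free_blkaddr(unsigned int main_start, const struct curseg *cs)
{
	return main_start + cs->segno * BLKS_PER_SEG + cs->next_blkoff;
}

int main(void)
{
	struct curseg warm_data = { .segno = 7, .next_blkoff = 42 };
	unsigned int main_area_start = 4096; /* pretend block address where the main area begins */

	printf("new block address = %u\n", next_free_blkaddr(main_area_start, &warm_data));
	return 0;
}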
The final step submits the I/O to disk; it submits all pending I/O of the same type and temperature together:
void f2fs_submit_page_write(struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = fio->sbi;
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; // fetch the io info for this type and temperature
struct page *bio_page;
f2fs_bug_on(sbi, is_read_io(fio->op));
f2fs_down_write(&io->io_rwsem);
next:
if (fio->in_list) { // set to true by the previous function
spin_lock(&io->io_lock);
if (list_empty(&io->io_list)) {
spin_unlock(&io->io_lock);
goto out;
}
fio = list_first_entry(&io->io_list,
struct f2fs_io_info, list);
list_del(&fio->list); // unlink it from the list, i.e. dequeue it
spin_unlock(&io->io_lock);
}
verify_fio_blkaddr(fio);
if (fio->encrypted_page) // pick bio_page according to the encryption/compression situation
bio_page = fio->encrypted_page;
else if (fio->compressed_page)
bio_page = fio->compressed_page;
else
bio_page = fio->page;
/* set submitted = true as a return value */
fio->submitted = true;
inc_page_count(sbi, WB_DATA_TYPE(bio_page));
if (io->bio &&
(!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio,
fio->new_blkaddr) ||
!f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host,
bio_page->index, fio)))
__submit_merged_bio(io); // cannot be merged: submit the existing bio first
alloc_new:
if (io->bio == NULL) {
if (F2FS_IO_ALIGNED(sbi) &&
(fio->type == DATA || fio->type == NODE) &&
fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
dec_page_count(sbi, WB_DATA_TYPE(bio_page));
fio->retry = true;
goto skip;
}
io->bio = __bio_alloc(fio, BIO_MAX_VECS); // io->bio is NULL after a submit
f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host,
bio_page->index, fio, GFP_NOIO);
io->fio = *fio;
}
if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) {
__submit_merged_bio(io);
goto alloc_new;
} // add the page to the bio; a return < PAGE_SIZE means the bio is full, so submit it and start a new one
if (fio->io_wbc)
wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
io->last_block_in_bio = fio->new_blkaddr; // the last physical block in the bio is the one we just queued
trace_f2fs_submit_page_write(fio->page, fio);
skip:
if (fio->in_list)
goto next; // keep going until the whole list has been processed
out:
if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
!f2fs_is_checkpoint_ready(sbi))
__submit_merged_bio(io);
f2fs_up_write(&io->io_rwsem);
}
--- __bio_alloc(), called above to build the bio:
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
{
struct f2fs_sb_info *sbi = fio->sbi;
struct bio *bio;
bio = bio_alloc_bioset(GFP_NOIO, npages, &f2fs_bioset);
f2fs_target_device(sbi, fio->new_blkaddr, bio);
if (is_read_io(fio->op)) {
bio->bi_end_io = f2fs_read_end_io;
bio->bi_private = NULL; // for reads, nothing is attached to bi_private
} else {
bio->bi_end_io = f2fs_write_end_io;
bio->bi_private = sbi; // for writes, bi_private carries the sbi
bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
fio->type, fio->temp); // bi_write_hint carries the temperature down to the block layer
}
iostat_alloc_and_bind_ctx(sbi, bio, NULL);
if (fio->io_wbc)
wbc_init_bio(fio->io_wbc, bio);
return bio;
}
As this shows, a bio is not submitted to disk until it has been filled with pages: F2FS improves write performance by growing bios. So when the user calls fsync() or the system runs writeback, functions such as f2fs_submit_merged_write_cond() (as described for f2fs_write_cache_pages) force these not-yet-full bios to be submitted so that all of the pages are guaranteed to reach the disk.
In the end submit_bio() hands the bio to the kernel block layer. The block-layer, SCSI and UFS handling has been analysed in earlier articles.
When the bio completes, a callback notifies F2FS, which mainly updates page state and reclaims pages:
static void f2fs_write_end_io(struct bio *bio)
{
struct f2fs_sb_info *sbi;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
iostat_update_and_unbind_ctx(bio, 1);
sbi = bio->bi_private; // stashed there by __bio_alloc()
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
f2fs_show_injection_info(sbi, FAULT_WRITE_IO);
bio->bi_status = BLK_STS_IOERR;
}
bio_for_each_segment_all(bvec, bio, iter_all) { // for each segment written by this bio
struct page *page = bvec->bv_page; // take out the page
enum count_type type = WB_DATA_TYPE(page);
if (page_private_dummy(page)) {
clear_page_private_dummy(page);
unlock_page(page);
mempool_free(page, sbi->write_io_dummy);
if (unlikely(bio->bi_status))
f2fs_stop_checkpoint(sbi, true);
continue;
} // reclaim the memory page
fscrypt_finalize_bounce_page(&page);
if (unlikely(bio->bi_status)) {
mapping_set_error(page->mapping, -EIO);
if (type == F2FS_WB_CP_DATA)
f2fs_stop_checkpoint(sbi, true);
}
f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) &&
page->index != nid_of_node(page));
dec_page_count(sbi, type);
if (f2fs_in_warm_node_list(sbi, page))
f2fs_del_fsync_node_entry(sbi, page);
clear_page_private_gcing(page);
end_page_writeback(page); // end the page's writeback state
}
if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
wq_has_sleeper(&sbi->cp_wait))
wake_up(&sbi->cp_wait);
bio_put(bio);
}
This completes the write flow for data. Metadata such as the SIT and NAT, however, has not been written yet; it only reaches the disk at checkpoint time, which is also part of how F2FS avoids the update avalanche. That flow can be analysed separately if needed.
"Because of f2fs's log-structured nature, every data block write would in principle require updating the corresponding direct node, NAT and SIT. For the NAT and SIT areas in particular, changing a few bytes of a single entry would mean rewriting a whole page, which would badly hurt file-system performance and SSD lifetime. So f2fs uses a journal mechanism to reduce NAT and SIT writes: the NAT/SIT changes are recorded in the f2fs_summary_block, and the dirty SIT and NAT areas are only written back when a checkpoint is taken. This is how f2fs avoids the wandering tree problem."
------------
References:
https://blog.csdn.net/u011649400/article/details/94589060
https://blog.51cto.com/xiamachao/2348759