f2fs开源的代码分为f2fs-tools与f2fs内核代码两部分。f2fs-tools的代码主要包括格式化工具make_f2fs、fsck工具等,可以在开源网站
https://git.kernel.org/pub/scm/linux/kernel/git/ 上下载。
格式化的源码在f2fs-tools源码的f2fs_format.c中,由f2fs_format_device()执行,此函数中依次填充sb、sit、nat、root_dir、cp等。系列1写了格式化过程往cp block中写了哪些数据,本次分析一下源码往super block里面写了些什么。个人以为,只有了解了磁盘格式,才能真正理解一个文件系统。
本次看一下f2fs_prepare_super_block()函数。
/*
 * Fill in the super block (`sb`) and the derived layout fields of the
 * configuration `c` from the device geometry already stored in `c`.
 *
 * In order, the function computes: block/segment/zone sizes, the
 * zone-aligned start of segment 0, per-device segment counts and block
 * ranges, the start addresses and segment counts of the checkpoint, SIT,
 * NAT and SSA areas, the start of the main area, the section/segment
 * counts of the main area, the reserved segment count, the reserved inode
 * numbers, the initial current-segment positions, and the version strings.
 *
 * `sb`, `c` and `quotatype_bits` are not declared inside this function;
 * presumably they are file-scope or global objects -- confirm.
 *
 * Returns 0 on success and -1 when a start-sector or segment0 alignment
 * check fails in zoned mode, the device lacks enough random-write zones,
 * the device is too small, or there are too few zones (total_zones <= 6).
 */
static int f2fs_prepare_super_block(void)
{
u_int32_t blk_size_bytes;
u_int32_t log_sectorsize, log_sectors_per_block;
u_int32_t log_blocksize, log_blks_per_seg;
u_int32_t segment_size_bytes, zone_size_bytes;
u_int32_t sit_segments, nat_segments;
u_int32_t blocks_for_sit, blocks_for_nat, blocks_for_ssa;
u_int32_t total_valid_blks_available;
u_int64_t zone_align_start_offset, diff;
u_int64_t total_meta_zones, total_meta_segments;
u_int32_t sit_bitmap_size, max_sit_bitmap_size;
u_int32_t max_nat_bitmap_size, max_nat_segments;
u_int32_t total_zones;
enum quota_type qtype;
int i;
// Super block magic number and format version.
set_sb(magic, F2FS_SUPER_MAGIC);
set_sb(major_ver, F2FS_MAJOR_VERSION);
set_sb(minor_ver, F2FS_MINOR_VERSION);
log_sectorsize = log_base_2(c.sector_size);// log2 of the sector size (original note: 512-byte sectors -- depends on c.sector_size)
log_sectors_per_block = log_base_2(c.sectors_per_blk);// log2 of sectors per block
log_blocksize = log_sectorsize + log_sectors_per_block;// log2 of the block size in bytes
log_blks_per_seg = log_base_2(c.blks_per_seg);// log2 of blocks per segment
set_sb(log_sectorsize, log_sectorsize);
set_sb(log_sectors_per_block, log_sectors_per_block);
set_sb(log_blocksize, log_blocksize);
set_sb(log_blocks_per_seg, log_blks_per_seg);
set_sb(segs_per_sec, c.segs_per_sec);
set_sb(secs_per_zone, c.secs_per_zone);
blk_size_bytes = 1 << log_blocksize;// block size in bytes (original note: 4096 -- TODO confirm for the configuration in use)
segment_size_bytes = blk_size_bytes * c.blks_per_seg;// segment size in bytes
// Zone size in bytes. Original note: secs_per_zone and segs_per_sec are
// both 1 (presumably the defaults -- confirm).
zone_size_bytes =
blk_size_bytes * c.secs_per_zone *
c.segs_per_sec * c.blks_per_seg;
set_sb(checksum_offset, 0);
set_sb(block_count, c.total_sectors >> log_sectors_per_block);// total number of blocks
// Zone alignment: round (device start offset + 2 * F2FS_BLKSIZE) up to a
// multiple of zone_size_bytes, then make it relative to the device start.
// Original note: the super block sits in front of this offset and
// everything after it is segment aligned (by default one segment equals
// one zone -- TODO confirm).
zone_align_start_offset =
(c.start_sector * c.sector_size +
2 * F2FS_BLKSIZE + zone_size_bytes - 1) /
zone_size_bytes * zone_size_bytes -
c.start_sector * c.sector_size;
// A start sector that is not a multiple of sectors_per_blk is only
// reported for regular devices but is fatal in zoned mode.
if (c.start_sector % c.sectors_per_blk) {
MSG(1, "\t%s: Align start sector number to the page unit\n",
c.zoned_mode ? "FAIL" : "WARN");
MSG(1, "\ti.e., start sector: %d, ofs:%d (sects/page: %d)\n",
c.start_sector,
c.start_sector % c.sectors_per_blk,
c.sectors_per_blk);
if (c.zoned_mode)
return -1;
}
// segment0_blkaddr is the zone-aligned start offset expressed in blocks.
set_sb(segment0_blkaddr, zone_align_start_offset / blk_size_bytes);
sb->cp_blkaddr = sb->segment0_blkaddr;// the checkpoint area starts at segment0_blkaddr
MSG(0, "Info: zone aligned segment0 blkaddr: %u\n",
get_sb(segment0_blkaddr));
// In zoned mode the absolute block address of segment 0 must be a
// multiple of c.zone_blocks.
if (c.zoned_mode && (get_sb(segment0_blkaddr) + c.start_sector /
c.sectors_per_blk) % c.zone_blocks) {
MSG(1, "\tError: Unaligned segment0 block address %u\n",
get_sb(segment0_blkaddr));
return -1;
}
// Per-device segment count and block address range. Device 0 loses the
// bytes in front of zone_align_start_offset; each later device starts
// right after the previous device's end_blkaddr.
for (i = 0; i < c.ndevs; i++) {
if (i == 0) {
c.devices[i].total_segments =
(c.devices[i].total_sectors *
c.sector_size - zone_align_start_offset) /
segment_size_bytes;
c.devices[i].start_blkaddr = 0;
c.devices[i].end_blkaddr = c.devices[i].total_segments *
c.blks_per_seg - 1 +
sb->segment0_blkaddr;
} else {
c.devices[i].total_segments =
c.devices[i].total_sectors /
(c.sectors_per_blk * c.blks_per_seg);
c.devices[i].start_blkaddr =
c.devices[i - 1].end_blkaddr + 1;
c.devices[i].end_blkaddr = c.devices[i].start_blkaddr +
c.devices[i].total_segments *
c.blks_per_seg - 1;
}
// The device table in the super block is only filled in when there is
// more than one device.
if (c.ndevs > 1) {
memcpy(sb->devs[i].path, c.devices[i].path, MAX_PATH_LEN);
sb->devs[i].total_segments =
cpu_to_le32(c.devices[i].total_segments);
}
c.total_segments += c.devices[i].total_segments;
}
set_sb(segment_count, (c.total_segments / c.segs_per_zone *
c.segs_per_zone));// total segment count, rounded down to a multiple of segs_per_zone
set_sb(segment_count_ckpt, F2FS_NUMBER_OF_CHECKPOINT_PACK);// checkpoint segment count = number of checkpoint packs
set_sb(sit_blkaddr, get_sb(segment0_blkaddr) +
get_sb(segment_count_ckpt) * c.blks_per_seg);// the SIT area follows the checkpoint segments
blocks_for_sit = SIZE_ALIGN(get_sb(segment_count), SIT_ENTRY_PER_BLOCK);
sit_segments = SEG_ALIGN(blocks_for_sit);
set_sb(segment_count_sit, sit_segments * 2);// doubled (original note: there are two SIT areas)
set_sb(nat_blkaddr, get_sb(sit_blkaddr) + get_sb(segment_count_sit) *
c.blks_per_seg);// the NAT area follows the SIT area
total_valid_blks_available = (get_sb(segment_count) -
(get_sb(segment_count_ckpt) +
get_sb(segment_count_sit))) * c.blks_per_seg;
blocks_for_nat = SIZE_ALIGN(total_valid_blks_available,
NAT_ENTRY_PER_BLOCK);// number of blocks the NAT area needs
if (c.large_nat_bitmap) {
nat_segments = SEG_ALIGN(blocks_for_nat) *
DEFAULT_NAT_ENTRY_RATIO / 100;
set_sb(segment_count_nat, nat_segments ? nat_segments : 1);
max_nat_bitmap_size = (get_sb(segment_count_nat) <<
log_blks_per_seg) / 8;
set_sb(segment_count_nat, get_sb(segment_count_nat) * 2);
} else {// original note: this walkthrough assumes large_nat_bitmap is not set
set_sb(segment_count_nat, SEG_ALIGN(blocks_for_nat));
max_nat_bitmap_size = 0;
}
/*
* The number of node segments should not be exceeded a "Threshold".
* This number resizes NAT bitmap area in a CP page.
* So the threshold is determined not to overflow one CP page
*/
sit_bitmap_size = ((get_sb(segment_count_sit) / 2) <<
log_blks_per_seg) / 8;// bitmap bytes: one bit per block of half the SIT segments
if (sit_bitmap_size > MAX_SIT_BITMAP_SIZE)
max_sit_bitmap_size = MAX_SIT_BITMAP_SIZE;
else
max_sit_bitmap_size = sit_bitmap_size;
if (c.large_nat_bitmap) {
/* use cp_payload if free space of f2fs_checkpoint is not enough */
if (max_sit_bitmap_size + max_nat_bitmap_size >
MAX_BITMAP_SIZE_IN_CKPT) {
// NOTE(review): this u_int32_t diff shadows the function-scope
// u_int64_t diff declared at the top of the function.
u_int32_t diff = max_sit_bitmap_size +
max_nat_bitmap_size -
MAX_BITMAP_SIZE_IN_CKPT;
set_sb(cp_payload, F2FS_BLK_ALIGN(diff));
} else {
set_sb(cp_payload, 0);
}
} else {
/*
* It should be reserved minimum 1 segment for nat.
* When sit is too large, we should expand cp area.
* It requires more pages for cp.
*/
// Original note (assumes 2 MiB segments of 4 KiB blocks): one segment
// is 2*1024*1024/4096 = 512 blocks and 512/8 = 64, so one segment needs
// 64 bytes of bitmap. MAX_SIT_BITMAP_SIZE_IN_CKPT is therefore the most
// SIT bitmap space available inside one checkpoint block.
if (max_sit_bitmap_size > MAX_SIT_BITMAP_SIZE_IN_CKPT) {
// The SIT bitmap is larger than the limit inside the checkpoint:
// the NAT bitmap size becomes
// CHECKSUM_OFFSET - sizeof(struct f2fs_checkpoint) + 1.
max_nat_bitmap_size = CHECKSUM_OFFSET -
sizeof(struct f2fs_checkpoint) + 1;
// Extra checkpoint blocks (cp_payload) sized from the SIT bitmap.
set_sb(cp_payload, F2FS_BLK_ALIGN(max_sit_bitmap_size));
} else {
// Otherwise the NAT bitmap gets what is left after the SIT bitmap
// (original note: it directly follows the SIT bitmap in the checkpoint).
max_nat_bitmap_size =
CHECKSUM_OFFSET - sizeof(struct f2fs_checkpoint) + 1
- max_sit_bitmap_size;
set_sb(cp_payload, 0);// no extra checkpoint blocks are needed for the SIT bitmap
}
// Number of segments the largest NAT bitmap can describe.
max_nat_segments = (max_nat_bitmap_size * 8) >> log_blks_per_seg;
// Clamp the NAT segment count when the bitmap cannot describe as many
// segments as were computed above.
if (get_sb(segment_count_nat) > max_nat_segments)
set_sb(segment_count_nat, max_nat_segments);
// Doubled (original note: there are two NAT areas).
set_sb(segment_count_nat, get_sb(segment_count_nat) * 2);
}
// The SSA area follows the NAT area.
set_sb(ssa_blkaddr, get_sb(nat_blkaddr) + get_sb(segment_count_nat) *
c.blks_per_seg);
// Blocks left once the checkpoint, SIT and NAT segments are subtracted.
total_valid_blks_available = (get_sb(segment_count) -
(get_sb(segment_count_ckpt) +
get_sb(segment_count_sit) +
get_sb(segment_count_nat))) *
c.blks_per_seg;
// SSA blocks: one per remaining segment plus one, then converted to
// segments with SEG_ALIGN.
blocks_for_ssa = total_valid_blks_available /
c.blks_per_seg + 1;
set_sb(segment_count_ssa, SEG_ALIGN(blocks_for_ssa));
// Total segments used by the checkpoint, SIT, NAT and SSA areas.
total_meta_segments = get_sb(segment_count_ckpt) +
get_sb(segment_count_sit) +
get_sb(segment_count_nat) +
get_sb(segment_count_ssa);
// Pad the SSA area so that the metadata segment total becomes a multiple
// of segs_per_zone. NOTE(review): total_meta_segments itself is not
// recomputed after the padding; the ZONE_ALIGN below presumably rounds
// up to whole zones -- confirm.
diff = total_meta_segments % (c.segs_per_zone);
if (diff)
set_sb(segment_count_ssa, get_sb(segment_count_ssa) +
(c.segs_per_zone - diff));
// Number of zones taken by the metadata areas.
total_meta_zones = ZONE_ALIGN(total_meta_segments *
c.blks_per_seg);
// The main area starts right after the metadata zones.
set_sb(main_blkaddr, get_sb(segment0_blkaddr) + total_meta_zones *
c.segs_per_zone * c.blks_per_seg);
if (c.zoned_mode) {
/*
* Make sure there is enough randomly writeable
* space at the beginning of the disk.
*/
unsigned long main_blkzone = get_sb(main_blkaddr) / c.zone_blocks;
if (c.devices[0].zoned_model == F2FS_ZONED_HM &&
c.devices[0].nr_rnd_zones < main_blkzone) {
MSG(0, "\tError: Device does not have enough random "
"write zones for F2FS volume (%lu needed)\n",
main_blkzone);
return -1;
}
}
// Zones left for the main area (original note: by default each zone holds
// a single segment -- TODO confirm).
total_zones = get_sb(segment_count) / (c.segs_per_zone) -
total_meta_zones;
// Original note: by default each zone holds a single section.
set_sb(section_count, total_zones * c.secs_per_zone);
// Original note: by default each section holds a single segment.
set_sb(segment_count_main, get_sb(section_count) * c.segs_per_sec);
/* Let's determine the best reserved and overprovisioned space */
if (c.overprovision == 0)
c.overprovision = get_best_overprovision(sb);
// NOTE(review): c.reserved_segments is compared here before it is
// assigned a few lines below, so this check uses whatever value it had
// on entry -- confirm this ordering is intended.
if (c.overprovision == 0 || c.total_segments < F2FS_MIN_SEGMENTS ||
(c.devices[0].total_sectors *
c.sector_size < zone_align_start_offset) ||
(get_sb(segment_count_main) - 2) < c.reserved_segments) {
MSG(0, "\tError: Device size is not sufficient for F2FS volume\n");
return -1;
}
// Reserved segment count derived from the overprovision ratio (original
// note: these are kept for GC and similar uses).
c.reserved_segments =
(2 * (100 / c.overprovision + 1) + 6)
* c.segs_per_sec;
uuid_generate(sb->uuid);
/* precompute checksum seed for metadata */
if (c.feature & cpu_to_le32(F2FS_FEATURE_INODE_CHKSUM))
c.chksum_seed = f2fs_cal_crc32(~0, sb->uuid, sizeof(sb->uuid));
utf8_to_utf16(sb->volume_name, (const char *)c.vol_label,
MAX_VOLUME_NAME, strlen(c.vol_label));
set_sb(node_ino, 1);// node inode number is 1
set_sb(meta_ino, 2);// meta inode number is 2
set_sb(root_ino, 3);// root inode number is 3
c.next_free_nid = 4;// the next free nid is 4
// Quota handling (original note: skipped in this walkthrough). Each
// quota type whose bit is set in quotatype_bits takes the next free nid.
// NOTE(review): quotatype_bits is not declared in this function;
// presumably it is a file-scope variable -- confirm.
if (c.feature & cpu_to_le32(F2FS_FEATURE_QUOTA_INO)) {
quotatype_bits = QUOTA_USR_BIT | QUOTA_GRP_BIT;
if (c.feature & cpu_to_le32(F2FS_FEATURE_PRJQUOTA))
quotatype_bits |= QUOTA_PRJ_BIT;
}
for (qtype = 0; qtype < F2FS_MAX_QUOTAS; qtype++) {
if (!((1 << qtype) & quotatype_bits))
continue;
sb->qf_ino[qtype] = cpu_to_le32(c.next_free_nid++);
MSG(0, "Info: add quota type = %u => %u\n",
qtype, c.next_free_nid - 1);
}
// When the LOST_FOUND feature bit is set, c.lpf_ino takes the next free
// nid (original note: feature not examined in this walkthrough).
if (c.feature & cpu_to_le32(F2FS_FEATURE_LOST_FOUND))
c.lpf_ino = c.next_free_nid++;
if (total_zones <= 6) {
MSG(1, "\tError: %d zones: Need more zones "
"by shrinking zone size\n", total_zones);
return -1;
}
// Initial current-segment positions. With c.heap set, the hot node
// segment is last_section(last_zone(total_zones)) (original note: the
// last segment) and the other node logs step backwards by zone;
// otherwise the hot node segment is 0 and the logs step forwards.
if (c.heap) {
c.cur_seg[CURSEG_HOT_NODE] =
last_section(last_zone(total_zones));
c.cur_seg[CURSEG_WARM_NODE] = prev_zone(CURSEG_HOT_NODE);
c.cur_seg[CURSEG_COLD_NODE] = prev_zone(CURSEG_WARM_NODE);
c.cur_seg[CURSEG_HOT_DATA] = prev_zone(CURSEG_COLD_NODE);
c.cur_seg[CURSEG_COLD_DATA] = 0;
c.cur_seg[CURSEG_WARM_DATA] = next_zone(CURSEG_COLD_DATA);
} else {
c.cur_seg[CURSEG_HOT_NODE] = 0;
c.cur_seg[CURSEG_WARM_NODE] = next_zone(CURSEG_HOT_NODE);
c.cur_seg[CURSEG_COLD_NODE] = next_zone(CURSEG_WARM_NODE);
c.cur_seg[CURSEG_HOT_DATA] = next_zone(CURSEG_COLD_NODE);
c.cur_seg[CURSEG_COLD_DATA] =
max(last_zone((total_zones >> 2)),
next_zone(CURSEG_COLD_NODE));
c.cur_seg[CURSEG_WARM_DATA] =
max(last_zone((total_zones >> 1)),
next_zone(CURSEG_COLD_DATA));
}
/* if there is redundancy, reassign it */
verify_cur_segs();
cure_extension_list();
/* get kernel version */
if (c.kd >= 0) {
dev_read_version(c.version, 0, VERSION_LEN);
get_kernel_version(c.version);
MSG(0, "Info: format version with\n \"%s\"\n", c.version);
} else {
get_kernel_uname_version(c.version);
}
// The same version string is stored as both the current and the initial
// version in the super block.
memcpy(sb->version, c.version, VERSION_LEN);
memcpy(sb->init_version, c.version, VERSION_LEN);
sb->feature = c.feature;
return 0;
}