linux文件系统——ubifs之ubi子系统初始化(2)
文章目录
前言
概述
本文档主要介绍Linux UBI子系统的使用方法、attach、磨损均衡(wear-leveling)等。
修订记录
日期 | 作者 | 版本 | 修改说明 |
---|---|---|---|
2023.10.10 | 枫潇潇 | V1.0.0 | 初始版本 |
kernel配置ubi
Device Drivers --->
<*> Memory Technology Device (MTD) support --->
<*> Enable UBI - Unsorted block images --->
--- Enable UBI - Unsorted block images
(4096) UBI wear-leveling threshold
(20) Maximum expected bad eraseblock count per 1024 eraseblocks
[] UBI Fastmap (Experimental feature)
< > MTD devices emulation driver (gluebi)
[*] Read-only block devices on top of UBI volumes
配置项 | 描述 |
---|---|
CONFIG_MTD_UBI_WL_THRESHOLD | 磨损平衡阈值,即最大与最小擦除计数值的差值,默认值:4096 |
CONFIG_MTD_UBI_BEB_LIMIT | 指定UBI在MTD设备上期望的最大坏物理擦除块数(每1024擦除块),默认值:20 |
CONFIG_MTD_UBI_GLUEBI | 在 UBI 卷的基础上模拟 MTD 设备。除非您使用传统的软件,否则请不要启用此选项。 |
CONFIG_MTD_UBI_BLOCK | 启用只读UBI块设备支持。 |
UBI Attach
UBI attach 指 UBI 层与 MTD 设备进行绑定。ubi 的 attach 操作既可以通过用户空间的 ubiattach 工具进行,也可以由 bootloader 通过启动参数传递的方式进行。此过程只需要在上电初始化时执行一次即可。
ubi_init()
|--->open_mtd_device()
|--->ubi_attach_mtd_dev()
| |--->io_init()
| |--->ubi_attach()
| | |--->scan_all()
| | | |--->scan_peb()
| | |--->ubi_read_volume_table()
| | | |--->create_empty_lvol()
| | | |--->process_lvol()
| | | |--->init_volumes()
| | |--->ubi_wl_init()
| | |--->ubi_eba_init()
| |--->autoresize()
| |--->uif_init()
| |--->ubi_debugfs_init_dev()
|--->ubiblock_init()
attach 参数解析
/*
 * Register the "mtd" module parameter: every occurrence is passed to
 * ubi_mtd_param_parse() as the set handler; no get handler is provided.
 * The description below documents the accepted format
 * (<name|num|path>[,<vid_hdr_offs>[,max_beb_per1024[,ubi_num]]]).
 */
module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 0400);
MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: mtd=<name|num|path>[,<vid_hdr_offs>[,max_beb_per1024[,ubi_num]]].\n"
"Multiple \"mtd\" parameters may be specified.\n"
"MTD devices may be specified by their number, name, or path to the MTD character device node.\n"
"Optional \"vid_hdr_offs\" parameter specifies UBI VID header position to be used by UBI. (default value if 0)\n"
"Optional \"max_beb_per1024\" parameter specifies the maximum expected bad eraseblock per 1024 eraseblocks. (default value ("
__stringify(CONFIG_MTD_UBI_BEB_LIMIT) ") if 0)\n"
"Optional \"ubi_num\" parameter specifies UBI device number which have to be assigned to the newly created UBI device (assigned automatically by default)\n"
"\n"
"Example 1: mtd=/dev/mtd0 - attach MTD device /dev/mtd0.\n"
"Example 2: mtd=content,1984 mtd=4 - attach MTD device with name \"content\" using VID header offset 1984, and MTD device number 4 with default VID header offset.\n"
"Example 3: mtd=/dev/mtd1,0,25 - attach MTD device /dev/mtd1 using default VID header offset and reserve 25*nand_size_in_blocks/1024 erase blocks for bad block handling.\n"
"Example 4: mtd=/dev/mtd1,0,0,5 - attach MTD device /dev/mtd1 to UBI 5 and using default values for the other fields.\n"
"\t(e.g. if the NAND *chipset* has 4096 PEB, 100 will be reserved for this UBI device).");
示例如下:
ubi参数:
ubi.mtd=rootfs ubi.mtd=firmware,0,20,1 ubi.mtd=/dev/mtdX,0,20,2 ubi.block=0,rootfs root=/dev/ubiblock0_0 rootfstype=squashfs
mtd分区参数:
mtdparts=spi0.1:512k(boot)ro,512k(misc),512k(pstore),512k(factory),1536k(rp),4608k(system)ro,4608k(recovery)ro,5632k(rootfs),40m(firmware),-(data)
ubi.mtd=firmware,0,20,1
firmware:mtd分区名,也可以是8,或者是/dev/mtd8
0:vid_hdr_offs
20:每1024块中保留块的数量,当前ubi保留块的数量为:20*nand_size_in_blocks/1024
1:ubi设备号
ubi_mtd_param_parse 函数对该字符串进行解析并据此实例化全局数组 mtd_dev_param。
// drivers/mtd/ubi/build.c
/*
 * Parse one "mtd=" parameter string and record it in the next free slot of
 * the global mtd_dev_param[] array (indexed by mtd_devs).
 *
 * @val: parameter text: <name|num|path>[,<vid_hdr_offs>[,max_beb_per1024[,ubi_num]]]
 * @kp:  kernel parameter descriptor (not referenced in the lines shown here)
 *
 * Returns 0 on the path shown; the error paths are in the elided parts.
 */
static int ubi_mtd_param_parse(const char *val, const struct kernel_param *kp)
{
int i, len;
struct mtd_dev_param *p;
char buf[MTD_PARAM_LEN_MAX]; // scratch copy of the parameter (the article notes the limit is 64 bytes)
char *pbuf = &buf[0];
char *tokens[MTD_PARAM_MAX_COUNT], *token; // one slot per comma-separated field (four fields are read below)
......
for (i = 0; i < MTD_PARAM_MAX_COUNT; i++) // split the string on ","
tokens[i] = strsep(&pbuf, ",");
p = &mtd_dev_param[mtd_devs]; // next free entry of the global parameter array
strcpy(&p->name[0], tokens[0]); // field 0: MTD name, MTD number, or MTD device node path
token = tokens[1]; // field 1: optional vid_hdr_offs
if (token) {
p->vid_hdr_offs = bytes_str_to_int(token);
......
}
token = tokens[2]; // field 2: optional max_beb_per1024
if (token) {
int err = kstrtoint(token, 10, &p->max_beb_per1024);
......
}
token = tokens[3]; // field 3: optional UBI device number; assigned automatically when absent
if (token) {
int err = kstrtoint(token, 10, &p->ubi_num);
......
} else
p->ubi_num = UBI_DEV_NUM_AUTO;
mtd_devs += 1; // one more MTD device queued for attach
return 0;
}
attach mtd设备
ubi_init 函数
// drivers/mtd/ubi/build.c
/*
 * UBI initialisation entry point (excerpt): registers the UBI class and the
 * control misc device, attaches every MTD device recorded in
 * mtd_dev_param[], then initialises the ubiblock layer.
 * Returns 0 on success; error/cleanup labels are in the elided parts.
 */
static int __init ubi_init(void)
{
......
/* Create base sysfs directory and sysfs files */
err = class_register(&ubi_class);
if (err < 0)
return err;
// Register the ubi_ctrl misc device
err = misc_register(&ubi_ctrl_cdev);
if (err) {
pr_err("UBI error: cannot register device\n");
goto out;
}
......
/* Attach MTD devices */
for (i = 0; i < mtd_devs; i++) {
struct mtd_dev_param *p = &mtd_dev_param[i];
struct mtd_info *mtd;
cond_resched();
// Look up the MTD device from the parsed name/number/path
mtd = open_mtd_device(p->name);
......
// Attach the MTD device to UBI; the call is made under ubi_devices_mutex
mutex_lock(&ubi_devices_mutex);
err = ubi_attach_mtd_dev(mtd, p->ubi_num,
p->vid_hdr_offs, p->max_beb_per1024);
mutex_unlock(&ubi_devices_mutex);
......
}
// Initialise the ubiblock devices
err = ubiblock_init();
if (err) {
pr_err("UBI error: block: cannot initialize, error %d\n", err);
/* See comment above re-ubi_is_module(). */
// A ubiblock failure only aborts initialisation when UBI is a module
if (ubi_is_module())
goto out_detach;
}
return 0;
......
}
ubi_attach_mtd_dev 函数
在 open_mtd_device 找到需要绑定的 mtd_info后,将通过 ubi_attach_mtd_dev 函数将 mtd dev 与 ubi dev进行绑定。
io子系统绑定——io_init
根据mtd子系统提供的分区属性对UBI层的基本IO属性进行初始化,如下:
// drivers/mtd/ubi/build.c
/*
 * Derive the basic I/O geometry of a UBI device from its MTD device
 * (excerpt): PEB size/count, bad-block budget, minimal I/O units, aligned
 * header sizes and offsets, LEB start and LEB size, read-only mode.
 *
 * @ubi:             UBI device being attached
 * @max_beb_per1024: expected maximum of bad eraseblocks per 1024 eraseblocks
 *
 * Returns 0 on the path shown; validation failures are in the elided parts.
 */
static int io_init(struct ubi_device *ubi, int max_beb_per1024)
{
......
// PEB size, PEB count and total flash size come straight from the MTD device
ubi->peb_size = ubi->mtd->erasesize;
ubi->peb_count = mtd_div_by_eb(ubi->mtd->size, ubi->mtd);
ubi->flash_size = ubi->mtd->size;
// Bad-block budget, only for MTD devices that can have bad blocks.
// The limit is computed by get_bad_peb_limit() from max_beb_per1024 (helper not shown here).
if (mtd_can_have_bb(ubi->mtd)) {
ubi->bad_allowed = 1;
ubi->bad_peb_limit = get_bad_peb_limit(ubi, max_beb_per1024);
}
......
// UBI's minimal I/O unit is the MTD write size
// (per the article: 1 on NOR flash, the page size on NAND flash)
ubi->min_io_size = ubi->mtd->writesize;
// Headers use the write size shifted right by the MTD sub-page shift
ubi->hdrs_min_io_size = ubi->mtd->writesize >> ubi->mtd->subpage_sft;
......
/* Calculate default aligned sizes of EC and VID headers */
// Each header size is rounded up to hdrs_min_io_size
// (per the article: 64 on NOR flash, the page size on NAND flash)
ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size);
ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size);
......
// VID header offset: by default it directly follows the aligned EC header;
// otherwise the requested offset is split into an aligned part and a shift
if (ubi->vid_hdr_offset == 0)
/* Default offset */
ubi->vid_hdr_offset = ubi->vid_hdr_aloffset =
ubi->ec_hdr_alsize;
else {
ubi->vid_hdr_aloffset = ubi->vid_hdr_offset &
~(ubi->hdrs_min_io_size - 1);
ubi->vid_hdr_shift = ubi->vid_hdr_offset -
ubi->vid_hdr_aloffset;
}
/* Similar for the data offset */
// LEB data starts right after the VID header, rounded up to min_io_size
// (with 64-byte headers: 128 on NOR where min_io_size is 1; presumably
// 2 pages on NAND without sub-pages -- confirm for the target flash)
ubi->leb_start = ubi->vid_hdr_offset + UBI_VID_HDR_SIZE;
ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size);
......
// Limit of erroneous PEBs: 10% of all PEBs, but never below 16
ubi->max_erroneous = ubi->peb_count / 10;
if (ubi->max_erroneous < 16)
ubi->max_erroneous = 16;
......
// LEB size is what remains of a PEB after the header area
ubi->leb_size = ubi->peb_size - ubi->leb_start;
// A non-writable MTD device is attached in read-only mode
if (!(ubi->mtd->flags & MTD_WRITEABLE)) {
ubi_msg(ubi, "MTD device %d is write-protected, attach in read-only mode",
ubi->mtd->index);
ubi->ro_mode = 1;
}
......
return 0;
}
扫描PEB——scan_all
获得基本的属性后, 需扫描 NANDFLASH 所有的可擦除块以获得全局信息。UBI 管理的 NANDFLASH 在每一个可擦除块的头部保存了该可擦除块的属性和映射等信息,所以只需对每一个可擦除块读取一小部分数据即可。
在全局扫描前先创建一个临时的 struct ubi_attach_info 类型的扫描数据结构,扫描所有 PEB,并将每一个扫描到的可擦除块的各类信息实例化到 ubi_attach_info中。
对于每一个 PEB,首先需调用 MTD 层的接口函数 mtd_block_isbad 判断是否是坏块,如果是坏块,增加 ubi_attach_info 中的坏块统计计数 bad_peb_count,如果不是坏块则继续获得可擦除块头 EC 和卷 ID 头 VID。
EC 是 UBI 用于管理各可擦除块擦除次数和磨损均衡的数据结构,其位于各 PEB 的头部,其分布如下图所示:
/*
 * On-flash erase counter (EC) header. Multi-byte fields are big-endian
 * (__be32/__be64); the fields below add up to 64 bytes and the struct is
 * packed.
 */
struct ubi_ec_hdr {
__be32 magic; // Erase counter header magic number (ASCII "UBI#")
__u8 version; // version of UBI implementation which is supposed to accept this UBI image
__u8 padding1[3]; // reserved for future, zeroes
__be64 ec; // the erase counter /* Warning: the current limit is 31-bit anyway! */
__be32 vid_hdr_offset; // where the VID header starts
__be32 data_offset; // where the user data start
__be32 image_seq; // image sequence number
__u8 padding2[32]; // reserved for future, zeroes
__be32 hdr_crc; // erase counter header CRC checksum
} __packed;
VID(volume id)是 UBI 用于保存可擦除块和逻辑卷之间映射关系的数据结构,其位于 FLASH 各擦除块的 vid_hdr_aloffset 处,其分布如下图所示:
/*
 * On-flash volume identifier (VID) header. Multi-byte fields are big-endian
 * (__be32/__be64); the fields below add up to 64 bytes and the struct is
 * packed.
 */
struct ubi_vid_hdr {
__be32 magic; // Volume identifier header magic number (ASCII "UBI!")
__u8 version; // version of UBI implementation which is supposed to accept this UBI image (%UBI_VERSION)
__u8 vol_type; // volume type (%UBI_VID_DYNAMIC or %UBI_VID_STATIC)
__u8 copy_flag; // if this logical eraseblock was copied from another physical eraseblock (for wear-leveling reasons)
__u8 compat; // compatibility of this volume (%0, %UBI_COMPAT_DELETE,%UBI_COMPAT_IGNORE, %UBI_COMPAT_PRESERVE, or %UBI_COMPAT_REJECT)
__be32 vol_id; // ID of this volume
__be32 lnum; // logical eraseblock number
__u8 padding1[4];// reserved for future, zeroes
__be32 data_size; // how many bytes of data this logical eraseblock contains
__be32 used_ebs; // total number of used logical eraseblocks in this volume
__be32 data_pad; // how many bytes at the end of this physical eraseblock are not used
__be32 data_crc; // CRC checksum of the data stored in this logical eraseblock
__u8 padding2[4];// reserved for future, zeroes
__be64 sqnum; // sequence number
__u8 padding3[12];// reserved for future, zeroes
__be32 hdr_crc; // volume identifier header CRC checksum
} __packed;
-
读取EC Head——ubi_io_read_ec_hdr()
//drivers/mtd/ubi/io.c /** * ubi_io_read_ec_hdr - read and check an erase counter header. * @ubi: UBI device description object * @pnum: physical eraseblock to read from * @ec_hdr: a &struct ubi_ec_hdr object where to store the read erase counter * header * @verbose: be verbose if the header is corrupted or was not found * */ int ubi_io_read_ec_hdr(struct ubi_device *ubi, int pnum, struct ubi_ec_hdr *ec_hdr, int verbose) { int err, read_err; uint32_t crc, magic, hdr_crc; dbg_io("read EC header from PEB %d", pnum); ubi_assert(pnum >= 0 && pnum < ubi->peb_count); read_err = ubi_io_read(ubi, ec_hdr, pnum, 0, UBI_EC_HDR_SIZE); ...... magic = be32_to_cpu(ec_hdr->magic); if (magic != UBI_EC_HDR_MAGIC) { if (mtd_is_eccerr(read_err)) return UBI_IO_BAD_HDR_EBADMSG; ...... } crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC); hdr_crc = be32_to_cpu(ec_hdr->hdr_crc); if (hdr_crc != crc) { if (verbose) { ubi_warn(ubi, "bad EC header CRC at PEB %d, calculated %#08x, read %#08x", pnum, crc, hdr_crc); ubi_dump_ec_hdr(ec_hdr); } dbg_bld("bad EC header CRC at PEB %d, calculated %#08x, read %#08x", pnum, crc, hdr_crc); if (!read_err) return UBI_IO_BAD_HDR; else return UBI_IO_BAD_HDR_EBADMSG; } /* And of course validate what has just been read from the media */ err = validate_ec_hdr(ubi, ec_hdr); if (err) { ubi_err(ubi, "validation failed for PEB %d", pnum); return -EINVAL; } /* * If there was %-EBADMSG, but the header CRC is still OK, report about * a bit-flip to force scrubbing on this PEB. */ return read_err ? UBI_IO_BITFLIPS : 0; }
-
读取VID Head——ubi_io_read_vid_hdr()
//drivers/mtd/ubi/io.c /** * ubi_io_read_vid_hdr - read and check a volume identifier header. * @ubi: UBI device description object * @pnum: physical eraseblock number to read from * @vidb: the volume identifier buffer to store data in * @verbose: be verbose if the header is corrupted or wasn't found * */ int ubi_io_read_vid_hdr(struct ubi_device *ubi, int pnum, struct ubi_vid_io_buf *vidb, int verbose) { int err, read_err; uint32_t crc, magic, hdr_crc; struct ubi_vid_hdr *vid_hdr = ubi_get_vid_hdr(vidb); void *p = vidb->buffer; dbg_io("read VID header from PEB %d", pnum); ubi_assert(pnum >= 0 && pnum < ubi->peb_count); read_err = ubi_io_read(ubi, p, pnum, ubi->vid_hdr_aloffset, ubi->vid_hdr_shift + UBI_VID_HDR_SIZE); ...... magic = be32_to_cpu(vid_hdr->magic); if (magic != UBI_VID_HDR_MAGIC) { ...... } crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC); hdr_crc = be32_to_cpu(vid_hdr->hdr_crc); if (hdr_crc != crc) { ...... } err = validate_vid_hdr(ubi, vid_hdr); if (err) { ubi_err(ubi, "validation failed for PEB %d", pnum); return -EINVAL; } return read_err ? UBI_IO_BITFLIPS : 0; }
返回值 | 含义 |
---|---|
UBI_IO_FF | 该PEB的数据全部为0xFF,则为空闲PEB |
UBI_IO_FF_BITFLIPS | 与UBI_IO_FF类似,但是在读取过程中发生了 bit 翻转 |
UBI_IO_BAD_HDR | EC或VID的magic或CRC出错 |
UBI_IO_BAD_HDR_EBADMSG | 与UBI_IO_BAD_HDR类似,但是读取时 MTD 驱动还报告了数据完整性错误(对 NAND 而言即不可纠正的 ECC 错误) |
UBI_IO_BITFLIPS | 读取时检测到 bit 翻转,但头部 CRC 校验仍然通过;该 PEB 需要后续进行 scrub |
- 扫描PEB——scan_peb
//drivers/mtd/ubi/attach.c
/*
 * Scan one physical eraseblock (excerpt): skip it if it is bad, read its EC
 * header and then its VID header, and file the PEB into the matching
 * container of the attach information (erase/free/corrupted lists, fastmap,
 * or a volume tree).
 *
 * @ubi:  UBI device description object
 * @ai:   attaching information being built
 * @pnum: physical eraseblock number to scan
 * @fast: when set, a bad VID header sets ai->force_full_scan
 *
 * Returns 0 on success and a negative error code on failure.
 */
static int scan_peb(struct ubi_device *ubi, struct ubi_attach_info *ai,
int pnum, bool fast)
{
struct ubi_ec_hdr *ech = ai->ech;
struct ubi_vid_io_buf *vidb = ai->vidb;
struct ubi_vid_hdr *vidh = ubi_get_vid_hdr(vidb);
long long ec;
int err, bitflips = 0, vol_id = -1, ec_err = 0;
dbg_bld("scan PEB %d", pnum);
/* Skip bad physical eraseblocks */
// A bad PEB is only counted in bad_peb_count and otherwise ignored
err = ubi_io_is_bad(ubi, pnum);
if (err < 0)
return err;
else if (err) {
ai->bad_peb_count += 1;
return 0;
}
// Read the EC header
err = ubi_io_read_ec_hdr(ubi, pnum, ech, 0);
if (err < 0)
return err;
switch (err) {
case 0:
break;
case UBI_IO_BITFLIPS: // header is usable, but bit-flips were seen
bitflips = 1;
break;
// EC area is all 0xFF: count the PEB as empty and queue it for erasure
// (per the article the 0 argument means "append to the list tail")
case UBI_IO_FF:
ai->empty_peb_count += 1;
return add_to_list(ai, pnum, UBI_UNKNOWN, UBI_UNKNOWN,
UBI_UNKNOWN, 0, &ai->erase);
// EC area is all 0xFF but with bit-flips: count it as empty and queue it
// for erasure (per the article the 1 argument means "insert at the list head")
case UBI_IO_FF_BITFLIPS:
ai->empty_peb_count += 1;
return add_to_list(ai, pnum, UBI_UNKNOWN, UBI_UNKNOWN,
UBI_UNKNOWN, 1, &ai->erase);
// EC header is corrupted: remember the error, the final decision is
// taken after looking at the VID header
case UBI_IO_BAD_HDR_EBADMSG:
case UBI_IO_BAD_HDR:
ec_err = err;
ec = UBI_UNKNOWN;
bitflips = 1;
break;
default:
ubi_err(ubi, "'ubi_io_read_ec_hdr()' returned unknown code %d",
err);
return -EINVAL;
}
......
/* OK, we've done with the EC header, let's look at the VID header */
// Read the VID header
err = ubi_io_read_vid_hdr(ubi, pnum, vidb, 0);
if (err < 0)
return err;
switch (err) {
case 0:
break;
case UBI_IO_BITFLIPS:
bitflips = 1;
break;
case UBI_IO_BAD_HDR_EBADMSG:
// The EC header failed the same way: the PEB is counted as possibly bad
if (ec_err == UBI_IO_BAD_HDR_EBADMSG)
ai->maybe_bad_peb_count += 1;
/* fall through */
case UBI_IO_BAD_HDR:
if (fast)
ai->force_full_scan = 1;
// With a bad EC header as well, check_corruption() is skipped and the
// PEB is treated like a power-cut victim (err == 0 below)
if (ec_err)
err = 0;
else
err = check_corruption(ubi, vidh, pnum);
if (err < 0)
return err;
else if (!err)
/* This corruption is caused by a power cut */
err = add_to_list(ai, pnum, UBI_UNKNOWN,
UBI_UNKNOWN, ec, 1, &ai->erase);
else
/* This is an unexpected corruption */
err = add_corrupted(ai, pnum, ec);
if (err)
return err;
goto adjust_mean_ec;
// VID area is all 0xFF with bit-flips: queue the PEB for erasure
case UBI_IO_FF_BITFLIPS:
err = add_to_list(ai, pnum, UBI_UNKNOWN, UBI_UNKNOWN,
ec, 1, &ai->erase);
if (err)
return err;
goto adjust_mean_ec;
case UBI_IO_FF:
// VID area is all 0xFF: erase the PEB if the EC header was bad or
// bit-flips were seen, otherwise it goes to the free list
if (ec_err || bitflips)
err = add_to_list(ai, pnum, UBI_UNKNOWN,
UBI_UNKNOWN, ec, 1, &ai->erase);
else
err = add_to_list(ai, pnum, UBI_UNKNOWN,
UBI_UNKNOWN, ec, 0, &ai->free);
if (err)
return err;
goto adjust_mean_ec;
default:
ubi_err(ubi, "'ubi_io_read_vid_hdr()' returned unknown code %d",
err);
return -EINVAL;
}
......
// Both headers are usable: fastmap PEBs and ordinary volume PEBs are
// recorded separately
if (ubi_is_fm_vol(vol_id))
err = add_fastmap(ai, pnum, vidh, ec);
else
err = ubi_add_to_av(ubi, ai, pnum, ec, vidh, bitflips);
......
return 0;
}
/*
 * Scan every PEB from @start to the end of the device, then post-process the
 * collected attach information: compute the mean erase counter, run
 * late_analysis(), and substitute the mean for every unknown erase counter.
 *
 * @ubi:   UBI device description object
 * @ai:    attaching information to fill in
 * @start: first PEB number to scan
 *
 * The EC header buffer and VID buffer allocated here are released on every
 * exit path. Returns 0 on success and a negative error code on failure.
 */
static int scan_all(struct ubi_device *ubi, struct ubi_attach_info *ai,
int start)
{
int err, pnum;
struct rb_node *rb1, *rb2;
struct ubi_ainf_volume *av;
struct ubi_ainf_peb *aeb;
err = -ENOMEM;
ai->ech = kzalloc(ubi->ec_hdr_alsize, GFP_KERNEL);
if (!ai->ech)
return err;
ai->vidb = ubi_alloc_vid_buf(ubi, GFP_KERNEL);
if (!ai->vidb)
goto out_ech;
// Scan all PEBs of this UBI device, starting at 'start'
for (pnum = start; pnum < ubi->peb_count; pnum++) {
cond_resched();
dbg_gen("process PEB %d", pnum);
err = scan_peb(ubi, ai, pnum, false);
if (err < 0)
goto out_vidh;
}
ubi_msg(ubi, "scanning is finished");
/* Calculate mean erase counter */
if (ai->ec_count)
ai->mean_ec = div_u64(ai->ec_sum, ai->ec_count);
// Analyse the device-wide information gathered by the scan
err = late_analysis(ubi, ai);
if (err)
goto out_vidh;
/*
 * In case of unknown erase counter we use the mean erase counter
 * value.
 */
// Applies to the per-volume trees and to the free, corrupted and erase lists
ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb) {
ubi_rb_for_each_entry(rb2, aeb, &av->root, u.rb)
if (aeb->ec == UBI_UNKNOWN)
aeb->ec = ai->mean_ec;
}
list_for_each_entry(aeb, &ai->free, u.list) {
if (aeb->ec == UBI_UNKNOWN)
aeb->ec = ai->mean_ec;
}
list_for_each_entry(aeb, &ai->corr, u.list)
if (aeb->ec == UBI_UNKNOWN)
aeb->ec = ai->mean_ec;
list_for_each_entry(aeb, &ai->erase, u.list)
if (aeb->ec == UBI_UNKNOWN)
aeb->ec = ai->mean_ec;
err = self_check_ai(ubi, ai);
if (err)
goto out_vidh;
ubi_free_vid_buf(ai->vidb);
kfree(ai->ech);
return 0;
out_vidh:
ubi_free_vid_buf(ai->vidb);
out_ech:
kfree(ai->ech);
return err;
}
逻辑卷管理——ubi_read_volume_table
UBI 的逻辑卷有静态卷和动态卷两种,其主要区别是卷的大小是否可以动态伸缩。在每一个物理可擦除块 VID 头的 vol_type 域对其所属的逻辑卷进行了描述。
从另一个维度看,逻辑卷分为内部卷和普通卷两种,内部卷一般是 UBI 的管理卷,其卷号 vol_id 范围在 0x7FFFFFFF - 4096 到 0x7FFFFFFF 之间,而普通卷一般由 UBIFS 使用,是标准意义上的逻辑卷,其 vol_id 从 0 开始编号,且小于 0x7FFFFFFF - 4096。
UBI 的内部卷有 layout volume、 fastmap superblock volume、 fastmap data volume等多种类型,但是只有 layout volume 是必须的,其余都是服务于特殊功能的特定逻辑卷, 暂不详述。
layout 卷是用于描述所有普通卷的信息,其 vol_id 为(0x7FFFFFFF - 4096),每一个普通卷的描述信息构成了 layout 卷的内容,每一个普通卷都由数据结构 struct ubi_vtbl_record 描述,下图展示了其分布情况:
如上图所示,在扫描完成后,属于 layout volume 的 PEB 被标识出来。因为 layout volume 十分重要,UBI 子系统使用两个 PEB 同时保存同样的内容,以确保 layout volume 的数据可靠性。在更新 layout volume 时,使用如下顺序更新:
* a. erase LEB 0;
* b. write new data to LEB 0;
* c. erase LEB 1;
* d. write new data to LEB 1.
在 UBI 子系统扫描完成后,会对两个存储 layout volume 的 PEB 的数据部分也就是 LEB0 和 LEB1 进行校验,如果两个 LEB 内容一致,则通过,如果 LEB0 的内容出错,则使用 LEB1 的内容恢复 LEB0 的数据,如果两个 LEB 的数据都出错则 UBI attach MTD 失败。
layout volume 的每一个 PEB 的内容都是由多个 ubi_vtbl_record 数据结构组成的 volume_table,每一个 ubi_vtbl_record 描述了一个 volume。其中 reserved_pebs描述了为该 volume 保留的最大的 PEB 个数,因为有些 volume 是可动态伸缩的,所以不能使用一个固定的 PEB 来描述; vol_type 域描述了此 volume 是动态卷还是静态卷; name_len 和 name 数组描述了该 volume 的名字,此外 ubi_vtbl_record 数据由 CRC 保护,该 CRC 值保存在 crc 域中以便读出时校验数据正确性。
//drivers/mtd/ubi/vtbl.c
/*
 * Build the in-RAM volume table from the attach information: size the table,
 * create an empty layout volume on an empty device or load the existing one,
 * then initialise the per-volume structures and cross-check them.
 *
 * @ubi: UBI device description object
 * @ai:  attaching information
 *
 * Returns 0 on success and a negative error code on failure.
 */
int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_attach_info *ai)
{
int i, err;
struct ubi_ainf_volume *av;
empty_vtbl_record.crc = cpu_to_be32(0xf116c36b);
/*
 * The number of supported volumes is limited by the eraseblock size
 * and by the UBI_MAX_VOLUMES constant.
 */
ubi->vtbl_slots = ubi->leb_size / UBI_VTBL_RECORD_SIZE;
if (ubi->vtbl_slots > UBI_MAX_VOLUMES)
ubi->vtbl_slots = UBI_MAX_VOLUMES;
ubi->vtbl_size = ubi->vtbl_slots * UBI_VTBL_RECORD_SIZE;
ubi->vtbl_size = ALIGN(ubi->vtbl_size, ubi->min_io_size);
av = ubi_find_av(ai, UBI_LAYOUT_VOLUME_ID);
if (!av) {
// No layout volume: acceptable only on an empty device, where a new
// (empty) volume table is created
if (ai->is_empty) {
ubi->vtbl = create_empty_lvol(ubi, ai);
if (IS_ERR(ubi->vtbl))
return PTR_ERR(ubi->vtbl);
} else {
ubi_err(ubi, "the layout volume was not found");
return -EINVAL;
}
} else {
if (av->leb_count > UBI_LAYOUT_VOLUME_EBS) {
/* This must not happen with proper UBI images */
ubi_err(ubi, "too many LEBs (%d) in layout volume",
av->leb_count);
return -EINVAL;
}
// Build the volume table from the layout volume's LEBs
// (LEB 0 and LEB 1 per the article)
ubi->vtbl = process_lvol(ubi, ai, av);
if (IS_ERR(ubi->vtbl))
return PTR_ERR(ubi->vtbl);
}
// PEBs available to UBI: good PEBs minus corrupted ones
ubi->avail_pebs = ubi->good_peb_count - ubi->corr_peb_count;
/*
 * The layout volume is OK, initialize the corresponding in-RAM data
 * structures.
 */
// Initialise the per-volume information
err = init_volumes(ubi, ai, ubi->vtbl);
if (err)
goto out_free;
err = check_attaching_info(ubi, ai);
if (err)
goto out_free;
return 0;
out_free:
// Undo: drop the volume table and every volume slot (user + internal)
vfree(ubi->vtbl);
for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) {
ubi_fastmap_destroy_checkmap(ubi->volumes[i]);
kfree(ubi->volumes[i]);
ubi->volumes[i] = NULL;
}
return err;
}
WL初始化——ubi_wl_init
WL 子系统为每个 PEB 都实例化一个管理数据结构 struct ubi_wl_entry,该数据结构中的 ec 域用于描述其 PEB 的擦除次数, pnum 域描述其所属的 PEB 的 ID号,其内部的联合体 u 用于将该 ubi_wl_entry 与各类管理数据结构建立连接。
ubi_device 中也有多个容器用于管理 ubi_wl_entry:对于使用中的所有 PEB,都在 used 红黑树中有对应的 ubi_wl_entry 节点;对于读过程中出错的 PEB,都在 erroneous 红黑树中有相应的 ubi_wl_entry 节点;对于各 volume 中使用中的但是读出现 bit 翻转情况的 PEB,在 scrub 红黑树中有相应的节点;对于空闲的 PEB,都在 free 红黑树中有相应的 ubi_wl_entry 节点。此外 UBI 还创建了一个元素类型为 ubi_wl_entry 指针的数组,可通过 PEB ID 号作为索引快速找到相应的 ubi_wl_entry 节点。需要说明的是,需要擦除的 PEB 并没有在上述管理容器中,其擦除完成后会暂时存放于保护队列 pq 中,后续在磨损均衡处继续分析。
WL 初始化过程中,将 ubi_attach_info 数据中的 free 队列上的所有 PEB 创建的 ubi_wl_entry 节点添加到 free 红黑树中;将各 volume 实际使用的所有 PEB 添加到 used 红黑树或 scrub 红黑树中;将所有 erase 队列上的 PEB 提交擦除。
//drivers/mtd/ubi/wl.c
/**
 * ubi_wl_init - initialize the WL sub-system using attaching information.
 * @ubi: UBI device description object
 * @ai: attaching information
 *
 * The excerpt allocates the PEB lookup table, hands the PEBs of the erase
 * list to erase_aeb(), creates wear-leveling entries for the free list and
 * for every PEB used by a volume, and reserves %WL_RESERVED_PEBS.
 *
 * This function returns zero in case of success, and a negative error code in
 * case of failure.
 */
int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai)
{
int err, i, reserved_pebs, found_pebs = 0;
struct rb_node *rb1, *rb2;
struct ubi_ainf_volume *av;
struct ubi_ainf_peb *aeb, *tmp;
struct ubi_wl_entry *e;
......
sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num);
err = -ENOMEM;
// Allocate the lookup table: one pointer slot per PEB, indexed by PEB
// number (the ubi_wl_entry objects themselves are allocated below)
ubi->lookuptbl = kcalloc(ubi->peb_count, sizeof(void *), GFP_KERNEL);
if (!ubi->lookuptbl)
return err;
// Reset the protection queue and the free counter
for (i = 0; i < UBI_PROT_QUEUE_LEN; i++)
INIT_LIST_HEAD(&ubi->pq[i]);
ubi->pq_head = 0;
ubi->free_count = 0;
// Hand every PEB on the erase list to erase_aeb()
// (helper not shown; per the article this submits the PEB for erasure)
list_for_each_entry_safe(aeb, tmp, &ai->erase, u.list) {
cond_resched();
err = erase_aeb(ubi, aeb, false);
if (err)
goto out_free;
found_pebs++;
}
// Create a ubi_wl_entry for each PEB on the free list and add it to the
// free tree
list_for_each_entry(aeb, &ai->free, u.list) {
cond_resched();
e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL);
if (!e) {
err = -ENOMEM;
goto out_free;
}
e->pnum = aeb->pnum;
e->ec = aeb->ec;
ubi_assert(e->ec >= 0);
wl_tree_add(e, &ubi->free);
ubi->free_count++;
ubi->lookuptbl[e->pnum] = e;
found_pebs++;
}
// Create a ubi_wl_entry for each PEB used by a volume and add it to the
// used tree, or to the scrub tree when the PEB is flagged for scrubbing
ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb) {
ubi_rb_for_each_entry(rb2, aeb, &av->root, u.rb) {
cond_resched();
e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL);
if (!e) {
err = -ENOMEM;
goto out_free;
}
e->pnum = aeb->pnum;
e->ec = aeb->ec;
ubi->lookuptbl[e->pnum] = e;
if (!aeb->scrub) {
dbg_wl("add PEB %d EC %d to the used tree",
e->pnum, e->ec);
wl_tree_add(e, &ubi->used);
} else {
dbg_wl("add PEB %d EC %d to the scrub tree",
e->pnum, e->ec);
wl_tree_add(e, &ubi->scrub);
}
found_pebs++;
}
}
......
dbg_wl("found %i PEBs", found_pebs);
// Every good PEB must have been accounted for by one of the loops
ubi_assert(ubi->good_peb_count == found_pebs);
// Reserve WL_RESERVED_PEBS for the WL sub-system itself
reserved_pebs = WL_RESERVED_PEBS;
......
ubi->avail_pebs -= reserved_pebs;
ubi->rsvd_pebs += reserved_pebs;
/* Schedule wear-leveling if needed */
err = ensure_wear_leveling(ubi, 0);
if (err)
goto out_free;
......
}
EBA初始化——ubi_eba_init
EBA(Eraseblock Association,可擦除块绑定)是主要负责维护各逻辑可擦除块和物理可擦除块间映射关系的子系统。
NANDFLASH 在写操作时需先进行整块擦除,即便是只需要写几个字节也要将这个可擦除块全部擦除,此时如果发生异常断电可能导致部分数据丢失。此外 NANDFLASH 的擦除块在使用过程中也有可能变成坏块而无法恢复,导致数据永久性的丢失,这些复杂情况是 NANDFLASH 上运行的文件系统与其他硬盘文件系统的关键差异。 UBI 层最核心的思想就是抽象出逻辑可擦除块的概念,对上屏蔽 NANDFLASH 的特殊性从而呈现出线性的逻辑存储空间,对下管理各物理可擦除块,均衡使用各可擦除块。 同时除了标准的读写操作外,还可对一些关键数据提供原子化的写操作,即先在一个新的可擦除块中完成写操作,然后再动态修改逻辑可擦除块与物理可擦除块的映射关系,即便在写的过程中出现了异常断电,原始数据仍未丢失,从而大大提高了文件系统应对异常断电的能力。
volume 包括了一系列逻辑可擦除块,其与物理可擦除块的映射关系存放在struct ubi_volume 数据结构的 eba_tbl 处,如下图所示:
eba_tbl 其实质上是一个 int 型数组,数组的索引为逻辑可擦除块 LEB 的编号,数组中的值为物理可擦除块 PEB 的编号,对于编号为-1 的情况表征当前逻辑可擦除块还没有被映射到具体的物理可擦除块上。各数组中元素值映射关系可在扫描过程中建立的各 volume 管理的 LEB 红黑树的节点数据中获得。
另外需要额外保留至少一个 PEB 用于 EBA 子系统,所以在建立完各 volume 的 EBA 后,需将全局可用的可擦除块数量(avail_pebs)减少一个,并将保留的可擦除块数量(rsvd_pebs)增加一个。
//drivers/mtd/ubi/eba.c
/**
 * ubi_eba_init - initialize the EBA sub-system using attaching information.
 * @ubi: UBI device description object
 * @ai: attaching information
 *
 * The excerpt builds an EBA table for every existing volume from the scan
 * results, then reserves %EBA_RESERVED_PEBS and, when bad blocks are
 * possible, the PEBs set aside for bad-block handling.
 *
 * This function returns zero in case of success and a negative error code in
 * case of failure.
 */
int ubi_eba_init(struct ubi_device *ubi, struct ubi_attach_info *ai)
{
......
num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT;
for (i = 0; i < num_volumes; i++) {
struct ubi_eba_table *tbl;
vol = ubi->volumes[i];
if (!vol)
continue;
cond_resched();
// Create an EBA table with vol->reserved_pebs entries
// (per the article every entry starts out as UBI_LEB_UNMAPPED)
tbl = ubi_eba_create_table(vol, vol->reserved_pebs);
if (IS_ERR(tbl)) {
err = PTR_ERR(tbl);
goto out_free;
}
// Install the new table as the volume's eba_tbl
ubi_eba_replace_table(vol, tbl);
av = ubi_find_av(ai, idx2vol_id(ubi, i));
if (!av)
continue;
ubi_rb_for_each_entry(rb, aeb, &av->root, u.rb) {
// LEBs numbered at or beyond reserved_pebs do not fit the volume:
// move their PEBs to the erase list
if (aeb->lnum >= vol->reserved_pebs) {
ubi_move_aeb_to_list(av, aeb, &ai->erase);
} else { // otherwise record the LEB -> PEB mapping
struct ubi_eba_entry *entry;
entry = &vol->eba_tbl->entries[aeb->lnum];
entry->pnum = aeb->pnum;
}
}
}
// EBA_RESERVED_PEBS must be set aside for the EBA sub-system itself
if (ubi->avail_pebs < EBA_RESERVED_PEBS) {
ubi_err(ubi, "no enough physical eraseblocks (%d, need %d)",
ubi->avail_pebs, EBA_RESERVED_PEBS);
if (ubi->corr_peb_count)
ubi_err(ubi, "%d PEBs are corrupted and not used",
ubi->corr_peb_count);
err = -ENOSPC;
goto out_free;
}
ubi->avail_pebs -= EBA_RESERVED_PEBS;
ubi->rsvd_pebs += EBA_RESERVED_PEBS;
// Flash that can have bad blocks: also reserve PEBs for bad-block handling
if (ubi->bad_allowed) {
ubi_calculate_reserved(ubi);
// Fewer PEBs available than the desired reserve level: reserve what
// is left and print a warning
if (ubi->avail_pebs < ubi->beb_rsvd_level) {
/* No enough free physical eraseblocks */
ubi->beb_rsvd_pebs = ubi->avail_pebs;
print_rsvd_warning(ubi, ai);
} else
ubi->beb_rsvd_pebs = ubi->beb_rsvd_level;
ubi->avail_pebs -= ubi->beb_rsvd_pebs;
ubi->rsvd_pebs += ubi->beb_rsvd_pebs;
}
......
}
重配卷大小——autoresize
//drivers/mtd/ubi/build.c
/**
 * autoresize - re-size the volume which has the "auto-resize" flag set.
 * @ubi: UBI device description object
 * @vol_id: ID of the volume to re-size
 *
 * This function re-sizes the volume marked by the %UBI_VTBL_AUTORESIZE_FLG in
 * the volume table to the largest possible size. See comments in ubi-header.h
 * for more description of the flag. Returns zero in case of success and a
 * negative error code in case of failure.
 */
static int autoresize(struct ubi_device *ubi, int vol_id)
{
struct ubi_volume_desc desc;
struct ubi_volume *vol = ubi->volumes[vol_id];
int err, old_reserved_pebs = vol->reserved_pebs;
// Nothing can be written in read-only mode, so the resize is skipped
if (ubi->ro_mode) {
ubi_warn(ubi, "skip auto-resize because of R/O mode");
return 0;
}
/*
 * Clear the auto-resize flag in the volume in-memory copy of the
 * volume table, and 'ubi_resize_volume()' will propagate this change
 * to the flash.
 */
ubi->vtbl[vol_id].flags &= ~UBI_VTBL_AUTORESIZE_FLG;
// No PEBs are available for growing the volume
if (ubi->avail_pebs == 0) {
struct ubi_vtbl_record vtbl_rec;
/*
 * No available PEBs to re-size the volume, clear the flag on
 * flash and exit.
 */
vtbl_rec = ubi->vtbl[vol_id];
err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
if (err)
ubi_err(ubi, "cannot clean auto-resize flag for volume %d",
vol_id);
} else {
desc.vol = vol;
// New size = current reserved PEBs + all PEBs still available
err = ubi_resize_volume(&desc,
old_reserved_pebs + ubi->avail_pebs);
if (err)
ubi_err(ubi, "cannot auto-resize volume %d",
vol_id);
}
if (err)
return err;
ubi_msg(ubi, "volume %d (\"%s\") re-sized from %d to %d LEBs",
vol_id, vol->name, old_reserved_pebs, vol->reserved_pebs);
return 0;
}
//drivers/mtd/ubi/vmt.c
/**
 * ubi_resize_volume - re-size volume.
 * @desc: volume descriptor
 * @reserved_pebs: new size in physical eraseblocks
 *
 * This function re-sizes the volume and returns zero in case of success, and a
 * negative error code in case of failure. The caller has to have the
 * @ubi->device_mutex locked.
 */
int ubi_resize_volume(struct ubi_volume_desc *desc, int reserved_pebs)
{
int i, err, pebs;
struct ubi_volume *vol = desc->vol;
struct ubi_device *ubi = vol->ubi;
struct ubi_vtbl_record vtbl_rec;
struct ubi_eba_table *new_eba_tbl = NULL;
int vol_id = vol->vol_id;
if (ubi->ro_mode)
return -EROFS;
dbg_gen("re-size device %d, volume %d to from %d to %d PEBs",
ubi->ubi_num, vol_id, vol->reserved_pebs, reserved_pebs);
// A static volume cannot be shrunk below the LEBs that contain data
if (vol->vol_type == UBI_STATIC_VOLUME &&
reserved_pebs < vol->used_ebs) {
ubi_err(ubi, "too small size %d, %d LEBs contain data",
reserved_pebs, vol->used_ebs);
return -EINVAL;
}
/* If the size is the same, we have nothing to do */
if (reserved_pebs == vol->reserved_pebs)
return 0;
// Create a new EBA table sized for the new number of PEBs
new_eba_tbl = ubi_eba_create_table(vol, reserved_pebs);
if (IS_ERR(new_eba_tbl))
return PTR_ERR(new_eba_tbl);
// Refuse to resize while the volume has more than one reference
spin_lock(&ubi->volumes_lock);
if (vol->ref_count > 1) {
spin_unlock(&ubi->volumes_lock);
err = -EBUSY;
goto out_free;
}
spin_unlock(&ubi->volumes_lock);
/* Reserve physical eraseblocks */
pebs = reserved_pebs - vol->reserved_pebs;
// Growing (pebs > 0): take the extra PEBs from the available pool, copy
// the existing mappings into the new table and install it
if (pebs > 0) {
spin_lock(&ubi->volumes_lock);
if (pebs > ubi->avail_pebs) {
ubi_err(ubi, "not enough PEBs: requested %d, available %d",
pebs, ubi->avail_pebs);
if (ubi->corr_peb_count)
ubi_err(ubi, "%d PEBs are corrupted and not used",
ubi->corr_peb_count);
spin_unlock(&ubi->volumes_lock);
err = -ENOSPC;
goto out_free;
}
ubi->avail_pebs -= pebs;
ubi->rsvd_pebs += pebs;
ubi_eba_copy_table(vol, new_eba_tbl, vol->reserved_pebs);
ubi_eba_replace_table(vol, new_eba_tbl);
spin_unlock(&ubi->volumes_lock);
}
// Shrinking (pebs < 0): unmap the LEBs beyond the new size, give the PEBs
// back to the available pool, then copy the remaining mappings into the
// new table and install it
if (pebs < 0) {
for (i = 0; i < -pebs; i++) {
err = ubi_eba_unmap_leb(ubi, vol, reserved_pebs + i);
if (err)
goto out_acc;
}
spin_lock(&ubi->volumes_lock);
ubi->rsvd_pebs += pebs;
ubi->avail_pebs -= pebs;
ubi_update_reserved(ubi);
ubi_eba_copy_table(vol, new_eba_tbl, reserved_pebs);
ubi_eba_replace_table(vol, new_eba_tbl);
spin_unlock(&ubi->volumes_lock);
}
/*
 * When we shrink a volume we have to flush all pending (erase) work.
 * Otherwise it can happen that upon next attach UBI finds a LEB with
 * lnum > highest_lnum and refuses to attach.
 */
if (pebs < 0) {
err = ubi_wl_flush(ubi, vol_id, UBI_ALL);
if (err)
goto out_acc;
}
/* Change volume table record */
vtbl_rec = ubi->vtbl[vol_id];
vtbl_rec.reserved_pebs = cpu_to_be32(reserved_pebs);
err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
if (err)
goto out_acc;
vol->reserved_pebs = reserved_pebs;
// A dynamic volume counts all of its reserved LEBs as used
if (vol->vol_type == UBI_DYNAMIC_VOLUME) {
vol->used_ebs = reserved_pebs;
vol->last_eb_bytes = vol->usable_leb_size;
vol->used_bytes =
(long long)vol->used_ebs * vol->usable_leb_size;
}
ubi_volume_notify(ubi, vol, UBI_VOLUME_RESIZED);
self_check_volumes(ubi);
return err;
out_acc:
// Roll back the PEB accounting done for a grow request
if (pebs > 0) {
spin_lock(&ubi->volumes_lock);
ubi->rsvd_pebs -= pebs;
ubi->avail_pebs += pebs;
spin_unlock(&ubi->volumes_lock);
}
out_free:
// NOTE(review): when this label is reached through out_acc, new_eba_tbl
// may already have been installed by ubi_eba_replace_table() above --
// confirm that freeing it here is safe and releases the whole table.
kfree(new_eba_tbl);
return err;
}