文章目录
1. 前言
限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。
2. Linux 存储设备分区
系统启动后,用 fdisk -l
命令去查看磁盘分区的情况,如:
$ sudo fdisk -l
Disk /dev/sda: 200 GiB, 214748364800 bytes, 419430400 sectors
Units: sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disklabel type: dos
Disk identifier: 0x54b75f22
Device Boot Start End Sectors Size Id Type
/dev/sda1 * 2048 411041791 411039744 196G 83 Linux
/dev/sda2 411041792 419430399 8388608 4G 5 Extended
/dev/sda5 411043840 419430399 8386560 4G 82 Linux swap / Solaris
我想大家会好奇,这些磁盘分区是怎么划定的?系统刚启动,用户也还没有手工进行磁盘分区操作,是谁完成了这些工作?本文针对 嵌入式系统
环境,按设备的组织形式和分区设定方式的不同,对系统启动期间几种比较典型的磁盘分区的建立过程做简要介绍。
2.1 MTD(Memory Technology Device) 类设备分区
2.1.1 通过 DTS 设定 MTD 类设备分区
以 TI(Texas Instruments)
的 AM335X
为例,说明 通过 DTS 设定创建磁盘分区
的简要流程。先看 DTS 配置:
gpmc: gpmc@50000000 {
compatible = "ti,am3352-gpmc";
...
nand@0,0 {
compatible = "ti,omap2-nand";
...
/* 通过 DTS 将 NAND 设备划分为 11 个分区 */
partition@0 {
label = "NAND.SPL";
reg = <0x00000000 0x000020000>;
};
partition@1 {
label = "NAND.SPL.backup1";
reg = <0x00020000 0x00020000>;
};
partition@2 {
label = "NAND.SPL.backup2";
reg = <0x00040000 0x00020000>;
};
partition@3 {
label = "NAND.SPL.backup3";
reg = <0x00060000 0x00020000>;
};
partition@4 {
label = "NAND.u-boot-spl-os";
reg = <0x00080000 0x00040000>;
};
partition@5 {
label = "NAND.u-boot";
reg = <0x000C0000 0x00100000>;
};
partition@6 {
label = "NAND.u-boot-env";
reg = <0x001C0000 0x00020000>;
};
partition@7 {
label = "NAND.u-boot-env.backup1";
reg = <0x001E0000 0x00020000>;
};
partition@8 {
label = "NAND.kernel";
reg = <0x00200000 0x00800000>;
};
partition@9 {
label = "NAND.rootfs";
reg = <0x00A00000 0x0D600000>;
};
partition@10 {
label = "NAND.userdata";
reg = <0x0E000000 0>;
};
};
};
通过 DTS 分区配置建立 MTD 存储设备分区流程如下:
omap_nand_probe() /* drivers/mtd/nand/raw/omap2.c */
...
err = mtd_device_register(mtd, NULL, 0); /* include/linux/mtd/mtd.h */
/* @parts = NULL, @nr_parts = 0 */
mtd_device_parse_register(master, NULL, NULL, parts, nr_parts) /* drivers/mtd/mtdcore.c */
ret = parse_mtd_partitions(mtd, types, parser_data);
...
/* 解析 MTD 设备 DTS 设定的分区 */
ret = mtd_part_of_parse(master, &pparts);
...
/* drivers/mtd/ofpart.c */
parse_fixed_partitions() /* DTS 分区解析 */
...
/* 注册 MTD 设备分区到系统 */
err = add_mtd_partitions(master, pparts.parts,
pparts.nr_parts);
...
printk(KERN_NOTICE "Creating %d MTD partitions on \"%s\":\n", nbparts, master->name);
for (i = 0; i < nbparts; i++) {
/* 创建并初始化 MTD 设备分区对象 (struct mtd_part) */
slave = allocate_partition(master, parts + i, i, cur_offset);
...
/*
* . 创建初始化分区 磁盘对象
* . 创建分区 /dev/mtdblockN, /dev/mtdN, /dev/mtdNro 设备
* ......
*/
ret = add_mtd_device(&slave->mtd);
...
}
/* 创建并初始化 MTD 设备分区对象 (struct mtd_part) */
slave = allocate_partition(master, parts + i, i, cur_offset); /* drivers/mtd/mtdpart.c */
....
struct mtd_part *slave;
...
slave = kzalloc(sizeof(*slave), GFP_KERNEL); /* 创建 MTD 分区对象 */
...
/* set up the MTD object for this partition */
slave->mtd.type = parent->type;
...
slave->mtd.size = part->size;
slave->mtd.writesize = parent->writesize;
slave->mtd.writebufsize = parent->writebufsize;
slave->mtd.oobsize = parent->oobsize;
slave->mtd.oobavail = parent->oobavail;
...
/* 设置 MTD 分区 的 操作接口 */
slave->mtd._read = part_read; /* 读接口 */
slave->mtd._write = part_write; /* 写接口 */
...
if (parent->_block_isbad)
slave->mtd._block_isbad = part_block_isbad; /* 坏块判定接口 */
if (parent->_block_markbad)
slave->mtd._block_markbad = part_block_markbad; /* 坏块标记接口 */
...
slave->mtd._erase = part_erase; /* 擦除接口 */
...
/*
* 打印 MTD 设备分区的 区间范围 和 名字:
* [ 1.684146] 0x000000000000-0x000000020000 : "NAND.SPL"
*/
printk(KERN_NOTICE "0x%012llx-0x%012llx : \"%s\"\n", (unsigned long long)slave->offset,
(unsigned long long)(slave->offset + slave->mtd.size), slave->mtd.name);
/* OOB 空间 layout */
mtd_set_ooblayout(&slave->mtd, &part_ooblayout_ops);
...
if (parent->_block_isbad) {
uint64_t offs = 0;
/* 以擦除块为步长遍历该 MTD 分区的空间, 统计 BBT 保留块 和 坏块 数目 */
while (offs < slave->mtd.size) {
if (mtd_block_isreserved(parent, offs + slave->offset)) /* 判定是否为 (用作 BBT 的) 保留块 */
slave->mtd.ecc_stats.bbtblocks++; /* 用作 BBT 的保留块数 +1 */
else if (mtd_block_isbad(parent, offs + slave->offset)) /* 判定是否为坏块 */
slave->mtd.ecc_stats.badblocks++; /* 坏块数 +1 */
offs += slave->mtd.erasesize;
}
}
/*
* . 创建初始化分区 磁盘对象
* . 创建分区 /dev/mtdblockN, /dev/mtdN, /dev/mtdNro 设备
* ......
*/
add_mtd_device() /* drivers/mtd/mtdcore.c */
...
/* 为 MTD 设备分配 minor 设备号 */
i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL);
...
mtd->index = i;
mtd->usecount = 0;
...
mtd->dev.type = &mtd_devtype;
mtd->dev.class = &mtd_class;
mtd->dev.devt = MTD_DEVT(i);
...
error = device_register(&mtd->dev); /* 注册 MTD 分区的字符设备 /dev/mtd%d */
...
list_for_each_entry(not, &mtd_notifiers, list)
not->add(mtd); /* blktrans_notify_add() */
blktrans_notify_add() /* drivers/mtd/mtd_blkdevs.c */
list_for_each_entry(tr, &blktrans_majors, list)
tr->add_mtd(tr, mtd) = mtdblock_add_mtd(tr, mtd) /* drivers/mtd/mtdblock.c */
/* 创建 MTD 块设备对象 */
struct mtdblk_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
...
dev->mbd.mtd = mtd; /* 设定关联的 MTD 设备对象 */
dev->mbd.devnum = mtd->index;
dev->mbd.size = mtd->size >> 9; /* 块设备以 512 字节的扇区为单位进行组织 */
/*
* 绑定 register_mtd_blktrans() 注册的 struct mtd_blktrans_ops:
* &mtdblock_tr, ...
*/
dev->mbd.tr = tr;
if (add_mtd_blktrans_dev(&dev->mbd))
kfree(dev);
add_mtd_blktrans_dev(&dev->mbd) /* drivers/mtd/mtd_blkdevs.c */
...
struct gendisk *gd;
...
list_add_tail(&new->list, &tr->devs);
...
gd = alloc_disk(1 << tr->part_bits);
...
new->disk = gd;
gd->private_data = new;
gd->major = tr->major;
gd->first_minor = (new->devnum) << tr->part_bits;
gd->fops = &mtd_block_ops;
...
/* Create processing workqueue */
new->wq = alloc_workqueue("%s%d", 0, 0,
tr->name, new->mtd->index);
...
INIT_WORK(&new->work, mtd_blktrans_work);
...
device_add_disk(&new->mtd->dev, gd);
...
dev_t devt;
...
retval = blk_alloc_devt(&disk->part0, &devt);
...
disk_to_dev(disk)->devt = devt;
...
disk->major = MAJOR(devt);
disk->first_minor = MINOR(devt);
...
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
register_disk(parent, disk);
blk_register_queue(disk);
...
...
上述过程,观察到如下内核日志:
[ 1.635286] omap-gpmc 50000000.gpmc: GPMC revision 6.0
[ 1.640473] gpmc_mem_init: disabling cs 0 mapped at 0x0-0x1000000
[ 1.648388] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xda
[ 1.654908] nand: Micron MT29F2G08AAD
[ 1.658589] nand: 256 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
[ 1.666252] nand: using OMAP_ECC_BCH8_CODE_HW ECC scheme
[ 1.671692] 11 fixed-partitions partitions found on MTD device omap2-nand.0
[ 1.678704] Creating 11 MTD partitions on "omap2-nand.0":
[ 1.684146] 0x000000000000-0x000000020000 : "NAND.SPL"
[ 1.690415] 0x000000020000-0x000000040000 : "NAND.SPL.backup1"
[ 1.697268] 0x000000040000-0x000000060000 : "NAND.SPL.backup2"
[ 1.704018] 0x000000060000-0x000000080000 : "NAND.SPL.backup3"
[ 1.710719] 0x000000080000-0x0000000c0000 : "NAND.u-boot-spl-os"
[ 1.717798] 0x0000000c0000-0x0000001c0000 : "NAND.u-boot"
[ 1.724932] 0x0000001c0000-0x0000001e0000 : "NAND.u-boot-env"
[ 1.731558] 0x0000001e0000-0x000000200000 : "NAND.u-boot-env.backup1"
[ 1.738956] 0x000000200000-0x000000a00000 : "NAND.kernel"
[ 1.752614] 0x000000a00000-0x00000e000000 : "NAND.rootfs"
[ 1.958515] 0x00000e000000-0x000010000000 : "NAND.userdata"
最后补充一下 register_mtd_blktrans()
注册 struct mtd_blktrans_ops
接口到链表 blktrans_majors
和 MTD 设备事件通知对象 struct mtd_notifier (即 blktrans_notifier)
到链表 mtd_notifiers
的过程,这是前面缺失的部分:
/* drivers/mtd/mtdblock.c */
static struct mtd_blktrans_ops mtdblock_tr = {
.name = "mtdblock",
.major = MTD_BLOCK_MAJOR,
.part_bits = 0,
.blksize = 512,
.open = mtdblock_open,
.flush = mtdblock_flush,
.release = mtdblock_release,
.readsect = mtdblock_readsect,
.writesect = mtdblock_writesect,
.add_mtd = mtdblock_add_mtd,
.remove_dev = mtdblock_remove_dev,
.owner = THIS_MODULE,
};
static int __init init_mtdblock(void)
{
return register_mtd_blktrans(&mtdblock_tr);
}
/* drivers/mtd/mtd_blkdevs.c */
int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
{
struct mtd_info *mtd;
...
/* Register the notifier if/when the first device type is
registered, to prevent the link/init ordering from fucking
us over. */
if (!blktrans_notifier.list.next)
register_mtd_user(&blktrans_notifier);
...
/* 添加到 全局 MTD 设备(增加、移除)消息 通知链表 */
list_add(&new->list, &mtd_notifiers);
...
...
list_add(&tr->list, &blktrans_majors);
...
}
2.1.2 通过 内核命令行 设定 MTD 类设备分区
除了上一小节提到的通过 DTS 指定 MTD 设备分区外,我们还可以通过 内核命令行参数 mtdparts=
来指定 MTD 设备的分区。如:
mtdparts=8000000.nand:128k(NAND.SPL),128k(NAND.SPL.backup1),128k(NAND.SPL.backup2),128k(NAND.SPL.backup3),256k(NAND.u-boot-spl-os),1M(NAND.u-boot),128k(NAND.u-boot-env),128k(NAND.u-boot-env.backup1),8M(NAND.kernel),214M(NAND.rootfs),32M(NAND.userdata)
除了解析接口变为了 parse_cmdline_partitions()
外,其它解析过程与上一小节中描述类似。假设我们仍然使用上一小节中的 OMAP NAND 设备,只是我们使用 内核命令行参数 mtdparts=
的形式,而不是 DTS 来定义设备分区,细节如下:
omap_nand_probe() /* drivers/mtd/nand/raw/omap2.c */
...
err = mtd_device_register(mtd, NULL, 0); /* include/linux/mtd/mtd.h */
/* @parts = NULL, @nr_parts = 0 */
mtd_device_parse_register(master, NULL, NULL, parts, nr_parts) /* drivers/mtd/mtdcore.c */
ret = parse_mtd_partitions(mtd, types, parser_data);
...
/* drivers/mtd/cmdlinepart.c */
parse_cmdline_partitions() /* 解析内核命令行 MTD 分区参数: mtdparts= */
...
/* 注册 MTD 设备分区到系统 */
err = add_mtd_partitions(master, pparts.parts,
pparts.nr_parts);
// 同前面的分析
2.1.3 小结
MTD (Memory Technology Device)
类设备分区,可通过 DTS
或 内核命令行参数 mtdparts=
两种方式进行设定。MTD 设备驱动调用 mtd_device_register()
将设备注册到系统期间,MTD 子系统通过 register_mtd_parser()
注册的 MTD 分区解析器,解析 MTD 的设备分区并注册到系统。
- 关联的
sysfs
目录:
/sys/bus/platform/devices/8000000.nand/mtd/*
/sys/class/mtd/*
- 关联的设备节点:
/dev/mtdN
/dev/mtdNro
/dev/mtdblockN
2.2 块(block) 类设备分区
2.2.1 块(block) 类设备分区解析
通过 块(block)设备驱动
进行管理的 块(block)类设备
的分区,不同于 MTD(Memory Technology Device)
类分区建立过程,它有着不同的方式。
以 Rockchip 的 SFC(Serial Flash Controller) 接口、烧写有 GPT 分区信息的 NAND Flash 分区的建立过程为例,来描述 块(block)类设备
的分区的建立过程。我们只重点关注平台无关的部分:
/* Rockchip 平台相关部分:不用在意 */
ret = rkflash_blk_register(&mytr);
...
ret = register_blkdev(blk_ops->major, blk_ops->name);
...
blk_ops->rq = blk_mq_init_sq_queue(blk_ops->tag_set, &rkflash_mq_ops, 1,
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);
...
rkflash_blk_add_dev(dev, blk_ops, &part);
struct gendisk *gd;
gd = alloc_disk(1 << blk_ops->minorbits);
...
snprintf(gd->disk_name, sizeof(gd->disk_name),
"%s%d", blk_ops->name, dev->devnum); /* "rkflash0" */
...
add_disk(gd); /* 关注的重点 */
/* 平台无关部分 */
add_disk(gd) /* include/linux/genhd.h */
device_add_disk(NULL, disk, NULL); /* block/genhd.c */
...
register_disk(parent, disk);
...
bdev->bd_invalidated = 1;
err = blkdev_get(bdev, FMODE_READ, NULL);
...
res = __blkdev_get(bdev, mode, 0);
...
if (!bdev->bd_openers) {
...
if (!partno) {
...
if (bdev->bd_invalidated) {
if (!ret)
rescan_partitions(disk, bdev); // 见后续分析 (1)
...
}
...
}
...
}
...
...
...
...
// 接前面 (1) 处分析
rescan_partitions(disk, bdev); /* block/partition-generic.c */
...
/*
* 1. 解析 block 设备磁盘分区:
* check_partition() 调用 block/partitions/check.c 分区解析接口表
* @check_part[] 中各接口, 尝试解析 block 磁盘分区到 @state .
*/
if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) // 见后续分析 (2)
return 0;
...
/* add partitions */
/* 2. 添加 block 磁盘所有分区到系统 */
for (p = 1; p < state->limit; p++) {
...
part = add_partition(disk, p, from, size,
state->parts[p].flags,
&state->parts[p].info);
...
}
// 接前面 (2) 处分析
state = check_partition(disk, bdev) /* block/partitions/check.c */
struct parsed_partitions *state;
...
state = allocate_partitions(hd);
...
state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
state->pp_buf[0] = '\0';
state->bdev = bdev;
disk_name(hd, 0, state->name); /* 磁盘名称, 如 rkflash0 */
snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); /* @state->pp_buf: " rkflash0:" */
if (isdigit(state->name[strlen(state->name)-1]))
sprintf(state->name, "p"); /* @state->name: "p" */
/*
* 调用 block/partitions/check.c 分区解析接口表 @check_part[]
* 中各接口, 尝试解析 block 磁盘分区
*/
while (!res && check_part[i]) {
memset(state->parts, 0, state->limit * sizeof(state->parts[0]));
/* 假设 block 设备使用 GPT 分区 */
res = check_part[i++](state); /* efi_partition() */ // 见后续分析 (3)
...
}
...
if (res > 0) {
/*
* 打印 GPT 分区信息 内核日志:
* [ 0.513590] rkflash0: p1 p2 p3 p4 p5 p6
*/
printk(KERN_INFO "%s", state->pp_buf);
free_page((unsigned long)state->pp_buf);
return state;
}
...
// 接前面 (3) 处分析
res = check_part[i++](state);
efi_partition(state) /* block/partitions/efi.c */
...
if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
...
}
pr_debug("GUID Partition Table is valid! Yea!\n");
for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
...
put_partition(state, i+1, start * ssz, size * ssz); /* 提取 GPT 分区信息 */
...
}
到此,块(block)设备
GPT 类型分区的解析过程已经分析完毕。事实上,块设备支持很多类型的分区解析器:
static int (*check_part[])(struct parsed_partitions *) = {
...
#ifdef CONFIG_CMDLINE_PARTITION
/* block/partitions/cmdline.c: 内核命令行参数 "blkdevparts=" */
cmdline_partition,
#endif
#ifdef CONFIG_EFI_PARTITION
/* block/partitions/efi.c */
efi_partition, /* this must come before msdos */
#endif
#ifdef CONFIG_SGI_PARTITION
/* block/partitions/sgi.c */
sgi_partition,
#endif
#ifdef CONFIG_LDM_PARTITION
/* block/partitions/ldm.c */
ldm_partition, /* this must come before msdos */
#endif
#ifdef CONFIG_MSDOS_PARTITION
/* block/partitions/msdos.c */
msdos_partition, /* MS DOS 分区 */
#endif
...
NULL /* NULL 表示结尾 */
};
可以看到,块(block)设备
支持很多类型的分区,感兴趣的读者可自行查阅相关源码。
2.2.2 块设备 sysfs
/sys/class/block/*
/sys/devices/virtual/block/*
/dev/block/*