Linux: 启动阶段磁盘分区的建立过程

1. 前言

限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。

2. Linux 存储设备分区

系统启动后,用 fdisk -l 命令去查看磁盘分区的情况,如:

$ sudo fdisk -l
Disk /dev/sda: 200 GiB, 214748364800 bytes, 419430400 sectors
Units: sectors of 1 * 512 = 512 bytes
Sector size (logical/physical): 512 bytes / 512 bytes
I/O size (minimum/optimal): 512 bytes / 512 bytes
Disklabel type: dos
Disk identifier: 0x54b75f22

Device     Boot     Start       End   Sectors  Size Id Type
/dev/sda1  *         2048 411041791 411039744  196G 83 Linux
/dev/sda2       411041792 419430399   8388608    4G  5 Extended
/dev/sda5       411043840 419430399   8386560    4G 82 Linux swap / Solaris

我想大家会好奇,这些磁盘分区是怎么划定的?系统刚启动,用户还没有手工进行磁盘分区操作,是谁完成了这些工作?本文针对 嵌入式系统 环境,按设备的组织形式、分区的设定方式的不同,对系统启动期间几种比较典型的磁盘分区的建立过程做简要介绍。

2.1 MTD(Memory Technology Device) 类设备分区

2.1.1 通过 DTS 设定 MTD 类设备分区

以 TI(Texas Instruments)AM335X 为例,说明 通过 DTS 设定创建磁盘分区 的简要流程。先看 DTS 配置:

gpmc: gpmc@50000000 {
	compatible = "ti,am3352-gpmc";
	...
	nand@0,0 {
		compatible = "ti,omap2-nand";
		...
		/* 通过 DTS 将 NAND 设备划分为 11 个分区 */
		partition@0 {
			label = "NAND.SPL";
			reg = <0x00000000 0x000020000>;
		};
		partition@1 {
			label = "NAND.SPL.backup1";
			reg = <0x00020000 0x00020000>;
		};
		partition@2 {
			label = "NAND.SPL.backup2";
			reg = <0x00040000 0x00020000>;
		};
		partition@3 {
			label = "NAND.SPL.backup3";
			reg = <0x00060000 0x00020000>;
		};
		partition@4 {
			label = "NAND.u-boot-spl-os";
			reg = <0x00080000 0x00040000>;
		};
		partition@5 {
			label = "NAND.u-boot";
			reg = <0x000C0000 0x00100000>;
		};
		partition@6 {
			label = "NAND.u-boot-env";
			reg = <0x001C0000 0x00020000>;
		};
		partition@7 {
			label = "NAND.u-boot-env.backup1";
			reg = <0x001E0000 0x00020000>;
		};
		partition@8 {
			label = "NAND.kernel";
			reg = <0x00200000 0x00800000>;
		};
		partition@9 {
			label = "NAND.rootfs";
			reg = <0x00A00000 0x0D600000>;
		};
		partition@10 {
			label = "NAND.userdata";
			reg = <0x0E000000 0>;
		};
	};
};

通过 DTS 分区配置建立 MTD 存储设备分区流程如下:

omap_nand_probe() /* drivers/mtd/nand/raw/omap2.c */
	...
	err = mtd_device_register(mtd, NULL, 0); /* include/linux/mtd/mtd.h */
		/* @parts = NULL, @nr_parts = 0 */
		mtd_device_parse_register(master, NULL, NULL, parts, nr_parts) /* drivers/mtd/mtdcore.c */
			ret = parse_mtd_partitions(mtd, types, parser_data);
				...
				/* 解析 MTD 设备 DTS 设定的分区 */
				ret = mtd_part_of_parse(master, &pparts);
					...
					/* drivers/mtd/ofpart.c */
					parse_fixed_partitions() /* DTS 分区解析 */
				...
				/* 注册 MTD 设备分区到系统 */
				err = add_mtd_partitions(master, pparts.parts, 
						pparts.nr_parts);
					...
					printk(KERN_NOTICE "Creating %d MTD partitions on \"%s\":\n", nbparts, master->name);

					for (i = 0; i < nbparts; i++) {
						/* 创建并初始化 MTD 设备分区对象 (struct mtd_part) */
						slave = allocate_partition(master, parts + i, i, cur_offset);
						...

						/*
						 * . 创建初始化分区 磁盘对象
						 * . 创建分区 /dev/mtdblockN, /dev/mtdN, /dev/mtdNro 设备
						 * ......
						 */
						ret = add_mtd_device(&slave->mtd);
						...
					}

/* 创建并初始化 MTD 设备分区对象 (struct mtd_part) */
slave = allocate_partition(master, parts + i, i, cur_offset); /* drivers/mtd/mtdpart.c */
	....
	struct mtd_part *slave;
	...

	slave = kzalloc(sizeof(*slave), GFP_KERNEL); /* 创建 MTD 分区对象 */
	...
	
	/* set up the MTD object for this partition */
 	slave->mtd.type = parent->type;
 	...
 	slave->mtd.size = part->size;
 	slave->mtd.writesize = parent->writesize;
	slave->mtd.writebufsize = parent->writebufsize;
	slave->mtd.oobsize = parent->oobsize;
	slave->mtd.oobavail = parent->oobavail;
 	...

	/* 设置 MTD 分区 的 操作接口 */

	slave->mtd._read = part_read; /* 读接口 */
	slave->mtd._write = part_write; /* 写接口 */
	...
	if (parent->_block_isbad)
		slave->mtd._block_isbad = part_block_isbad; /* 坏块判定接口 */
	if (parent->_block_markbad)
		slave->mtd._block_markbad = part_block_markbad; /* 坏块标记接口 */
	...
	slave->mtd._erase = part_erase; /* 擦除接口 */
	...

	/*
	 * 打印 MTD 设备分区的 区间范围 和 名字: 
	 * [    1.684146] 0x000000000000-0x000000020000 : "NAND.SPL"
	 */
	printk(KERN_NOTICE "0x%012llx-0x%012llx : \"%s\"\n", (unsigned long long)slave->offset,
		(unsigned long long)(slave->offset + slave->mtd.size), slave->mtd.name);

	/* OOB 空间 layout */
	mtd_set_ooblayout(&slave->mtd, &part_ooblayout_ops);
	...

	if (parent->_block_isbad) {
		uint64_t offs = 0;

		/*
		 * Walk this partition (not the whole parent device) one erase
		 * block at a time: offs runs from 0 up to slave->mtd.size and is
		 * rebased by slave->offset before querying the parent. Count the
		 * reserved (BBT) blocks and the bad blocks found in that range.
		 */
		while (offs < slave->mtd.size) {
			if (mtd_block_isreserved(parent, offs + slave->offset)) /* reserved block (used for the BBT)? */
				slave->mtd.ecc_stats.bbtblocks++; /* one more BBT-reserved block */
			else if (mtd_block_isbad(parent, offs + slave->offset)) /* bad block? */
				slave->mtd.ecc_stats.badblocks++; /* one more bad block */
			offs += slave->mtd.erasesize; /* advance to the next erase block */
		}
	}
	
/*
 * . 创建初始化分区 磁盘对象
 * . 创建分区 /dev/mtdblockN, /dev/mtdN, /dev/mtdNro 设备
 * ......
 */
add_mtd_device() /* drivers/mtd/mtdcore.c */
	...
	/* 为 MTD 设备分配 minor 设备号 */
	i = idr_alloc(&mtd_idr, mtd, 0, 0, GFP_KERNEL);
	...

	mtd->index = i;
	mtd->usecount = 0;

	...

	mtd->dev.type = &mtd_devtype;
	mtd->dev.class = &mtd_class;
	mtd->dev.devt = MTD_DEVT(i);
	...
	error = device_register(&mtd->dev); /* 注册 MTD 分区的字符设备 /dev/mtd%d */
	...

	list_for_each_entry(not, &mtd_notifiers, list)
		not->add(mtd); /* blktrans_notify_add() */
			blktrans_notify_add() /* drivers/mtd/mtd_blkdevs.c */
				list_for_each_entry(tr, &blktrans_majors, list)
					tr->add_mtd(tr, mtd) = mtdblock_add_mtd(tr, mtd) /* drivers/mtd/mtdblock.c */
						/* Allocate the MTD block-device object */
						struct mtdblk_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
						...
						dev->mbd.mtd = mtd; /* the MTD device object this block device wraps */
						dev->mbd.devnum = mtd->index;

						dev->mbd.size = mtd->size >> 9; /* size expressed in 512-byte sectors */
						/*
						 * Bind the struct mtd_blktrans_ops that was registered through
						 * register_mtd_blktrans(): &mtdblock_tr, ...
						 * This must happen before add_mtd_blktrans_dev(): on failure
						 * dev is freed right after that call, so it must not be
						 * written to afterwards.
						 */
						dev->mbd.tr = tr;

						if (add_mtd_blktrans_dev(&dev->mbd))
							kfree(dev);

add_mtd_blktrans_dev(&dev->mbd) /* drivers/mtd/mtd_blkdevs.c */
	...
	struct gendisk *gd;
	...
	list_add_tail(&new->list, &tr->devs);
	...
	gd = alloc_disk(1 << tr->part_bits);
	...
	new->disk = gd;
	gd->private_data = new;
	gd->major = tr->major;
	gd->first_minor = (new->devnum) << tr->part_bits;
	gd->fops = &mtd_block_ops;

	...
	/* Create processing workqueue */
	new->wq = alloc_workqueue("%s%d", 0, 0,
				  tr->name, new->mtd->index);
	...
	INIT_WORK(&new->work, mtd_blktrans_work);
	...

	device_add_disk(&new->mtd->dev, gd);
		...
		dev_t devt;
		...
		retval = blk_alloc_devt(&disk->part0, &devt);
		...
		disk_to_dev(disk)->devt = devt;
		...
		disk->major = MAJOR(devt);
		disk->first_minor = MINOR(devt);
		...
		blk_register_region(disk_devt(disk), disk->minors, NULL,
			    exact_match, exact_lock, disk);
		register_disk(parent, disk);
		blk_register_queue(disk);
		...
	...

上述过程,观察到如下内核日志:

[    1.635286] omap-gpmc 50000000.gpmc: GPMC revision 6.0
[    1.640473] gpmc_mem_init: disabling cs 0 mapped at 0x0-0x1000000
[    1.648388] nand: device found, Manufacturer ID: 0x2c, Chip ID: 0xda
[    1.654908] nand: Micron MT29F2G08AAD
[    1.658589] nand: 256 MiB, SLC, erase size: 128 KiB, page size: 2048, OOB size: 64
[    1.666252] nand: using OMAP_ECC_BCH8_CODE_HW ECC scheme
[    1.671692] 11 fixed-partitions partitions found on MTD device omap2-nand.0
[    1.678704] Creating 11 MTD partitions on "omap2-nand.0":
[    1.684146] 0x000000000000-0x000000020000 : "NAND.SPL"
[    1.690415] 0x000000020000-0x000000040000 : "NAND.SPL.backup1"
[    1.697268] 0x000000040000-0x000000060000 : "NAND.SPL.backup2"
[    1.704018] 0x000000060000-0x000000080000 : "NAND.SPL.backup3"
[    1.710719] 0x000000080000-0x0000000c0000 : "NAND.u-boot-spl-os"
[    1.717798] 0x0000000c0000-0x0000001c0000 : "NAND.u-boot"
[    1.724932] 0x0000001c0000-0x0000001e0000 : "NAND.u-boot-env"
[    1.731558] 0x0000001e0000-0x000000200000 : "NAND.u-boot-env.backup1"
[    1.738956] 0x000000200000-0x000000a00000 : "NAND.kernel"
[    1.752614] 0x000000a00000-0x00000e000000 : "NAND.rootfs"
[    1.958515] 0x00000e000000-0x000010000000 : "NAND.userdata"

最后补充一下 register_mtd_blktrans() 的注册过程:它将 struct mtd_blktrans_ops 接口添加到链表 blktrans_majors,并(在第一次注册时)将 MTD 设备事件通知对象 struct mtd_notifier(即 blktrans_notifier)添加到链表 mtd_notifiers,这是前面缺失的部分:

/* drivers/mtd/mtdblock.c */

/*
 * Block-translation operations table for the "mtdblock" driver.
 * It is handed to register_mtd_blktrans() by init_mtdblock(); its
 * .add_mtd hook (mtdblock_add_mtd) is the callback that creates a block
 * device for an MTD device. .part_bits = 0 and .blksize = 512 are the
 * values this table supplies to the block-translation layer.
 */
static struct mtd_blktrans_ops mtdblock_tr = {
	.name		= "mtdblock",
	.major		= MTD_BLOCK_MAJOR,
	.part_bits	= 0,
	.blksize 	= 512,
	.open		= mtdblock_open,
	.flush		= mtdblock_flush,
	.release	= mtdblock_release,
	.readsect	= mtdblock_readsect,
	.writesect	= mtdblock_writesect,
	.add_mtd	= mtdblock_add_mtd,
	.remove_dev	= mtdblock_remove_dev,
	.owner		= THIS_MODULE,
};

/*
 * Init entry point of the mtdblock driver: registers the mtdblock_tr
 * operations table via register_mtd_blktrans() and returns that call's
 * result unchanged.
 */
static int __init init_mtdblock(void)
{
	return register_mtd_blktrans(&mtdblock_tr);
}

/* drivers/mtd/mtd_blkdevs.c */
int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
{
	struct mtd_info *mtd;
	...

	/* Register the notifier if/when the first device type is
	   registered, to prevent the link/init ordering from fucking
	   us over. */
	if (!blktrans_notifier.list.next)
		register_mtd_user(&blktrans_notifier);
			...
			/* 添加到 全局 MTD 设备(增加、移除)消息 通知链表 */
			list_add(&new->list, &mtd_notifiers);
			...
	...

	list_add(&tr->list, &blktrans_majors);

	...
}

2.1.2 通过 内核命令行 设定 MTD 类设备分区

除了上一小节提到的通过 DTS 指定 MTD 设备分区外,我们还可以通过 内核命令行参数 mtdparts= 来指定 MTD 设备的分区。如:

mtdparts=8000000.nand:128k(NAND.SPL),128k(NAND.SPL.backup1),128k(NAND.SPL.backup2),128k(NAND.SPL.backup3),256k(NAND.u-boot-spl-os),1M(NAND.u-boot),128k(NAND.u-boot-env),128k(NAND.u-boot-env.backup1),8M(NAND.kernel),214M(NAND.rootfs),32M(NAND.userdata)

除了解析接口变为了 parse_cmdline_partitions() 外,其它解析过程与上一小节中描述类似。假设我们仍然使用上一小节中的 OMAP NAND 设备,只是使用 内核命令行参数 mtdparts= 的形式,而不是 DTS 来定义设备分区。细节如下:

omap_nand_probe() /* drivers/mtd/nand/raw/omap2.c */
	...
	err = mtd_device_register(mtd, NULL, 0); /* include/linux/mtd/mtd.h */
		/* @parts = NULL, @nr_parts = 0 */
		mtd_device_parse_register(master, NULL, NULL, parts, nr_parts) /* drivers/mtd/mtdcore.c */
			ret = parse_mtd_partitions(mtd, types, parser_data);
				...
				/* drivers/mtd/cmdlinepart.c */
				parse_cmdline_partitions() /* 解析内核命令行 MTD 分区参数: mtdparts= */
				...
				/* 注册 MTD 设备分区到系统 */
				err = add_mtd_partitions(master, pparts.parts, 
						pparts.nr_parts);
					// 同前面的分析

2.1.3 小结

MTD (Memory Technology Device) 类设备分区,可通过 DTS、内核命令行参数 mtdparts= 两种方式进行设定。MTD 设备驱动调用 mtd_device_register() 将设备注册到系统期间,MTD 子系统通过 register_mtd_parser() 注册的 MTD 分区解析器,解析 MTD 的设备分区并注册到系统。

  • 关联的 sysfs 目录:
/sys/bus/platform/devices/8000000.nand/mtd/*
/sys/class/mtd/*
  • 关联的设备节点:
/dev/mtdN
/dev/mtdNro
/dev/mtdblockN

2.2 块(block) 类设备分区

2.2.1 块(block) 类设备分区解析

通过 块(block)设备驱动 进行管理的 块(block)类设备 的分区,不同于 MTD(Memory Technology Device) 类分区建立过程,它有着不同的方式。
下面以 Rockchip 的 SFC(Serial Flash Controller) 接口、烧写有 GPT 分区信息的 NAND Flash 分区的建立过程为例,来描述 块(block)类设备 的分区的建立过程。我们只重点关注平台无关的部分:

/* Rockchip 平台相关部分:不用在意 */
ret = rkflash_blk_register(&mytr);
	...
	ret = register_blkdev(blk_ops->major, blk_ops->name);
	...
	blk_ops->rq = blk_mq_init_sq_queue(blk_ops->tag_set, &rkflash_mq_ops, 1,
        		BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);
	...
	rkflash_blk_add_dev(dev, blk_ops, &part);
		struct gendisk *gd;

		gd = alloc_disk(1 << blk_ops->minorbits);
		...
		snprintf(gd->disk_name, sizeof(gd->disk_name),
			"%s%d", blk_ops->name, dev->devnum); /* "rkflash0" */
		...
		add_disk(gd); /* 关注的重点 */
/* 平台无关部分 */

add_disk(gd) /* include/linux/genhd.h */
	device_add_disk(NULL, disk, NULL); /* block/genhd.c */
		...
		register_disk(parent, disk);
			...
			bdev->bd_invalidated = 1;
 			err = blkdev_get(bdev, FMODE_READ, NULL);
 				...
 				res = __blkdev_get(bdev, mode, 0);
 					...
 					if (!bdev->bd_openers) {
 						...
 						if (!partno) {
 							...
 							if (bdev->bd_invalidated) {
 								if (!ret)
     									rescan_partitions(disk, bdev); // 见后续分析 (1)
     									...
 							}
 							...
 						}
 						...
 					}
 					...
 				...
 			...
		...

// 接前面 (1) 处分析
rescan_partitions(disk, bdev); /* block/partition-generic.c */
	...
	/*
	 * 1. 解析 block 设备磁盘分区:
  	 * check_partition() 调用 block/partitions/check.c 分区解析接口表 
	 * @check_part[] 中各接口, 尝试解析 block 磁盘分区到 @state .
	 */
	if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) // 见后续分析 (2)
  		return 0;
  	
  	...

	/* add partitions */
	/* 2. 添加 block 磁盘所有分区到系统 */
 	for (p = 1; p < state->limit; p++) {
 		...
 		part = add_partition(disk, p, from, size,
				state->parts[p].flags,
				&state->parts[p].info);
 		...
 	}

// 接前面 (2) 处分析
state = check_partition(disk, bdev) /* block/partitions/check.c */
	struct parsed_partitions *state;
	...

	state = allocate_partitions(hd);
	...
	state->pp_buf = (char *)__get_free_page(GFP_KERNEL);
	state->pp_buf[0] = '\0';

	state->bdev = bdev;
	disk_name(hd, 0, state->name); /* 磁盘名称, 如 rkflash0 */
	snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); /* @state->pp_buf: " rkflash0:" */
	if (isdigit(state->name[strlen(state->name)-1]))
		sprintf(state->name, "p"); /* @state->name: "p" */
	
	/*
	 * 调用 block/partitions/check.c 分区解析接口表 @check_part[] 
	 * 中各接口, 尝试解析 block 磁盘分区 
	 */
	while (!res && check_part[i]) {
		memset(state->parts, 0, state->limit * sizeof(state->parts[0]));
		/* 假设 block 设备的 使用 GPT 分区 */
  		res = check_part[i++](state); /* efi_partition() */ // 见后续分析 (3)
  		...
	}
	...
	if (res > 0) {
		/*
		 * 打印 GPT 分区信息 内核日志:
		 * [    0.513590]  rkflash0: p1 p2 p3 p4 p5 p6
		 */
  		printk(KERN_INFO "%s", state->pp_buf);
  		
  		free_page((unsigned long)state->pp_buf);
  		return state;
  	}
	...

// 接前面 (3) 处分析
res = check_part[i++](state);
	efi_partition(state) /* block/partitions/efi.c */
		...
		
		if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
			...
		}

		pr_debug("GUID Partition Table is valid!  Yea!\n");

		for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) {
			...
			
			put_partition(state, i+1, start * ssz, size * ssz); /* 提取 GPT 分区信息 */

			...
		}

到此,块(block)设备 GPT 类型分区的解析过程已经分析完毕。事实上,块设备支持很多类型的分区解析器:

static int (*check_part[])(struct parsed_partitions *) = {
	...
#ifdef CONFIG_CMDLINE_PARTITION
	/* block/partitions/cmdline.c: 内核命令行参数 "blkdevparts=" */
	cmdline_partition,
#endif
#ifdef CONFIG_EFI_PARTITION
	/* block/partitions/efi.c */
	efi_partition,  /* this must come before msdos */
#endif
#ifdef CONFIG_SGI_PARTITION
	/* block/partitions/sgi.c */
	sgi_partition,
#endif
#ifdef CONFIG_LDM_PARTITION
	/* block/partitions/ldm.c */
	ldm_partition,  /* this must come before msdos */
#endif
#ifdef CONFIG_MSDOS_PARTITION
	/* block/partitions/msdos.c */
	msdos_partition, /* MS DOS 分区 */
#endif
	...
	NULL /* NULL 表示结尾 */
};

可以看到,块(block)设备 支持很多类型的分区,感兴趣的读者可自行查阅相关源码。

2.2.2 块设备 sysfs

/sys/class/block/*
/sys/devices/virtual/block/*

/dev/block/*
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值