将块设备添加到系统
调用void blk_register_region()
来将块设备注册到系统中,函数如下所示:
/*
 * blk_register_region - register device numbers devt..(devt+range-1).
 *
 * @range must be nonzero.  Every argument is forwarded unchanged to
 * kobj_map() on the bdev_map table.
 *
 * The hash chain is sorted on range, so that subranges can override.
 */
void blk_register_region(dev_t devt, unsigned long range,
			 struct module *module,
			 struct kobject *(*probe)(dev_t, int *, void *),
			 int (*lock)(dev_t, void *),
			 void *data)
{
	kobj_map(bdev_map, devt, range, module, probe, lock, data);
}
该函数会将块设备添加到bdev_map中,这是一个由内核维护的数据库,包含了所有的块设备。在打开块设备时,必然会调用blkdev_get,而blkdev_get会对bdev_map进行查询。
添加磁盘和分区到系统中
为了将磁盘添加到系统中,使其可用,必须初始化磁盘数据结构并调用void add_disk(struct gendisk *disk)
方法。其中将磁盘和分区添加到系统中的整个调用流程如下图所示:
接下来我们分别研究研究在该流程中的一些重要函数都做了些什么重要的事情。
add_disk(gendisk*)函数
/**
 * add_disk - add partitioning information to kernel list
 * @disk: per-device partitioning information
 *
 * This function registers the partitioning information in @disk
 * with the kernel.
 *
 * Returns nothing. If blk_alloc_devt() fails the function warns and
 * returns early; note that GENHD_FL_UP has already been set on @disk
 * at that point.
 */
// Add the disk (and its partitions) to the system.
void add_disk(struct gendisk *disk)
{
struct backing_dev_info *bdi;
dev_t devt;
int retval;
/* minors == 0 indicates to use ext devt from part0 and should
 * be accompanied with EXT_DEVT flag. Make sure all
 * parameters make sense.
 */
WARN_ON(disk->minors && !(disk->major || disk->first_minor));
WARN_ON(!disk->minors && !(disk->flags & GENHD_FL_EXT_DEVT));
disk->flags |= GENHD_FL_UP;
/*
 * Allocate a device number (dev_t) for the disk via its part0,
 * based on the disk's major/minor information.
 * NOTE(review): the original author stresses that blk_alloc_devt
 * may block - confirm against its definition.
 */
retval = blk_alloc_devt(&disk->part0, &devt);
if (retval) {
WARN_ON(1);
return;
}
disk_to_dev(disk)->devt = devt;
/* ->major and ->first_minor aren't supposed to be
 * dereferenced from here on, but set them just in case.
 */
disk->major = MAJOR(devt);
disk->first_minor = MINOR(devt);
// Call disk_alloc_events to set up the disk's event handling
// (alloc|add|del|release). Per the original note, disk events
// start out blocked.
disk_alloc_events(disk);
// Register the disk's device number with the queue's backing_dev_info.
// NOTE(review): the result of bdi_register_dev is not checked here.
bdi = &disk->queue->backing_dev_info;
bdi_register_dev(bdi, disk_devt(disk));
// Add the disk to bdev_map via blk_register_region.
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
/* Call register_disk to add the disk to the system. Per the original
 * note it mainly:
 *  - marks the partition information of the whole device (partition 0)
 *    as invalid
 *  - calls device_add to add the device to the system
 *  - creates the sysfs directories and files for the device and its
 *    attributes
 *  - emits the uevent announcing the new device (and, if partition
 *    information is available, uevents for the partitions as well)
 */
register_disk(disk);
/*
 * Call blk_register_queue to register the disk's request queue. Per the
 * original note it mainly creates the sysfs directories/files for the
 * queue and its scheduler under the device's sysfs directory and emits
 * a uevent.
 */
blk_register_queue(disk);
/*
 * Take an extra ref on queue which will be put on disk_release()
 * so that it sticks around as long as @disk is there.
 */
WARN_ON_ONCE(!blk_get_queue(disk->queue));
// Link the bdi device into the disk's sysfs directory under the name "bdi";
// a failure is only warned about.
retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
"bdi");
WARN_ON(retval);
/*
 * Call disk_add_events. Per the original note this:
 *  - creates the disk's event attribute files in its sysfs directory
 *  - adds the disk's events to the global disk_events list
 *  - unblocks the disk's events (presumably through
 *    __disk_unblock_events - confirm against disk_add_events)
 */
disk_add_events(disk);
}
register_disk
/*
 * register_disk - excerpt showing only how the partition state is
 * invalidated and the device opened once during registration.
 * The declarations of bdev and err and the rest of the body are elided.
 */
static void register_disk(struct gendisk *disk){
//...... (code elided in this excerpt)
// Validity flag for the partitions on this device: 1 means invalid.
// Per the original note, an invalid state causes the partition table
// to be rescanned on the next open.
bdev->bd_invalidated = 1;
// Open the device with FMODE_READ and no holder (NULL).
err = blkdev_get(bdev, FMODE_READ, NULL);
//...... (code elided in this excerpt)
}
add_partition()
/*
 * add_partition - set up and register the device for partition @partno
 * of @disk.
 *
 * This is an excerpt: the allocation of the hd_struct `p`, the assignment
 * of `ptbl`, and the error-unwinding labels (out_free_stats, out_free_info,
 * out_put, out_del) are elided and not shown here.
 *
 * @disk:   disk the partition belongs to
 * @partno: partition number; used in the device name and as the index
 *          into ptbl->part[]
 * @start:  stored in p->start_sect
 * @len:    stored in p->nr_sects
 * @flags:  if ADDPART_FLAG_WHOLEDISK is set, the whole_disk attribute
 *          file is created as well
 * @info:   optional metadata; when non-NULL it is copied into a freshly
 *          allocated partition_meta_info attached to the partition
 *
 * Returns the new hd_struct on the success path shown here.
 */
struct hd_struct *add_partition(struct gendisk *disk, int partno,
sector_t start, sector_t len, int flags,
struct partition_meta_info *info)
{
struct hd_struct *p;
dev_t devt = MKDEV(0, 0);
struct device *ddev = disk_to_dev(disk);
struct device *pdev;
struct disk_part_tbl *ptbl;
const char *dname;
int err;
/* NOTE(review): the handling of err is in the elided part below */
err = disk_expand_part_tbl(disk, partno);
//...... (code elided in this excerpt)
seqcount_init(&p->nr_sects_seq);
pdev = part_to_dev(p);
/* fill in the partition geometry; alignment values come from the queue limits */
p->start_sect = start;
p->alignment_offset =
queue_limit_alignment_offset(&disk->queue->limits, start);
p->discard_alignment =
queue_limit_discard_alignment(&disk->queue->limits, start);
p->nr_sects = len;
p->partno = partno;
p->policy = get_disk_ro(disk);
/* keep a private copy of the caller's metadata, if any */
if (info) {
struct partition_meta_info *pinfo = alloc_part_info(disk);
if (!pinfo)
goto out_free_stats;
memcpy(pinfo, info, sizeof(*info));
p->info = pinfo;
}
/*
 * Name the partition after the disk: insert a 'p' separator when the
 * disk name already ends in a digit, otherwise just append the number.
 */
dname = dev_name(ddev);
if (isdigit(dname[strlen(dname) - 1]))
dev_set_name(pdev, "%sp%d", dname, partno);
else
dev_set_name(pdev, "%s%d", dname, partno);
device_initialize(pdev);
pdev->class = &block_class;
pdev->type = &part_type;
pdev->parent = ddev;
/* allocate the device number for this partition */
err = blk_alloc_devt(p, &devt);
if (err)
goto out_free_info;
pdev->devt = devt;
/* delay uevent until 'holders' subdir is created */
dev_set_uevent_suppress(pdev, 1);
err = device_add(pdev);
if (err)
goto out_put;
/* preset the error code for the holders-directory failure below */
err = -ENOMEM;
p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
if (!p->holder_dir)
goto out_del;
dev_set_uevent_suppress(pdev, 0);
if (flags & ADDPART_FLAG_WHOLEDISK) {
err = device_create_file(pdev, &dev_attr_whole_disk);
if (err)
goto out_del;
}
/* everything is up and running, commence */
rcu_assign_pointer(ptbl->part[partno], p);
/* suppress uevent if the disk suppresses it */
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
hd_ref_init(p);
return p;
//.... (error-unwinding labels elided in this excerpt)
}
块设备操作
在所有的文件系统中,在获取文件的inode时,对于不是常规文件的特殊文件都会调用init_special_inode,对于块设备文件,该函数会将inode的文件操作函数结构设置为def_blk_fops (定义如下),其中打开文件的函数原型为:blkdev_open
const struct file_operations def_blk_fops 定义:
/*
 * File operations table for block device files; blkdev_open is the
 * handler installed for open.
 */
const struct file_operations def_blk_fops = {
	/* open / close */
	.open		= blkdev_open,
	.release	= blkdev_close,

	/* positioning and data transfer */
	.llseek		= block_llseek,
	.read		= new_sync_read,
	.write		= new_sync_write,
	.read_iter	= blkdev_read_iter,
	.write_iter	= blkdev_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,

	/* mapping and syncing */
	.mmap		= generic_file_mmap,
	.fsync		= blkdev_fsync,

	/* ioctl entry points; the compat one exists only with CONFIG_COMPAT */
	.unlocked_ioctl	= block_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= compat_blkdev_ioctl,
#endif
};
打开文件函数为blkdev_open。其原型为:
/*
 * blkdev_open - open handler for block device files.
 *
 * @inode: inode of the device file being opened
 * @filp:  the file being opened; its f_flags, f_mode and f_mapping are
 *         updated here
 *
 * Translates selected open flags into f_mode bits, looks up the
 * block_device for @inode, points the file's mapping at the block
 * device inode's mapping and finishes the open through blkdev_get().
 *
 * Returns -ENOMEM when bd_acquire() yields no block_device, otherwise
 * whatever blkdev_get() returns.
 */
static int blkdev_open(struct inode *inode, struct file *filp)
{
	struct block_device *blkdev;

	/*
	 * Preserve backwards compatibility and allow large file access
	 * even if userspace doesn't ask for it explicitly. Some mkfs
	 * binary needs it. We might want to drop this workaround
	 * during an unstable branch.
	 */
	filp->f_flags |= O_LARGEFILE;

	/* mirror the relevant open flags into the file mode */
	if (filp->f_flags & O_NDELAY)
		filp->f_mode |= FMODE_NDELAY;

	if (filp->f_flags & O_EXCL)
		filp->f_mode |= FMODE_EXCL;

	/*
	 * NOTE(review): access-mode value 3 is presumably the special
	 * "ioctl only" open mode - confirm.
	 */
	if ((filp->f_flags & O_ACCMODE) == 3)
		filp->f_mode |= FMODE_WRITE_IOCTL;

	blkdev = bd_acquire(inode);
	if (!blkdev)
		return -ENOMEM;

	/* file I/O goes through the block device inode's address space */
	filp->f_mapping = blkdev->bd_inode->i_mapping;

	return blkdev_get(blkdev, filp->f_mode, filp);
}
参数的含义很明显。它完成的工作有:
- 调用bd_acquire(struct inode *)来获取块设备文件的block_device结构。该函数会调用
bdget
尝试从bdev文件系统中查找设备文件对应的inode,如果有就直接返回,如果没有就会分配一个新的inode并且初始化该inode再返回。设备文件的inode会被添加到block_device
的bd_inodes
中。设备对应的block_device也会在这一步被添加到全局的all_bdevs
中 - 设置file的结构
f_mapping
为bdev->bd_inode->i_mapping
(filp->f_mapping = bdev->bd_inode->i_mapping;
)。bdev->bd_inode在inode的创建和初始化中被初始化,具体的函数为alloc_inode
和bdget
。其中的address_space_operations被设置为def_blk_aops,这是和设备交互的接口 - 调用
blkdev_get
。该函数主要的工作是完成设备的打开动作,同时根据传入的模式还可能声明设备的持有者。blkdev_get
的实质性动作是由__blkdev_get
完成的。__blkdev_get
主要做了如下几件事情
- 调用
get_gendisk
获取块设备所对应的通用磁盘结构,这里可能需要查询bdev_map
数据库 - 阻塞磁盘的事件处理。如果是第一次打开该设备,则
- 填充块设备数据结构bd_disk ,bd_queue,bd_contains
- 如果是主设备(即不是分区),则:
- 设置块设备数据结构的bd_part
- 如果提供了
disk->fops->open
则调用 - 如果分区无效(bdev->bd_invalidated),则调用rescan_partitions重新扫描分区
- 如果打开设备时返回了ENOMEDIUM错误,则调用
invalidate_partitions
将所有分区设置为无效
- 如果是分区设备,则:
- 获取主设备的块设备数据结构
- 递归调用
__blkdev_get
,但是这次传入的是主设备的块设备数据结构。本次调用会走第一次打开设备并且是主设备的分支,由于是第一次打开,而且分区信息应该是无效的,这就会走到重新扫描分区的分支。 - 设置块设备数据结构的
bd_contains
(它被设置为主设备的block_device) - 调用
bd_set_size
设置分区的大小信息
- 否则如果不是第一次打开设备,则
- 如果是主设备(这里通过
bdev->bd_contains==bdev
来进行判断),参照该函数之前的流程,只有为主设备时该条件才成立, - 依据
disk->fops->open
的情况进行调用,以及按照不同条件分别调用rescan_partitions
invalidate_partitions
- 增加设备的打开计数,解除对设备事件的阻塞。
- 如果是主设备(这里通过
- 调用
从以上blkdev_open
打开的细节也可以看到,blkdev_open确实会调用驱动所提供的open函数,驱动可以在open中完成打开设备的必要工作。在打开之后设备就可以被使用了。
读写操作
在介绍具体的写操作之前,先研究下与读写操作紧密相关的数据结构
待更新
请求操作
当内核通过address_space_operations中的成员函数向设备发起读写操作时,读写操作都会被转变成一个对块设备的IO请求提交给设备。内核使用数据结构struct bio来表示一个对块设备的IO,其定义如下:
/*
 * main unit of I/O for the block layer and lower layers (ie drivers and
 * stacking drivers)
 *
 * The member order matters: per the comments below, everything from
 * bi_max_vecs onward is preserved by bio_reset(), and bi_inline_vecs
 * must stay the very last member.
 */
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev; /* block device this bio is issued against */
unsigned long bi_flags; /* status, command, etc */
/* bottom bits READ/WRITE,
 * top bits priority
 */
unsigned long bi_rw;
struct bvec_iter bi_iter; /* iterator state; carries bi_sector and bi_size */
/* Number of segments in this BIO after
 * physical address coalescing is performed.
 */
unsigned int bi_phys_segments;
/*
 * To keep track of the max segment size, we account for the
 * sizes of the first and last mergeable segments in this bio.
 */
unsigned int bi_seg_front_size;
unsigned int bi_seg_back_size;
atomic_t bi_remaining; /* NOTE(review): presumably a count of outstanding completions - confirm */
bio_end_io_t *bi_end_io; /* completion callback */
void *bi_private; /* NOTE(review): presumably owner data for bi_end_io - confirm */
#ifdef CONFIG_BLK_DEV_THROTTLING
/* two callback/private-data pairs, only present with block throttling */
bio_throtl_end_io_t *bi_throtl_end_io1;
void *bi_throtl_private1;
bio_throtl_end_io_t *bi_throtl_end_io2;
void *bi_throtl_private2;
#endif
#ifdef CONFIG_BLK_CGROUP
/*
 * Optional ioc and css associated with this bio. Put on bio
 * release. Read comment on top of bio_associate_current().
 */
struct io_context *bi_ioc;
struct cgroup_subsys_state *bi_css;
#endif
/* anonymous union; it has no members unless CONFIG_BLK_DEV_INTEGRITY is defined */
union {
#if defined(CONFIG_BLK_DEV_INTEGRITY)
struct bio_integrity_payload *bi_integrity; /* data integrity */
#endif
};
unsigned short bi_vcnt; /* how many bio_vec's */
/*
 * When using direct-io (O_DIRECT), we can't get the inode from a bio
 * by walking bio->bi_io_vec->bv_page->mapping->host
 * since the page is anon.
 */
struct inode *bi_dio_inode;
/*
 * Everything starting with bi_max_vecs will be preserved by bio_reset()
 */
unsigned short bi_max_vecs; /* max bvl_vecs we can hold */
atomic_t bi_cnt; /* pin count */
struct bio_vec *bi_io_vec; /* the actual vec list */
struct bio_set *bi_pool; /* NOTE(review): presumably the bio_set this bio came from - confirm */
/*
 * We can inline a number of vecs at the end of the bio, to avoid
 * double allocations for a small number of bio_vecs. This member
 * MUST obviously be kept at the very end of the bio.
 */
struct bio_vec bi_inline_vecs[0];
};
这个数据结构中,绝大部分可以参照注释对各个成员的含义进行理解;需要特别注意的是struct bio_vec *bi_io_vec;
成员
/*
 * Request flags. For use in the cmd_flags field of struct request, and in
 * bi_rw of struct bio. Note that some flags are only valid in either one.
 *
 * The enumerators are bit numbers, assigned by position, so inserting or
 * reordering entries changes every value that follows.
 */
enum rq_flag_bits {
/* common flags */
__REQ_WRITE, /* not set, read. set, write */
__REQ_FAILFAST_DEV, /* no driver retries of device errors */
__REQ_FAILFAST_TRANSPORT, /* no driver retries of transport errors */
__REQ_FAILFAST_DRIVER, /* no driver retries of driver errors */
__REQ_SYNC, /* request is sync (sync write or read) */
__REQ_META, /* metadata io request */
__REQ_PRIO, /* boost priority in cfq */
__REQ_DISCARD, /* request to discard sectors */
__REQ_SECURE, /* secure discard (used with __REQ_DISCARD) */
__REQ_WRITE_SAME, /* write same block many times */
__REQ_NOIDLE, /* don't anticipate more IO after this one */
__REQ_INTEGRITY, /* I/O includes block integrity payload */
__REQ_FUA, /* forced unit access */
__REQ_FLUSH, /* request for cache flush */
__REQ_POST_FLUSH_BARRIER,/* cache barrier after a data req */
__REQ_BARRIER, /* marks flush req as barrier */
__REQ_BG, /* background activity */
__REQ_FG, /* foreground activity */
/* bio only flags */
__REQ_RAHEAD, /* read ahead, can fail anytime */
__REQ_THROTTLED, /* This bio has already been subjected to
* throttling rules. Don't do it again. */
/*
 * request only flags
 *
 * Numbering restarts at __REQ_RAHEAD here, so the request-only flags
 * reuse the bit positions of the bio-only flags above: __REQ_SORTED has
 * the same value as __REQ_RAHEAD and __REQ_SOFTBARRIER the same value
 * as __REQ_THROTTLED.
 */
__REQ_SORTED = __REQ_RAHEAD, /* elevator knows about this request */
__REQ_SOFTBARRIER, /* may not be passed by ioscheduler */
__REQ_NOMERGE, /* don't touch this for merging */
__REQ_STARTED, /* drive already may have started this one */
__REQ_DONTPREP, /* don't call prep for this one */
__REQ_QUEUED, /* uses queueing */
__REQ_ELVPRIV, /* elevator private data attached */
__REQ_FAILED, /* set if the request failed */
__REQ_QUIET, /* don't worry about errors */
__REQ_PREEMPT, /* set for "ide_preempt" requests and also
for requests for which the SCSI "quiesce"
state must be ignored. */
__REQ_ALLOCED, /* request came from our alloc pool */
__REQ_COPY_USER, /* contains copies of user pages */
__REQ_FLUSH_SEQ, /* request for flush sequence */
__REQ_IO_STAT, /* account I/O stat */
__REQ_MIXED_MERGE, /* merge of different types, fail separately */
__REQ_PM, /* runtime pm request */
__REQ_HASHED, /* on IO scheduler merge hash */
__REQ_MQ_INFLIGHT, /* track inflight for MQ */
__REQ_URGENT, /* urgent request */
__REQ_NR_BITS, /* stops here */
};
提交读写请求
这里submit_bio
用于递交一个bio到block device层进行I/O操作。
kernel/block/blk-core.c
/**
 * submit_bio - submit a bio to the block device layer for I/O
 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
 * @bio: The &struct bio which describes the I/O
 *
 * ORs @rw into bio->bi_rw, does the page-in/page-out accounting for bios
 * that carry data (optionally logging them when block_dump is set) and
 * then passes @bio on to generic_make_request(), which does most of the
 * work. Both are fairly rough interfaces; @bio must be presetup and
 * ready for I/O.
 */
void submit_bio(int rw, struct bio *bio)
{
	bio->bi_rw |= rw;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio)) {
		unsigned int nr_sectors;

		/*
		 * Sector count used for accounting: a WRITE_SAME is charged
		 * one logical block (size in bytes >> 9), anything else the
		 * bio's own sector count.
		 */
		if (unlikely(rw & REQ_WRITE_SAME))
			nr_sectors = bdev_logical_block_size(bio->bi_bdev) >> 9;
		else
			nr_sectors = bio_sectors(bio);

		if (!(rw & WRITE)) {
			/* reads are additionally charged to the task */
			task_io_account_read(bio->bi_iter.bi_size);
			count_vm_events(PGPGIN, nr_sectors);
		} else {
			count_vm_events(PGPGOUT, nr_sectors);
		}

#ifdef DCHECK_ROOT_FORCE
		check_wrt(rw, bio);
#endif

		/* optional per-bio trace, enabled through block_dump */
		if (unlikely(block_dump)) {
			char name[BDEVNAME_SIZE];
			struct task_struct *task = get_dirty_task(bio);

			printk(KERN_WARNING "%s(%d): %s block %Lu on %s (%u sectors)\n",
			       task->comm, task_pid_nr(task),
			       (rw & WRITE) ? "WRITE" : "READ",
			       (unsigned long long)bio->bi_iter.bi_sector,
			       bdevname(bio->bi_bdev, name),
			       nr_sectors);
		}
	}

	generic_make_request(bio);
}
接下来看看generic_make_request
函数,用于处理一个设备驱动层面的用于I/O的buffer。其中参数struct bio
描述了该buffer在内存中、在设备中的位置。
generic_make_request
用来传递block devices 的I/O请求,所有的I/O所需的信息都被保存在struct bio
数据结构中。generic_make_request
不会有任何状态返回。当当前的request成功或者失败时,使用异步进行单独通知,通常由bio->bi_end_io
所指向的函数完成。generic_make_request
函数调用者必须要保证如下条件:
bi_io_vec
结构所指向的是正确的memory buffer;bi_bdev
和bi_sector
指向正确的地址;bi_end_io
和bi_private
指向一个能对generic_make_request
状态进行处理的函数
generic_make_request
及其调用的驱动在该bio与其他请求合并时可能会使用bi_next
,并且可能通过递归调用generic_make_request将**bio**重新提交到更底层的设备。这也就意味着当调用了->make_request_fn
之后,调用者不应再操作该bio。
/**
*/
void generic_make_request(struct bio *bio)
{
struct bio_list bio_list_on_stack;
if (!generic_make_request_checks(bio))
return;
/* 我们希望函数`->make_request_fn`在同一时刻只被激活一次,当尝试第二次激活或
* 者称之为非当前首次使用者时会报错。所以,我们使用current->bio_list来对函数
* `make_request_fn`所提交的request进行管理;current->bio_list通常也会被
* 用作为标识generic_make_request是否在当前task中被激活的一个flag。
* 如果是NULL,也就意味着没有make_request是激活的,
* 如果不是NULL,那么一个make_request会被激活,一个新的request会被添加到尾部
*/
if (current->bio_list) {
bio_list_add(current->bio_list, bio);
return;
}
/* 对于下面的循环,稍作注释:
* 在进入循环之前`bio->bi->next`其实是NULL(在调用之前都被设置为NULL),所以
* 此时实际上只有一个单独的bio。因此模拟一个较长的bio list,新分配一个指针
* 用于指向bio_list_on_stack,这样就类似于用bio_list_on_stack将bio_list
* 进行了初始化,`->make_request`此时添加了更多的bios可用于递归调用
*/
BUG_ON(bio->bi_next);
bio_list_init(&bio_list_on_stack);
current->bio_list = &bio_list_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
q->make_request_fn(q, bio);
bio = bio_list_pop(current->bio_list);
} while (bio);
current->bio_list = NULL; /* deactivate */
}