1,总体架构:
块设备驱动框架是Linux设备驱动中最重要的框架之一,涉及内核的vfs,设备驱动模型等模块,是内核中异常复杂的一个框架。我们先看一下块设备设计的主要框架结构,先从总体上对块设备有个初步的认识:
2,块设备框架分析
1,块设备的表示gendisk:
内核使用 struct gendisk 结构实例来表示一个块设备。一个块设备通常表示一个物理磁盘,每个块设备逻辑上可以被分成多个分区,每个分区用 struct hd_struct 结构表示。
/* Represents one block device (usually a whole physical disk) inside the kernel. */
struct gendisk {
/* major, first_minor and minors are input parameters only,
* don't use directly. Use disk_devt() and disk_max_parts().
*/
int major; /* major number of driver */ // major device number
int first_minor; // first minor device number
int minors; /* maximum number of minors, =1 for
* disks that can't be partitioned. */ // number of minor device numbers
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, umode_t *mode);
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
* helpers.
*/
struct disk_part_tbl __rcu *part_tbl; // partition table
struct hd_struct part0; // partition covering the whole disk; part_tbl[0] points to part0
const struct block_device_operations *fops; // block device operations
struct request_queue *queue; // request queue (used for request merging, asynchronous I/O, elevator scheduling, etc.)
void *private_data; // device-specific data
int flags;
struct rw_semaphore lookup_sem;
struct kobject *slave_dir;
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
struct disk_events *ev;
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct kobject integrity_kobj;
#endif /* CONFIG_BLK_DEV_INTEGRITY */
int node_id;
struct badblocks *bb;
struct lockdep_map lockdep_map;
};
/* Represents one partition of a block device (see struct gendisk). */
struct hd_struct {
sector_t start_sect; // starting sector number
/*
* nr_sects is protected by sequence counter. One might extend a
* partition while IO is happening to it and update of nr_sects
* can be non-atomic on 32bit machines with 64bit sector_t.
*/
sector_t nr_sects; // number of sectors
seqcount_t nr_sects_seq;
sector_t alignment_offset;
unsigned int discard_alignment;
struct device __dev; // device-model object
struct kobject *holder_dir;
int policy, partno; // partno: partition number
struct partition_meta_info *info;
#ifdef CONFIG_FAIL_MAKE_REQUEST
int make_it_fail;
#endif
unsigned long stamp;
atomic_t in_flight[2];
#ifdef CONFIG_SMP
struct disk_stats __percpu *dkstats;
#else
struct disk_stats dkstats;
#endif
struct percpu_ref ref;
struct rcu_head rcu_head;
};
内核提供alloc_disk()接口,完成gendisk和hd_struct的分配及初始化(对应的释放通过put_disk()完成)。驱动程序需要提供主从设备号,fops以及设备特定的private_data。
2,块设备的注册:
驱动程序分配并初始化一个块设备之后,通过 add_disk() 接口完成块设备的注册。add_disk()分配一个struct bdev_inode结构,该结构是struct block_device和struct inode的结合体,分别用来链入all_bdevs和inode_hashtable链表中。其中block_device中的bd_disk用来指向注册的gendisk。block_device和inode中分别通过bd_dev和i_rdev记录该gendisk的设备号。至此,上层可以通过设备号从all_bdevs或者inode_hashtable链表中找到该 bdev_inode结构,然后通过block_device结构的bd_disk找到相应的gendisk,调用gendisk的fops操作设备特定的private_data。
/* Combined object that holds a block_device together with its inode. */
struct bdev_inode {
struct block_device bdev; // block-device part
struct inode vfs_inode; // inode part
};
/* Per-device (or per-partition) object that ties a device number to its gendisk. */
struct block_device {
dev_t bd_dev; /* not a kdev_t - it's a search key */ // device number
int bd_openers;
struct inode * bd_inode; /* will die */
struct super_block * bd_super;
struct mutex bd_mutex; /* open/close mutex */
void * bd_claiming;
void * bd_holder;
int bd_holders;
bool bd_write_holder;
#ifdef CONFIG_SYSFS
struct list_head bd_holder_disks;
#endif
struct block_device * bd_contains;
unsigned bd_block_size; // device block size
u8 bd_partno; // partition number
struct hd_struct * bd_part; // partition structure
/* number of times partitions within this device have been opened. */
unsigned bd_part_count;
int bd_invalidated;
struct gendisk * bd_disk; // the gendisk this block_device refers to
struct request_queue * bd_queue; // the device's request queue
struct backing_dev_info *bd_bdi;
struct list_head bd_list; // node in the all_bdevs list
/*
* Private data. You must have bd_claim'ed the block_device
* to use this. NOTE: bd_claim allows an owner to claim
* the same device multiple times, the owner must take special
* care to not mess up bd_private for that case.
*/
unsigned long bd_private;
/* The counter of freeze processes */
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
} __randomize_layout;
/* VFS inode; for a block-device node it carries the device number and a link to the block_device. */
struct inode {
umode_t i_mode;
unsigned short i_opflags;
kuid_t i_uid;
kgid_t i_gid;
unsigned int i_flags;
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
const struct inode_operations *i_op;
struct super_block *i_sb;
struct address_space *i_mapping; // points to i_data: the kernel initializes i_data and by default sets i_mapping to point at it
#ifdef CONFIG_SECURITY
void *i_security;
#endif
/* Stat data, not accessed from path walking */
unsigned long i_ino;
/*
* Filesystems may only read i_nlink directly. They shall use the
* following functions for modification:
*
* (set|clear|inc|drop)_nlink
* inode_(inc|dec)_link_count
*/
union {
const unsigned int i_nlink;
unsigned int __i_nlink;
};
dev_t i_rdev; // device number
loff_t i_size;
struct timespec64 i_atime;
struct timespec64 i_mtime;
struct timespec64 i_ctime;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned short i_bytes;
u8 i_blkbits;
u8 i_write_hint;
blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif
/* Misc */
unsigned long i_state;
struct rw_semaphore i_rwsem;
unsigned long dirtied_when; /* jiffies of first dirtying */
unsigned long dirtied_time_when;
struct hlist_node i_hash; // NOTE(review): this hlist_node appears to be the inode_hashtable chain node — confirm against fs.h
struct list_head i_io_list; /* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
struct bdi_writeback *i_wb; /* the associated cgroup wb */
/* foreign inode detection, see wbc_detach_inode() */
int i_wb_frn_winner;
u16 i_wb_frn_avg_time;
u16 i_wb_frn_history;
#endif
struct list_head i_lru; /* inode LRU list */
struct list_head i_sb_list; // NOTE(review): originally annotated as the inode_hashtable node; that role appears to belong to i_hash, while this looks like the link in the owning super_block's inode list — confirm
struct list_head i_wb_list; /* backing dev writeback list */
union {
struct hlist_head i_dentry;
struct rcu_head i_rcu;
};
atomic64_t i_version;
atomic_t i_count;
atomic_t i_dio_count;
atomic_t i_writecount;
#ifdef CONFIG_IMA
atomic_t i_readcount; /* struct files open RO */
#endif
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct file_lock_context *i_flctx;
struct address_space i_data; // the kernel initializes its a_ops method set
struct list_head i_devices;
union {
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev; // points to the block_device structure
struct cdev *i_cdev;
char *i_link;
unsigned i_dir_seq;
};
__u32 i_generation;
#ifdef CONFIG_FSNOTIFY
__u32 i_fsnotify_mask; /* all events this inode cares about */
struct fsnotify_mark_connector __rcu *i_fsnotify_marks;
#endif
#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
struct fscrypt_info *i_crypt_info;
#endif
void *i_private; /* fs or device private pointer */
} __randomize_layout;
3,块设备节点注册:
块设备gendisk分配,初始化并注册进内核之后,此时用户还无法进行访问,必须通过创建块设备节点的方式使得用户可以访问该设备。内核提供类似 ext2_mknod()的方法在文件系统/dev/下创建一个块设备inode,包含该块设备的设备号。这样,用户可以通过打开该块设备节点,根据该节点中的设备号去寻找相应的bdev。
4,块设备的打开和读写:
块设备的打开,读写遵循linux的vfs框架。这里简要描述一下块设备的打开在vfs层中的逻辑:用户调用系统调用open()打开一个块设备的时候,kernel调用vfs_open(),通过dentry加载设备文件inode;由于设备文件inode的i_mode为S_IFBLK,设备文件inode的i_fop被赋值为def_blk_fops;随后通过调用def_blk_fops的open()方法,查找到注册进内核的bdev,将设备inode的i_bdev指向块设备的bdev,并将file的f_mapping和设备inode的i_mapping都指向bdev_inode的i_mapping。
当用户通过打开的文件file去读写时,通过标准的vfs操作,调用vfs_read()或者vfs_write()。这两个函数调用def_blk_fops的read_iter()和write_iter()方法。这两个方法内部调用file->f_mapping->a_ops->readpage()或者file->f_mapping->a_ops->writepage()实现文件的读写。readpage()和writepage()最终调用submit_bh()构建request并提交request_queue。
3,块设备驱动编写
从上面块设备框架分析可以看出,块设备涉及的链路非常长,光是嵌入在vfs层中的逻辑就很错综复杂,更何况还有设备的异步请求队列的机制等等。好在linux帮助我们实现了健壮而又灵活的框架,使得我们编写一个块设备驱动程序相对容易许多。
(1)首先,由于历史原因,块设备的物理结构被设计成磁头,柱面和扇区的组合。内核使用512字节作为内部扇区大小参与计算等,但硬件可能实现512,1024和2048等作为扇区大小,因此,设备驱动程序在同内核接口交互时,要注意扇区大小的转换。
(2)块设备驱动程序一般通过alloc_disk()和add_disk()接口分配和注册。需要手动设置gendisk的fops(用于设备打开,关闭和配置ioctl等的方法),及 private_data。
(3)使用内核接口分配并初始化块设备的request_queue。如果读写操作使用设备的request_queue,需要提供request_fn方法,以便进行实际的request处理。如果读写操作不使用request_queue,可以直接提供make_request_fn方法,这样读写操作在调用submit_bh()方法时,会立即调用自定义的make_request_fn,实现数据的及时读写。
示例程序后续提供。