块设备概述
struct bdev_inode 用来表示块设备,其数据结构如上图所示。如果找到该块设备在文件系统中的inode,就可以找到该块设备对应的struct block_device,进而就可以对设备进行操作了。
块设备的设备文件file_operations结构中提供的是通用的读写函数,而不是驱动自己的读写函数。用户发出的读写指令,其实质是读写缓冲区的数据。块驱动才是真正负责缓冲区数据和物理数据之间的交换,所有的真实读写物理设备都是在设备读写请求完成之后被执行的,因而块设备的主要工作就是完成请求的处理。这里主要可以分为两部分:
- 对很多请求的管理
- 指向该队列调度器的指针,调度器用于管理该请求队列上的请求
- 为该请求队列分配请求结构的内存池。该内存池用于为该设备分配读写请求结构
- 请求队列的设置信息
- 对单一请求中所包含的操作的管理
- 该请求的gendisk 和hd_struct
- 该请求的bio信息
- 请求完成时的回调函数指针
block 设备的结构图
注册块设备
注册
/* Register a block device under the given major number and name; passing major == 0 asks the function to allocate a major number for the device. */
int register_blkdev(unsigned int major, const char *name);
- major:该设备的主设备号,如果为0,则该函数会为该设备分配一个主设备号。
- name:该设备的名字
解除注册
/* Unregister a block device, identified by its major number and name. NOTE(review): newer kernels appear to declare this as returning void - confirm against the target kernel's headers. */
int unregister_blkdev(unsigned int major, const char *name);
块设备数据结构
块设备
/*
 * Presents one block device to drivers. A disk may be covered by several
 * of these: one for the whole disk and one for each partition.
 */
struct block_device {
dev_t bd_dev; /* not a kdev_t - it's a search key; the device number */
int bd_openers; /* number of times the device has been opened */
struct inode * bd_inode; /* will die; the inode of this device */
struct super_block * bd_super; /* superblock of the filesystem this device lives in, i.e. the bdev superblock */
struct mutex bd_mutex; /* open/close mutex */
struct list_head bd_inodes; /* list head: inodes of every device file that represents this block device */
void * bd_claiming; /* whoever is currently trying to claim the device */
void * bd_holder; /* whoever currently holds the device */
int bd_holders; /* number of holders of the device */
bool bd_write_holder; /* whether the device is held for writing */
#ifdef CONFIG_SYSFS
struct list_head bd_holder_disks;
#endif
/*
 * The block device this one belongs to (for a partition: the whole disk).
 * NOTE(review): the original note said this is NULL when the device itself
 * represents the whole disk; mainline appears to point it back at the
 * device itself in that case - confirm against fs/block_dev.c.
 */
struct block_device * bd_contains;
unsigned bd_block_size; /* block size of this device */
struct hd_struct * bd_part; /* the hd_struct describing this block device */
/* number of times partitions within this device have been opened. */
unsigned bd_part_count; /* while non-zero the partitions are in use, so the partition table cannot be rescanned */
int bd_invalidated; /* 1 means the partition information is invalid and will be rescanned on the next open */
struct gendisk * bd_disk; /* the gendisk this device belongs to */
struct request_queue * bd_queue; /* request queue of this device */
struct list_head bd_list; /* links every block device into all_bdevs */
/*
* Private data. You must have bd_claim'ed the block_device
* to use this. NOTE: bd_claim allows an owner to claim
* the same device multiple times, the owner must take special
* care to not mess up bd_private for that case.
*/
unsigned long bd_private; /* private data for the current holder of the device */
/* The counter of freeze processes */
int bd_fsfreeze_count;
/* Mutex for freeze */
struct mutex bd_fsfreeze_mutex;
};
通用磁盘和分区数据结构
struct block_device用于向驱动程序呈现一个块设备,而另外一个数据结构struct gendisk 则表示整个磁盘,一个磁盘可能包括多个struct block_device类型的设备
/* Represents one whole disk, including its partition table and request queue. */
struct gendisk {
/* major, first_minor and minors are input parameters only,
* don't use directly. Use disk_devt() and disk_max_parts().
*/
int major; /* major number of driver */
int first_minor;/* first minor number of the disk */
/* maximum number of minor numbers for the disk; 1 means the disk cannot be partitioned */
int minors; /* maximum number of minors, =1 for
* disks that can't be partitioned. */
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, umode_t *mode);/* callback that yields the devnode of the device */
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
* helpers.
*/
struct disk_part_tbl __rcu *part_tbl;/* table holding all partitions of this disk */
/* Partition 0 of the disk. It is not a real partition: it stands for the whole
disk (the master device). Element 0 of the partition table points at it; when
the disk has partitions, they start at index 1 of that table. */
struct hd_struct part0;
const struct block_device_operations *fops;/* operations of this disk */
struct request_queue *queue;/* used for request queue management */
void *private_data;
int flags;/* disk flags describing the state of the disk */
struct device *driverfs_dev; /* the device this disk belongs to */ // FIXME: remove
struct kobject *slave_dir;/* used to create the "slaves" directory for this disk in sysfs */
struct timer_rand_state *random;
atomic_t sync_io; /* RAID */
struct disk_events *ev;/* used to detect disk events */
#ifdef CONFIG_BLK_DEV_INTEGRITY
struct blk_integrity *integrity;
#endif
int node_id;/* NUMA node used by this structure */
};
分区数据结构struct hd_struct定义如下
/* Describes one partition of a disk. */
struct hd_struct {
sector_t start_sect;/* number of the first sector */
/*
* nr_sects is protected by sequence counter. One might extend a
* partition while IO is happening to it and update of nr_sects
* can be non-atomic on 32bit machines with 64bit sector_t.
*/
sector_t nr_sects;/* number of sectors in this partition; for partition 0 this is the sector count of the whole disk, i.e. the disk capacity */
seqcount_t nr_sects_seq;
sector_t alignment_offset;
unsigned int discard_alignment;
struct device __dev;/* device structure corresponding to this partition */
struct kobject *holder_dir;/* kobject of the parent device that contains the partition (per the original note; by name possibly the sysfs holders directory - TODO confirm) */
int policy, partno;
struct partition_meta_info *info;
#ifdef CONFIG_FAIL_MAKE_REQUEST
int make_it_fail;
#endif
unsigned long stamp;
atomic_t in_flight[2];
#ifdef CONFIG_SMP
struct disk_stats __percpu *dkstats;
#else
struct disk_stats dkstats;
#endif
atomic_t ref;/* reference count of the partition */
struct rcu_head rcu_head;
};
struct gendisk的实例不能由驱动程序自行分配,必须使用API alloc_disk来分配,该API 完成struct gendisk数据结构的分配和初始化,并调用设备模型的API device_initialize完成设备数据结构的初始化,使用完成之后,必须使用del_gendisk来释放
块设备、通用磁盘以及分区数据结构之间的关系
对于一个磁盘上的每个分区都对应一个struct block_device数据结构,分区的struct block_device会通过bd_contains指向整个磁盘的struct block_device
每一个磁盘只有唯一的一个struct gendisk结构
磁盘上的所有struct block_device都通过bd_disk指向表示磁盘的struct gendisk结构
磁盘的struct gendisk 的part_tbl指向一个struct hd_struct的指针数组,每个数组项对应一个分区的信息,如果一个struct block_device表示的是分区,则它的bd_part指向对应的分区数据结构。分区信息中的__dev以及holder_dir将磁盘的分层信息呈现到了kobject中
块设备操作
字符设备使用了file_operations作为底层所使用的操作函数集,但是块设备不使用该结构;块设备使用的是struct block_device_operations,其定义如下:
/* Operation table used by block devices in place of file_operations. */
struct block_device_operations {
int (*open) (struct block_device *, fmode_t);
void (*release) (struct gendisk *, fmode_t);
int (*rw_page)(struct block_device *, sector_t, struct page *, int rw);
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*direct_access) (struct block_device *, sector_t,
void **, unsigned long *);/* direct access to the device */
unsigned int (*check_events) (struct gendisk *disk,
unsigned int clearing);/* check whether any event has occurred */
/* ->media_changed() is DEPRECATED, use ->check_events() instead */
int (*media_changed) (struct gendisk *);/* called when the media changes; deprecated, use check_events instead */
void (*unlock_native_capacity) (struct gendisk *);/* lift the native capacity limit, to support access beyond EOD */
int (*revalidate_disk) (struct gendisk *);/* revalidate the device */
int (*getgeo)(struct block_device *, struct hd_geometry *);/* get the physical geometry of the device */
/* this callback is with swap_lock and sometimes page table lock held */
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
struct module *owner;
};
请求队列
/* Request queue of a block device: the pending requests plus the callbacks and settings used to process them. */
struct request_queue {
/*
* Together with queue_head for cacheline sharing
*/
/* List linking together the requests on this queue. Each element is a
struct request and represents one read/write request. */
struct list_head queue_head;
/* NOTE(review): the original comment here was a copy of the elevator description
below. By its name and type this looks like the request most recently involved
in a merge - confirm against the kernel sources. */
struct request *last_merge;
/* Scheduling algorithm (elevator) used by this queue; it reorders and optimizes
the requests on the queue for best performance. */
struct elevator_queue *elevator;
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
struct rq_wb *rq_wb;
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other
* blkgs from their own blkg->rl. Which one to use should be
* determined using bio_request_list().
*/
struct request_list root_rl;
/* Request handling function. When the kernel wants the driver to act, e.g. write
data to the device or read data from it, the kernel calls this function. The driver
must therefore provide it; it is the interface between the block framework and the device. */
request_fn_proc *request_fn;
/* Creates new requests. The kernel provides a default version that adds the request
to the request queue and calls request_fn once enough requests are queued. A driver
that does not want the default implementation must supply its own. */
make_request_fn *make_request_fn;
/* Request preparation function. Most drivers do not use it and set it to NULL.
If implemented, it should prepare a request before the request is issued. */
prep_rq_fn *prep_rq_fn;
/* Undoes request preparation; may be called once a request has been completed.
If the preparation function allocated resources, this is the place to free them. */
unprep_rq_fn *unprep_rq_fn;
/* Decides whether an existing request may take more data. Because the queue length
is limited, this check can be used when the queue is full to test whether data can
be added to an existing request; if it can, the data is added. */
merge_bvec_fn *merge_bvec_fn;
/* Notifies the driver that a request has completed when completion is done asynchronously via softirq. */
softirq_done_fn *softirq_done_fn;
/* Function executed when a request times out. */
rq_timed_out_fn *rq_timed_out_fn;
/* Per the original note: checks whether DMA has been exhausted and returns 0 if so.
NOTE(review): confirm the exact return convention against the kernel sources. */
dma_drain_needed_fn *dma_drain_needed;
/* Per the original note: called when the device is busy. NOTE(review): by name this
may instead be used to query whether the low-level driver is busy - confirm. */
lld_busy_fn *lld_busy_fn;
struct blk_mq_ops *mq_ops;
unsigned int *mq_map;
/* sw queues */
struct blk_mq_ctx __percpu *queue_ctx;
unsigned int nr_queues;
unsigned int queue_depth;
/* hw dispatch queues */
struct blk_mq_hw_ctx **queue_hw_ctx;
unsigned int nr_hw_queues;
/*
* Dispatch queue sorting
*/
sector_t end_sector;
struct request *boundary_rq;
/*
* Delayed queue handling
*/
struct delayed_work delay_work;
struct backing_dev_info backing_dev_info;
/*
* The queue owner gets to use this for whatever they like.
* ll_rw_blk doesn't touch it.
*/
void *queuedata;
/*
* various queue flags, see QUEUE_* below
*/
unsigned long queue_flags;
/*
* ida allocated id for this queue. Used to index queues from
* ioctx.
*/
int id;
/*
* queue needs bounce pages for pages above this limit
*/
gfp_t bounce_gfp;
/*
* protects queue structures from reentrancy. ->__queue_lock should
* _never_ be used directly, it is queue private. always use
* ->queue_lock.
*/
spinlock_t __queue_lock;
spinlock_t *queue_lock;
/*
* queue kobject
*/
struct kobject kobj;
/*
* mq queue kobject
*/
struct kobject mq_kobj;
#ifdef CONFIG_PM_RUNTIME
struct device *dev;
int rpm_status;
unsigned int nr_pending;
#endif
/*
* queue settings
*/
unsigned long nr_requests; /* Max # of requests */
unsigned int nr_congestion_on;
unsigned int nr_congestion_off;
unsigned int nr_batching;
unsigned int dma_drain_size;
void *dma_drain_buffer;
unsigned int dma_pad_mask;
unsigned int dma_alignment;
struct blk_queue_tag *queue_tags;
struct list_head tag_busy_list;
unsigned int nr_sorted;
unsigned int in_flight[2];
#ifdef CONFIG_WBT
struct blk_rq_stat rq_stats[4];
#endif
/*
* Number of active block driver functions for which blk_drain_queue()
* must wait. Must be incremented around functions that unlock the
* queue_lock internally, e.g. scsi_request_fn().
*/
unsigned int request_fn_active;
unsigned int rq_timeout;
struct timer_list timeout;
struct list_head timeout_list;
struct list_head icq_list;
#ifdef CONFIG_BLK_CGROUP
DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS);
struct blkcg_gq *root_blkg;
struct list_head blkg_list;
#endif
struct queue_limits limits;
/*
* sg stuff
*/
unsigned int sg_timeout;
unsigned int sg_reserved_size;
int node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
struct blk_trace *blk_trace;
#endif
/*
* for flush operations
*/
struct blk_flush_queue *fq;
struct list_head requeue_list;
spinlock_t requeue_lock;
struct work_struct requeue_work;
struct mutex sysfs_lock;
int bypass_depth;
int mq_freeze_depth;
#if defined(CONFIG_BLK_DEV_BSG)
bsg_job_fn *bsg_job_fn;
int bsg_job_size;
struct bsg_class_device bsg_dev;
#endif
#ifdef CONFIG_BLK_DEV_THROTTLING
/* Throttle data */
struct throtl_data *td;
#endif
struct rcu_head rcu_head;
wait_queue_head_t mq_freeze_wq;
struct percpu_ref mq_usage_counter;
struct list_head all_q_node;
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
unsigned long bw_timestamp;
unsigned long last_ticks;
sector_t last_sects[2];
unsigned long last_ios[2];
sector_t disk_bw;
unsigned long disk_iops;
};