块设备驱动(Linux kernel 4.9.x)
主要结构
- gendisk结构体:表示一个独立的磁盘设备(或分区)
1.1 定义如下:
/*
 * struct gendisk - one independent disk device (or partition).
 * This is an excerpt: the "..." line stands for members left out here.
 */
struct gendisk {
/* major, first_minor and minors are input parameters only,
* don't use directly. Use disk_devt() and disk_max_parts().
*/
int major; /* major number of driver */
int first_minor; /* first minor number used by this disk */
int minors; /* maximum number of minors, =1 for
* disks that can't be partitioned. */
char disk_name[DISK_NAME_LEN]; /* name of major driver */
char *(*devnode)(struct gendisk *gd, umode_t *mode);
unsigned int events; /* supported events */
unsigned int async_events; /* async events, subset of all */
/* Array of pointers to partitions indexed by partno.
* Protected with matching bdev lock but stat and other
* non-critical accesses use RCU. Always access through
* helpers.
*/
struct disk_part_tbl __rcu *part_tbl;
struct hd_struct part0;
const struct block_device_operations *fops; /* operations table for this disk */
struct request_queue *queue; /* request queue attached to this disk */
void *private_data; /* opaque pointer for the driver's own use */
...
};
1.2 一组操作gendisk函数
/* Allocate a gendisk; minors is the number of minor numbers to reserve */
struct gendisk *alloc_disk(int minors);
/* Add (register) a gendisk, optionally under a parent device */
void device_add_disk(struct device *parent, struct gendisk *disk);
void add_disk(struct gendisk *disk);
/* Set the capacity of a gendisk (size is a sector count) */
void set_capacity(struct gendisk *disk, sector_t size);
/* Delete the gendisk; a held reference is dropped separately with put_disk() */
void del_gendisk(struct gendisk *gp);
- request、request_queue和bio结构
2.1 request结构体
/*
 * struct request - one I/O request as queued on a request_queue.
 * This is an excerpt: every "..." line stands for members left out here.
 */
struct request {
struct list_head queuelist;
union {
struct call_single_data csd;
u64 fifo_time;
};
struct request_queue *q; /* NOTE(review): presumably the queue this request belongs to */
struct blk_mq_ctx *mq_ctx;
int cpu;
unsigned cmd_type;
unsigned int cmd_flags; /* op and common flags */
req_flags_t rq_flags;
unsigned long atomic_flags;
/* the following two fields are internal, NEVER access directly */
unsigned int __data_len; /* total data len */
sector_t __sector; /* sector cursor */
struct bio *bio; /* NOTE(review): presumably first bio of this request - confirm */
struct bio *biotail; /* NOTE(review): presumably last bio of this request - confirm */
...
...
struct gendisk *rq_disk;
struct hd_struct *part;
unsigned long start_time;
...
...
unsigned short ioprio;
void *special; /* opaque pointer available for LLD use */
int tag;
int errors;
/*
* when request is used as a packet command carrier
*/
unsigned char __cmd[BLK_MAX_CDB];
unsigned char *cmd;
unsigned short cmd_len;
unsigned int extra_len; /* length of alignment and padding */
unsigned int sense_len;
unsigned int resid_len; /* residual count */
void *sense;
unsigned long deadline;
struct list_head timeout_list;
unsigned int timeout;
int retries;
/*
* completion callback.
*/
rq_end_io_fn *end_io;
void *end_io_data;
/* for bidi */
struct request *next_rq;
};
2.2 request_queue结构体
/*
 * struct request_queue - per-device queue of I/O requests plus its settings.
 * This is an excerpt: every "..." line stands for members left out here.
 */
struct request_queue {
/*
* Together with queue_head for cacheline sharing
*/
struct list_head queue_head;
struct request *last_merge;
struct elevator_queue *elevator;
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other
* blkgs from their own blkg->rl. Which one to use should be
* determined using bio_request_list().
*/
struct request_list root_rl;
/*
* Driver-supplied callbacks.
* NOTE(review): request_fn is presumably what blk_init_queue() installs and
* make_request_fn what blk_queue_make_request() installs - confirm.
*/
request_fn_proc *request_fn;
make_request_fn *make_request_fn;
prep_rq_fn *prep_rq_fn;
unprep_rq_fn *unprep_rq_fn;
softirq_done_fn *softirq_done_fn;
rq_timed_out_fn *rq_timed_out_fn;
dma_drain_needed_fn *dma_drain_needed;
lld_busy_fn *lld_busy_fn;
init_rq_fn *init_rq_fn;
exit_rq_fn *exit_rq_fn;
struct blk_mq_ops *mq_ops;
unsigned int *mq_map;
/* sw queues */
struct blk_mq_ctx __percpu *queue_ctx;
unsigned int nr_queues;
/* hw dispatch queues */
struct blk_mq_hw_ctx **queue_hw_ctx;
unsigned int nr_hw_queues;
/*
* Dispatch queue sorting
*/
sector_t end_sector;
struct request *boundary_rq;
...
/*
* The queue owner gets to use this for whatever they like.
* ll_rw_blk doesn't touch it.
*/
void *queuedata;
/*
* various queue flags, see QUEUE_* below
*/
unsigned long queue_flags;
/*
* ida allocated id for this queue. Used to index queues from
* ioctx.
*/
int id;
/*
* queue needs bounce pages for pages above this limit
*/
gfp_t bounce_gfp;
/*
* protects queue structures from reentrancy. ->__queue_lock should
* _never_ be used directly, it is queue private. always use
* ->queue_lock.
*/
spinlock_t __queue_lock;
spinlock_t *queue_lock;
/*
* queue kobject
*/
struct kobject kobj;
/*
* mq queue kobject
*/
struct kobject mq_kobj;
...
...
/*
* queue settings
*/
unsigned long nr_requests; /* Max # of requests */
unsigned int nr_congestion_on;
unsigned int nr_congestion_off;
unsigned int nr_batching;
unsigned int dma_drain_size;
void *dma_drain_buffer;
unsigned int dma_pad_mask;
unsigned int dma_alignment;
struct blk_queue_tag *queue_tags;
struct list_head tag_busy_list;
unsigned int nr_sorted;
unsigned int in_flight[2];
/*
* Number of active block driver functions for which blk_drain_queue()
* must wait. Must be incremented around functions that unlock the
* queue_lock internally, e.g. scsi_request_fn().
*/
unsigned int request_fn_active;
unsigned int rq_timeout;
struct timer_list timeout;
struct work_struct timeout_work;
struct list_head timeout_list;
struct list_head icq_list;
...
...
struct queue_limits limits; /* limits such as discard_granularity live here */
/*
* sg stuff
*/
unsigned int sg_timeout;
unsigned int sg_reserved_size;
int node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
struct blk_trace *blk_trace;
#endif
/*
* for flush operations
*/
struct blk_flush_queue *fq;
struct list_head requeue_list;
spinlock_t requeue_lock;
struct delayed_work requeue_work;
struct mutex sysfs_lock;
int bypass_depth;
atomic_t mq_freeze_depth;
...
...
struct rcu_head rcu_head;
wait_queue_head_t mq_freeze_wq;
struct percpu_ref q_usage_counter;
struct list_head all_q_node;
struct blk_mq_tag_set *tag_set;
struct list_head tag_set_list;
struct bio_set *bio_split;
bool mq_sysfs_init_done;
size_t cmd_size;
void *rq_alloc_data;
};
2.3 bio结构体
/*
 * struct bio - one block I/O unit, described by a list of bio_vec segments.
 * This is an excerpt: every "..." line stands for members left out here.
 */
struct bio {
struct bio *bi_next; /* request queue link */
struct block_device *bi_bdev;
int bi_error;
unsigned int bi_opf; /* bottom bits req flags,
* top bits REQ_OP. Use
* accessors.
*/
unsigned short bi_flags; /* status, command, etc */
unsigned short bi_ioprio;
struct bvec_iter bi_iter;
/* Number of segments in this BIO after
* physical address coalescing is performed.
*/
unsigned int bi_phys_segments;
/*
* To keep track of the max segment size, we account for the
* sizes of the first and last mergeable segments in this bio.
*/
unsigned int bi_seg_front_size;
unsigned int bi_seg_back_size;
atomic_t __bi_remaining;
bio_end_io_t *bi_end_io; /* NOTE(review): presumably the completion callback - confirm */
void *bi_private;
...
...
unsigned short bi_vcnt; /* how many bio_vec's */
/*
* Everything starting with bi_max_vecs will be preserved by bio_reset()
*/
unsigned short bi_max_vecs; /* max bvl_vecs we can hold */
atomic_t __bi_cnt; /* pin count */
struct bio_vec *bi_io_vec; /* the actual vec list */
struct bio_set *bi_pool;
/*
* We can inline a number of vecs at the end of the bio, to avoid
* double allocations for a small number of bio_vecs. This member
* MUST obviously be kept at the very end of the bio.
*/
struct bio_vec bi_inline_vecs[0];
};
2.4 bio_vec结构体
/*
 * struct bio_vec - one segment of a bio (element type of bio->bi_io_vec).
 */
struct bio_vec {
struct page *bv_page; /* page holding the segment's data */
unsigned int bv_len; /* NOTE(review): presumably segment length in bytes - confirm */
unsigned int bv_offset; /* NOTE(review): presumably byte offset within bv_page - confirm */
};
- block_device_operations结构体:类似于字符设备中的file_operations结构体
/*
 * struct block_device_operations - per-disk callback table, the block-device
 * counterpart of file_operations for character devices. It has open, release
 * and ioctl style hooks but no read/write members.
 */
struct block_device_operations {
int (*open) (struct block_device *, fmode_t);
void (*release) (struct gendisk *, fmode_t);
int (*rw_page)(struct block_device *, sector_t, struct page *, bool);
int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *,
long);
unsigned int (*check_events) (struct gendisk *disk,
unsigned int clearing);
/* ->media_changed() is DEPRECATED, use ->check_events() instead */
int (*media_changed) (struct gendisk *);
void (*unlock_native_capacity) (struct gendisk *);
int (*revalidate_disk) (struct gendisk *);
/* fills a struct hd_geometry describing the drive */
int (*getgeo)(struct block_device *, struct hd_geometry *);
/* this callback is with swap_lock and sometimes page table lock held */
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
struct module *owner;
const struct pr_ops *pr_ops;
};
其中int (*getgeo)(struct block_device *, struct hd_geometry *);用来获取驱动器的几何信息,hd_geometry结构体中包含磁头、扇区、柱面等信息。
驱动的注册/注销、加载/卸载函数
- 注册与注销
/* Register a block device major under the given name; returns an int status. */
int register_blkdev(unsigned int major, const char *name);
/* Unregister it again. Declared void in the 4.9 kernel headers: there is no status to check. */
void unregister_blkdev(unsigned int major, const char *name);
- 加载
1)注册块设备;
2)
分配gendisk、初始化(设置)gendisk;
设置gendisk容量;
初始化请求队列、设置队列;
3)添加gendisk。
NOTE: 2)项内容并无前后顺序。
- 卸载
1)释放gendisk(若有对gendisk的引用,也要释放);
2)移除请求队列;
3)注销块设备;
NOTE: 注销块设备与其余两步之间并无严格的前后顺序;但del_gendisk()应先于blk_cleanup_queue()调用(下例即如此)。
- 实例:以drivers/block/z2ram.c为例(省略返回值判断)
/*
 * Module init for the z2ram driver (excerpt; each "..." line stands for
 * omitted code, which includes the error checks and the "err" label).
 * Visible steps: register the major, allocate the gendisk, create the
 * request queue, fill in the gendisk, add it, then register the
 * device-number region.
 */
static int __init z2_init(void)
{
int ret;
...
if (register_blkdev(Z2RAM_MAJOR, DEVICE_NAME)) /* register the block device */
goto err;
ret = -ENOMEM;
z2ram_gendisk = alloc_disk(1); /* allocate the gendisk */
...
z2_queue = blk_init_queue(do_z2_request, &z2ram_lock); /* initialize the request queue */
...
/* fill in the gendisk before it is added */
z2ram_gendisk->major = Z2RAM_MAJOR;
z2ram_gendisk->first_minor = 0;
z2ram_gendisk->fops = &z2_fops;
sprintf(z2ram_gendisk->disk_name, "z2ram");
z2ram_gendisk->queue = z2_queue;
add_disk(z2ram_gendisk); /* add the gendisk */
blk_register_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT, THIS_MODULE,
z2_find, NULL, NULL);
return 0;
...
}
/*
 * Module exit for the z2ram driver (excerpt; each "..." line stands for
 * omitted code, which is where i and j are used).
 * Undoes z2_init: drops the region and the major, then tears down the
 * gendisk and finally the request queue.
 */
static void __exit z2_exit(void)
{
int i, j;
blk_unregister_region(MKDEV(Z2RAM_MAJOR, 0), Z2MINOR_COUNT);
unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME); /* unregister the block device */
del_gendisk(z2ram_gendisk); /* delete the gendisk */
put_disk(z2ram_gendisk); /* drop the reference to the gendisk */
blk_cleanup_queue(z2_queue); /* remove the request queue */
...
...
return;
}
块设备的open、release、ioctl等函数
该部分指的是前面介绍的block_device_operations结构体中各成员所指向的函数。
块设备的I/O请求处理
- 使用请求队列:对于机械的磁盘设备,有助于提高系统的性能。
- 不使用请求队列:对于存储卡、RAM盘等可真正随机访问的设备,请求队列的排序与合并逻辑无法带来收益,因此这类块设备可以不使用请求队列。
实例
- Ramdisk:利用内存(RAM)模拟磁盘,数据实际存储在内存中,以块设备的方式访问内存。
/*
 * RAM-backed block device driver (drivers/block/brd.c): a block device
 * that does not use a request queue.
 *
 * brd_alloc - allocate and set up RAM disk number i (excerpt; each "..."
 * line stands for omitted code, which includes the error handling).
 * Returns the new brd_device; the gendisk is prepared here but not added.
 */
static struct brd_device *brd_alloc(int i)
{
struct brd_device *brd;
struct gendisk *disk;
brd = kzalloc(sizeof(*brd), GFP_KERNEL);
...
...
brd->brd_queue = blk_alloc_queue(GFP_KERNEL); /* allocate the "request queue" */
...
blk_queue_make_request(brd->brd_queue, brd_make_request); /* bind the "make request" function */
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);
/* discard settings: page-sized granularity, discard flag set on the queue */
brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX);
brd->brd_queue->limits.discard_zeroes_data = 1;
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
...
disk = brd->brd_disk = alloc_disk(max_part);
...
disk->major = RAMDISK_MAJOR;
disk->first_minor = i * max_part; /* each disk gets its own block of max_part minors */
disk->fops = &brd_fops;
disk->private_data = brd;
disk->queue = brd->brd_queue;
disk->flags = GENHD_FL_EXT_DEVT;
sprintf(disk->disk_name, "ram%d", i);
set_capacity(disk, rd_size * 2); /* NOTE(review): presumably rd_size is in KiB and sectors are 512 bytes - confirm */
return brd;
...
}
/*
 * Module load (init) function for brd (excerpt; each "..." line stands
 * for omitted code). Registers the major, allocates rd_nr RAM disks onto
 * the brd_devices list, then adds every disk and registers the region.
 */
static int __init brd_init(void)
{
struct brd_device *brd, *next;
int i;
...
...
if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
return -EIO;
...
/* allocate all devices first; nothing is added yet */
for (i = 0; i < rd_nr; i++) {
brd = brd_alloc(i);
...
list_add_tail(&brd->brd_list, &brd_devices);
}
/* point of no return */
list_for_each_entry(brd, &brd_devices, brd_list)
add_disk(brd->brd_disk);
blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
THIS_MODULE, brd_probe, NULL, NULL);
pr_info("brd: module loaded\n");
return 0;
}
- IDE(Integrated Drive Electronics):集成驱动器电子设备,对应的正式标准名称为ATA接口,本意为将硬盘控制器与盘体集成在一起的硬盘驱动器。
NOTE:关于IDE的代码均在内核目录drivers/ide/下。
小结
- 块设备和字符设备的I/O操作区别
1)块设备只能以块为单位进行输入/输出;而字符设备则以字节为单位。
2)块设备对于I/O请求有缓冲区,可以选择以什么顺序进行响应;而字符设备则无需缓冲区,直接读写。
—块设备的I/O操作中贯穿“请求”,会排队和组合。
3)块设备可以随机访问(对于磁盘,组织顺序访问有助于提高访问效率);而字符设备只能顺序读写。
- 块设备驱动程序
1)驱动的任务—处理请求。I/O调度算法解决请求的排队和整合。
2)驱动的核心—请求处理函数、“制造请求”函数。
3)驱动虽包含block_device_operations结构体(类似于字符设备中的file_operations结构体),但不再包含读写一类的成员函数。仅包含打开、释放及I/O 控制等与具体读写无关的函数。