块设备驱动
标签:操作系统
基础知识汇总
1.块设备:块设备是i/o设备中的一类,可以将信息存储在固定大小的块中(每个块有自己的地址空间),也可以在任意位置读取一定长度的数据。
2.块设备驱动程序:为支持以块的方式进行读写的设备服务的驱动程序。
3.简单hello world模块的编写、编译、加载和卸载
(1)moc.c
(2)Makefile
(3)make
(4)加载
(5)卸载
4.i/o调度器
noop - 通常用于内存存储的设备。
cfq - 完全公平调度器。进程平均使用IO带宽。
deadline - 针对延迟的调度器,每一个 I/O,都有一个最晚执行时间。
anticipatory - 启发式调度,类似 Deadline 算法,但是引入预测机制提高性能。
5.在加载模块时,根据模块的ELF信息确定这个模块所需的静态内存大小。
6.低端内存、高端内存和非线性映射区域
地址空间大于1G的内存区域称之为高端内存,小于1G的内存区域称之为低端内存。高端内存的管理需要进行非线性映射
简单的块驱动程序
主机配置
腾讯云主机:1 核 2 GB
Linux版本:CentOS Linux release 7.4.1708 (Core)
内核版本:3.10.0-693.el7.x86_64
simp_blkdev.c
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/genhd.h>
#include <linux/fs.h>
#include<linux/blkdev.h>
#define SIMP_BLKDEV_DEVICEMAJOR COMPAQ_SMART2_MAJOR // major number taken over from the COMPAQ_SMART2 macro
#define SIMP_BLKDEV_DISKNAME "simp_blkdev" // name of the block device
#define SIMP_BLKDEV_BYTES (16*1024*1024) // size of the block device in bytes (16 MiB)
static struct request_queue *simp_blkdev_queue; // request queue needed by the block device
static struct gendisk *simp_blkdev_disk; // gendisk of the device, set up in simp_blkdev_init()
static DEFINE_SPINLOCK(rq_lock); // NOTE(review): not referenced by the code in this listing (blk_init_queue() is passed NULL)
unsigned char simp_blkdev_data[SIMP_BLKDEV_BYTES]; // in-memory array holding the device contents
static void simp_blkdev_do_request(struct request_queue *q);
struct block_device_operations simp_blkdev_fops = // the gendisk structure needs its fops pointer set
{
.owner = THIS_MODULE,
};
/*
 * Request handler installed with blk_init_queue().
 *
 * Serves every pending request from the in-memory array simp_blkdev_data.
 * req->buffer and blk_rq_cur_bytes() describe only the current segment of a
 * request, so one segment is copied per pass and completed with
 * __blk_end_request_cur(); the next request is fetched only after the
 * current one has been completed entirely.
 *
 * The __blk_end_request_*() variants are used because they do not acquire
 * the queue lock themselves; the locking variants deadlocked the machine
 * when called from here.
 *
 * The completion status is an error code: 0 for success, -EIO for a
 * segment that lies outside the device.
 */
static void simp_blkdev_do_request(struct request_queue *q)
{
	struct request *req;

	req = blk_fetch_request(q);
	while (req) {
		unsigned long long start =
			(unsigned long long)blk_rq_pos(req) << 9;
		unsigned int len = blk_rq_cur_bytes(req);
		int err = 0;

		if (start + len > SIMP_BLKDEV_BYTES) {
			printk(KERN_ERR SIMP_BLKDEV_DISKNAME
				": bad request: block=%llu, count=%u\n",
				(unsigned long long)blk_rq_pos(req),
				blk_rq_cur_sectors(req));
			err = -EIO;
		} else if (rq_data_dir(req) == READ) {
			memcpy(req->buffer, simp_blkdev_data + start, len);
		} else {
			/* rq_data_dir() is a single bit, so this is WRITE */
			memcpy(simp_blkdev_data + start, req->buffer, len);
		}

		/* false means the whole request is finished: take the next one */
		if (!__blk_end_request_cur(req, err))
			req = blk_fetch_request(q);
	}
}
/*
 * Module load entry point.
 *
 * Creates the request queue served by simp_blkdev_do_request(), allocates a
 * gendisk with alloc_disk(1), fills in its name, major/minor numbers, fops,
 * queue and capacity, and registers it with add_disk().
 *
 * Returns 0 on success, or -ENOMEM when the queue or the gendisk cannot be
 * allocated (in the latter case the queue is cleaned up again).
 */
static int __init simp_blkdev_init(void)
{
	struct gendisk *disk;

	simp_blkdev_queue = blk_init_queue(simp_blkdev_do_request, NULL);
	if (simp_blkdev_queue == NULL)
		return -ENOMEM;

	disk = alloc_disk(1);
	simp_blkdev_disk = disk;
	if (disk == NULL) {
		blk_cleanup_queue(simp_blkdev_queue);
		return -ENOMEM;
	}

	strcpy(disk->disk_name, SIMP_BLKDEV_DISKNAME);
	disk->major = SIMP_BLKDEV_DEVICEMAJOR;
	disk->first_minor = 0;
	disk->fops = &simp_blkdev_fops;
	disk->queue = simp_blkdev_queue;
	set_capacity(disk, SIMP_BLKDEV_BYTES >> 9);
	add_disk(disk);

	/* report the successful load */
	printk("simp_blkdev success added.\n");
	return 0;
}
/*
 * Module unload entry point: calls del_gendisk(), put_disk() and
 * blk_cleanup_queue(), in that order, on the objects set up by
 * simp_blkdev_init(), then logs the removal.
 */
static void __exit simp_blkdev_exit(void) // unload the module
{
del_gendisk(simp_blkdev_disk);
put_disk(simp_blkdev_disk);
blk_cleanup_queue(simp_blkdev_queue);
printk("simp_blkdev success removed.\n"); // report that simp_blkdev was unloaded
}
// register the load and unload entry points
module_init(simp_blkdev_init);
module_exit(simp_blkdev_exit);
Makefile
# Out-of-tree kbuild makefile for the simp_blkdev module.
# The build is delegated to the build tree of the running kernel; M= tells
# kbuild where the external module lives (it replaces the older SUBDIRS=).
obj-m := simp_blkdev.o
KDIR := /lib/modules/$(shell uname -r)/build
PWD := $(shell pwd)

.PHONY: default clean

# Recipe lines must start with a TAB character.
default:
	$(MAKE) -C $(KDIR) M=$(PWD) modules

clean:
	$(MAKE) -C $(KDIR) M=$(PWD) clean
	rm -rf Module.markers modules.order Module.symvers
遇到的问题
(1)blkdev.h从2.6.31内核开始,一些api发生变化,主要有:
request->sector 变为 blk_rq_pos(request)
request->nr_sectors 变为 blk_rq_sectors(request)
end_request(request, uptodate) 变为 __blk_end_request_cur(request, error)(未持有队列锁时用 blk_end_request_cur);注意参数含义也变了:原来 1 表示成功,现在 0 表示成功,负的错误码(如 -EIO)表示失败
(2)加载simp_blkdev.ko后主机死机
解决方案:将blk_end_request_all(req, 1); 改为__blk_end_request_all(req, 1);。两者的区别在于前者会自己去获取队列锁,后者不会;而请求处理函数被调用时已经持有队列锁,所以用前者会导致系统死锁,并使系统崩溃(我在测试的时候就是这样)。在3.10内核中这两个函数都没有返回值,返回bool的是blk_end_request()/__blk_end_request()这一组。
加载成功截图
给块设备驱动程序选择一个简单的i/o调度器
修改static int __init simp_blkdev_init(void)
/*
 * Module load entry point (variant that selects the "noop" I/O scheduler).
 *
 * Creates the request queue, tries to replace its default elevator with
 * "noop", then allocates, fills in and registers the gendisk.
 *
 * Returns 0 on success, or -ENOMEM when the queue or the gendisk cannot be
 * allocated. A failed scheduler switch is not fatal: the default elevator
 * is put back and a warning is logged.
 */
static int __init simp_blkdev_init(void)
{
	int ret;
	struct elevator_queue *elv_old;

	/* request a request queue */
	simp_blkdev_queue = blk_init_queue(simp_blkdev_do_request, NULL);
	if (!simp_blkdev_queue) {
		ret = -ENOMEM;
		goto err_init_queue;
	}

	/*
	 * elevator_init() returns without doing anything while the queue
	 * still has an elevator, so the default one is detached first.
	 */
	elv_old = simp_blkdev_queue->elevator;
	simp_blkdev_queue->elevator = NULL;

	/* switch the I/O scheduler */
	if (IS_ERR_VALUE(elevator_init(simp_blkdev_queue, "noop")) ||
	    !simp_blkdev_queue->elevator) {
		/*
		 * Restore the default elevator; otherwise the queue would be
		 * left with a NULL elevator pointer although the message
		 * below says the default is in use.
		 */
		simp_blkdev_queue->elevator = elv_old;
		printk(KERN_WARNING "Switch elevator failed, using default\n");
	} else {
		elevator_exit(elv_old);
	}

	/* allocate a gendisk structure */
	simp_blkdev_disk = alloc_disk(1);
	if (!simp_blkdev_disk) {
		ret = -ENOMEM;
		goto err_alloc_disk;
	}

	/* fill in the main gendisk members */
	strcpy(simp_blkdev_disk->disk_name, SIMP_BLKDEV_DISKNAME);
	simp_blkdev_disk->major = SIMP_BLKDEV_DEVICEMAJOR;
	simp_blkdev_disk->first_minor = 0;
	simp_blkdev_disk->fops = &simp_blkdev_fops;
	simp_blkdev_disk->queue = simp_blkdev_queue;
	set_capacity(simp_blkdev_disk, SIMP_BLKDEV_BYTES>>9);
	add_disk(simp_blkdev_disk);

	/* report the successful load */
	printk("simp_blkdev success added.\n");
	return 0;

err_alloc_disk:
	blk_cleanup_queue(simp_blkdev_queue);
err_init_queue:
	return ret;
}
主要问题
使用原教程后insmod后死机
解决方法: 如果要用函数elevator_init来设置调度器,就必须先保存原来blk_init_queue()帮我们申请的默认调度器,然后把队列中指向调度器的elevator指针设置为NULL(必须设置为NULL,要不然函数elevator_init会不做任何事情直接返回),最后才是调用elevator_init(simp_blkdev_queue, "noop")来设置。
没有IO调度器的内存块设备驱动
源代码:
#include<linux/init.h>
#include<linux/module.h>
#include<linux/genhd.h>
#include<linux/fs.h>
#include<linux/blkdev.h>
#include<linux/bio.h>
#include<linux/version.h>
#define SIMP_BLKDEV_DISKNAME "simp_blkdev" // name copied into the gendisk's disk_name
#define SIMP_BLKDEV_DEVICEMAJOR COMPAQ_SMART2_MAJOR // major number reused from the COMPAQ_SMART2 macro
#define SIMP_BLKDEV_BYTES (8*1024*1024) // device size in bytes (8 MiB)
static DEFINE_SPINLOCK(rq_lock); // NOTE(review): not referenced by the code in this listing
unsigned char simp_blkdev_data[SIMP_BLKDEV_BYTES]; // in-memory array holding the device contents
static struct gendisk *simp_blkdev_disk;
static struct request_queue *simp_blkdev_queue;//device's request queue
// operations table attached to the gendisk; only the owner is set
struct block_device_operations simp_blkdev_fops = {
.owner = THIS_MODULE,
};
//handle bio
static int simp_blkdev_make_request(struct request_queue *q, struct bio *bio){
struct bio_vec *bvec;
int i;
void *dsk_mem;
if( (bio->bi_sector << 9) + bio->bi_size > SIMP_BLKDEV_BYTES){
printk(KERN_ERR SIMP_BLKDEV_DISKNAME ":bad request:block=%llu,count=%u\n",(unsigned long long)bio->bi_sector,bio->bi_size);
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 24)
bio_endio(bio, 0, -EIO);
#else
bio_endio(bio, -EIO);
#endif
return 0;
}
dsk_mem = simp_blkdev_data + (bio->bi_sector << 9);
bio_for_each_segment(bvec, bio, i){
void *iovec_mem;
switch( bio_rw(bio) ){
case READ:
case READA:
iovec_mem = kmap(bvec->bv_page) + bvec->bv_offset;
memcpy(iovec_mem, dsk_mem, bvec->bv_len);
kunmap(bvec->bv_page);
break;
case WRITE:
iovec_mem = kmap(bvec->bv_page) + bvec->bv_offset;
memcpy(dsk_mem, iovec_mem, bvec->bv_len);
kunmap(bvec->bv_page);
break;
default:
printk(KERN_ERR SIMP_BLKDEV_DISKNAME ": unknown value of bio_rw: %lu\n", bio_rw(bio));
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 24)
bio_endio(bio, 0, -EIO);
#else
bio_endio(bio, -EIO);
#endif
return 0;
}
dsk_mem += bvec->bv_len;
}
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 24)
bio_endio(bio, bio->bi_size, 0);
#else
bio_endio(bio, 0);
#endif
return 0;
}
/*
 * Module load entry point (make_request variant).
 *
 * Allocates a bare queue with blk_alloc_queue(), installs
 * simp_blkdev_make_request() on it, then allocates, fills in and registers
 * the gendisk.
 *
 * Returns 0 on success or -ENOMEM when the queue or the gendisk cannot be
 * allocated. The unwind labels run in reverse order of acquisition: a
 * failed queue allocation has nothing to clean up, a failed gendisk
 * allocation releases the queue.
 */
static int __init simp_blkdev_init(void){
	int ret;

	simp_blkdev_queue = blk_alloc_queue(GFP_KERNEL);
	if (!simp_blkdev_queue) {
		ret = -ENOMEM;
		goto error_alloc_queue;
	}
	blk_queue_make_request(simp_blkdev_queue, simp_blkdev_make_request);

	/* alloc the resource of gendisk */
	simp_blkdev_disk = alloc_disk(1);
	if (!simp_blkdev_disk) {
		ret = -ENOMEM;
		goto error_alloc_disk;
	}

	/* populate the gendisk structure */
	strcpy(simp_blkdev_disk->disk_name,SIMP_BLKDEV_DISKNAME);
	simp_blkdev_disk->major = SIMP_BLKDEV_DEVICEMAJOR;
	simp_blkdev_disk->first_minor = 0;
	simp_blkdev_disk->fops = &simp_blkdev_fops;
	simp_blkdev_disk->queue = simp_blkdev_queue;
	set_capacity(simp_blkdev_disk,SIMP_BLKDEV_BYTES>>9);
	add_disk(simp_blkdev_disk);
	printk("module simp_blkdev added.\n");
	return 0;

error_alloc_disk:
	/* the queue exists here, so it must be released */
	blk_cleanup_queue(simp_blkdev_queue);
error_alloc_queue:
	return ret;
}
static void simp_blkdev_exit(void){
del_gendisk(simp_blkdev_disk);
put_disk(simp_blkdev_disk);
blk_cleanup_queue(simp_blkdev_queue);
printk("module simp_blkdev romoved.\n");
}
module_init(simp_blkdev_init);
module_exit(simp_blkdev_exit);
遇到的问题
用simp_blkdev_make_request()函数替换掉simp_blkdev_do_request()函数后,驱动程序在sysfs目录中的queue子目录仍然存在,百度、谷歌均未找到解答
支持多个分区
// add at the top of the file
#define SIMP_BLKDEV_MAXPARTITIONS (64)
simp_blkdev_disk = alloc_disk(1);
// replace the call above with
simp_blkdev_disk = alloc_disk(SIMP_BLKDEV_MAXPARTITIONS);
多个磁道
//增加头文件
#include<linux/hdreg.h>
// set an initial value for the getgeo member, pointing it at the function that reports the device geometry
// NOTE(review): simp_blkdev_getgeo must be declared before this initializer for the listing to compile
struct block_device_operations simp_blkdev_fops = {
.owner = THIS_MODULE,
.getgeo = simp_blkdev_getgeo,
};
/*
 * getgeo handler: reports a made-up CHS geometry for the block device.
 *
 * heads and sectors are chosen from the device size according to the table
 * below; cylinders is the number of 512-byte sectors divided by
 * heads * sectors.
 *
 *   capacity    heads  sectors  cylinders
 *   0~16M           1        1  0~32768
 *   16M~512M        1       32  1024~32768
 *   512M~16G       32       32  1024~32768
 *   16G~...       255       63  2088~...
 *
 * bdev is not used. Always returns 0.
 */
static int simp_blkdev_getgeo(struct block_device *bdev,
		struct hd_geometry *geo)
{
	if (SIMP_BLKDEV_BYTES < 16 * 1024 * 1024) {
		geo->heads = 1;
		geo->sectors = 1;
	} else if (SIMP_BLKDEV_BYTES < 512 * 1024 * 1024) {
		geo->heads = 1;
		geo->sectors = 32;
	} else if (SIMP_BLKDEV_BYTES < 16ULL * 1024 * 1024 * 1024) {
		geo->heads = 32;
		geo->sectors = 32;
	} else {
		geo->heads = 255;
		geo->sectors = 63;
	}

	/*
	 * '/' binds tighter than '>>': without the parentheses the shift
	 * count would become 9 / heads / sectors instead of 9.
	 */
	geo->cylinders = (SIMP_BLKDEV_BYTES >> 9) / geo->heads / geo->sectors;
	return 0;
}
清除simp_blkdev_data
unsigned char simp_blkdev_data[SIMP_BLKDEV_BYTES]; // about to be removed
// a radix tree is introduced to replace the array above
static struct radix_tree_root simp_blkdev_data;
/*
 * Simplified make_request handler backed by the radix tree
 * simp_blkdev_data, which maps a page index to the page holding that part
 * of the device.
 *
 * Each bio segment is copied piecewise so that no single memcpy() crosses a
 * page boundary of the backing store. The bio is completed with 0, or with
 * -EIO when a backing page is missing from the tree. The function returns
 * nothing.
 */
static void simp_blkdev_make_request(struct request_queue *q, struct bio *bio){
	struct bio_vec *bvec;
	int i;
	int err = 0;
	unsigned long long dsk_offset;

	dsk_offset = (unsigned long long)bio->bi_sector * 512;

	bio_for_each_segment(bvec, bio, i) {
		unsigned int count_done, count_current;
		void *iovec_mem;
		void *dsk_mem;

		iovec_mem = kmap(bvec->bv_page) + bvec->bv_offset;
		count_done = 0;
		while (count_done < bvec->bv_len) {
			/* min_t: the two operands have different integer types */
			count_current = min_t(unsigned int,
				bvec->bv_len - count_done,
				PAGE_SIZE - (dsk_offset + count_done) % PAGE_SIZE);

			dsk_mem = radix_tree_lookup(&simp_blkdev_data,
				(dsk_offset + count_done) / PAGE_SIZE);
			if (!dsk_mem) {
				/* no backing page: fail instead of dereferencing NULL */
				err = -EIO;
				break;
			}
			dsk_mem += (dsk_offset + count_done) % PAGE_SIZE;

			switch (bio_rw(bio)) {
			case READ:
			case READA:
				memcpy(iovec_mem + count_done, dsk_mem, count_current);
				break;
			case WRITE:
				memcpy(dsk_mem, iovec_mem + count_done, count_current);
				break;
			default:
				/* other values are skipped without copying */
				break;
			}
			count_done += count_current;
		}
		/* always undo the kmap() above, also on the error path */
		kunmap(bvec->bv_page);
		if (err)
			break;
		dsk_offset += bvec->bv_len;
	}

	bio_endio(bio, err);
}
/*
 * Releases the backing memory of the device: for every page index covered
 * by SIMP_BLKDEV_BYTES the entry is looked up, removed from the radix tree
 * simp_blkdev_data and its page is freed. Indices without an entry are
 * harmless, so this also cleans up a partially populated tree.
 */
void free_diskmem(void)
{
	int i;
	void *p;

	for (i = 0; i < (SIMP_BLKDEV_BYTES + PAGE_SIZE - 1) >> PAGE_SHIFT;
			i++) {
		p = radix_tree_lookup(&simp_blkdev_data, i);
		radix_tree_delete(&simp_blkdev_data, i);
		/* free NULL is safe */
		free_page((unsigned long)p);
	}
}
//申请内存函数
int alloc_diskmem(void)
{
int ret;
int i;
void *p;
INIT_RADIX_TREE(&simp_blkdev_data, GFP_KERNEL);
for (i = 0; i < (SIMP_BLKDEV_BYTES + PAGE_SIZE - 1) >> PAGE_SHIFT;
i++) {
p = (void *)__get_free_page(GFP_KERNEL);
if (!p) {
ret = -ENOMEM;
goto err_alloc;
}
ret = radix_tree_insert(&simp_blkdev_data, i, p);
if (IS_ERR_VALUE(ret))
goto err_radix_tree_insert;
}
return 0;
err_radix_tree_insert:
free_page((unsigned long)p);
err_alloc:
free_diskmem();
return ret;
}
// changes to simp_blkdev_init(): allocate the backing memory and jump to the error label on failure
ret = alloc_diskmem();
if (IS_ERR_VALUE(ret))
goto err_alloc_diskmem;
// new error label: the gendisk reference is dropped here
err_alloc_diskmem:
put_disk(simp_blkdev_disk);
// changes to simp_blkdev_exit()
del_gendisk(simp_blkdev_disk);
用户与内核进程公用一个页表
// macro definitions for data segments made of several pages
#define SIMP_BLKDEV_DATASEGORDER (2) // allocation order: a segment is PAGE_SIZE << 2 bytes
#define SIMP_BLKDEV_DATASEGSHIFT (PAGE_SHIFT + SIMP_BLKDEV_DATASEGORDER) // shift that converts a byte offset into a segment index
#define SIMP_BLKDEV_DATASEGSIZE (PAGE_SIZE << SIMP_BLKDEV_DATASEGORDER) // segment size in bytes
#define SIMP_BLKDEV_DATASEGMASK (~(SIMP_BLKDEV_DATASEGSIZE-1)) // mask that clears the offset-within-segment bits
// for the modified free_diskmem() and alloc_diskmem() functions see Zhao Lei's tutorial
/* Modify the simplified simp_blkdev_make_request code:
 * it is enough to replace PAGE_SIZE, PAGE_MASK and PAGE_SHIFT in the
 * earlier code with the SIMP_BLKDEV_DATASEGSIZE,
 * SIMP_BLKDEV_DATASEGMASK and SIMP_BLKDEV_DATASEGSHIFT macros
 * defined just above. */
while (count_done < bvec->bv_len) {
// copy no further than the end of the current data segment
count_current = min(bvec->bv_len - count_done, SIMP_BLKDEV_DATASEGSIZE - (dsk_offset + count_done) % SIMP_BLKDEV_DATASEGSIZE);
// the radix tree is indexed by segment number
dsk_mem = radix_tree_lookup(&simp_blkdev_data, (dsk_offset + count_done) / SIMP_BLKDEV_DATASEGSIZE);
dsk_mem += (dsk_offset + count_done) % SIMP_BLKDEV_DATASEGSIZE;
switch (bio_rw(bio)) {
case READ:
case READA:
memcpy(iovec_mem + count_done, dsk_mem, count_current);
break;
case WRITE:
memcpy(dsk_mem, iovec_mem + count_done, count_current);
break;
}
count_done += count_current;
}
kunmap(bvec->bv_page);
dsk_offset += bvec->bv_len;
}
给模块加参数
// add the module parameter declaration (exposed under the name "size", default "16M")
static char *simp_blkdev_param_size = "16M";
module_param_named(size, simp_blkdev_param_size, charp, S_IRUGO);
// add a variable that stores the disk size chosen by the user
static unsigned long long simp_blkdev_bytes;
// remove this definition
#define SIMP_BLKDEV_BYTES (16*1024*1024)
// and replace every use of SIMP_BLKDEV_BYTES with the simp_blkdev_bytes variable
// define the getparam() function before simp_blkdev_init() and call it from there
static int __init simp_blkdev_init(void){
int ret;
// parse the size parameter first; on failure the load is aborted with getparam()'s return value
ret = getparam();
if (IS_ERR_VALUE(ret))
goto err_getparam;
// ... (omitted)
// ...
err_getparam:
return ret;
}
在驱动程序加载后块设备的数据值全为0
// named constants for converting between sectors and byte lengths
#define SIMP_BLKDEV_SECTORSHIFT (9)
#define SIMP_BLKDEV_SECTORSIZE (1ULL<<SIMP_BLKDEV_SECTORSHIFT)
#define SIMP_BLKDEV_SECTORMASK (~(SIMP_BLKDEV_SECTORSIZE-1))
// replace the literal 9 in simp_blkdev_make_request() and simp_blkdev_init() with SIMP_BLKDEV_SECTORSHIFT
// in getparam(), this statement (rounds the size up to a multiple of 512 bytes)
simp_blkdev_bytes = (simp_blkdev_bytes + (1<<9) - 1) & ~((1ULL<<9) - 1);
// is changed to
simp_blkdev_bytes = (simp_blkdev_bytes + SIMP_BLKDEV_SECTORSIZE - 1)
& SIMP_BLKDEV_SECTORMASK;
// in alloc_diskmem(), add the __GFP_ZERO flag to the allocation
p = (void *)__get_free_pages(GFP_KERNEL| __GFP_ZERO,
SIMP_BLKDEV_DATASEGORDER);
实现对高端内存的支持
由于本机器是64位linux内核
知识补充
1.32位Linux内核虚拟地址空间划分0~3G为用户空间,3~4G为内核空间(注意,内核可以使用的线性地址只有1G)。
2.Linux将内核地址空间划分为三部分ZONE_DMA、ZONE_NORMAL和ZONE_HIGHMEM,高端内存HIGH_MEM地址空间范围为 0xF8000000 ~ 0xFFFFFFFF(896MB~1024MB)。
3.当内核想访问高于896MB物理地址内存时,从0xF8000000 ~ 0xFFFFFFFF地址空间范围内找一段相应大小空闲的逻辑地址空间,借用一会。借用这段逻辑地址空间,建立映射到想访问的那段物理内存(即填充内核PTE页面表),临时用一会,用完后归还。这样别人也可以借用这段地址空间访问其他物理内存,实现了使用有限的地址空间,访问所有物理内存。
4.64位Linux内核不存在高端内存,因为64位内核可以支持超过512GB内存。若机器安装的物理内存超过内核地址空间范围,就会存在高端内存。
修改free_diskmem()函数改进性能
/*
 * Releases every data segment stored in the radix tree simp_blkdev_data.
 *
 * radix_tree_gang_lookup() returns information about several nodes in one
 * call, so the tree is emptied in batches of up to 64 entries. Each
 * entry's own index is read back from page->index: that member normally
 * holds the page-sized offset within a file when the page is used as page
 * cache, and since these pages are never used as page cache at the same
 * time it is borrowed here to remember the radix-tree index.
 */
void free_diskmem(void)
{
	struct page *batch[64];
	unsigned long long first = 0;
	int found;
	int k;

	for (;;) {
		found = radix_tree_gang_lookup(&simp_blkdev_data,
			(void **)batch, first, ARRAY_SIZE(batch));

		k = 0;
		while (k < found) {
			struct page *pg = batch[k];

			first = pg->index;
			radix_tree_delete(&simp_blkdev_data, first);
			__free_pages(pg, SIMP_BLKDEV_DATASEGORDER);
			k++;
		}

		/* continue after the last index handled in this batch */
		first++;

		/* a batch that was not full means the tree is exhausted */
		if (found != ARRAY_SIZE(batch))
			break;
	}
}
// in alloc_diskmem(), store the radix-tree index in page.index
page->index = i
内存的推迟分配
前面是在alloc_disk时候就将内存全部分配了,在本章中是在读写函数中处理,当开始读或写的时候,查找有没有找到page,如果没有才开始分配映射
// delete the alloc_diskmem() function
// delete its call in simp_blkdev_init()
// modify the radix-tree lookup of a memory segment in simp_blkdev_trans()
this_first_page = radix_tree_lookup(&simp_blkdev_data,
(dsk_offset + done_cnt) >> SIMP_BLKDEV_DATASEGSHIFT);
if (!this_first_page) {
// segment not allocated yet: a read (dir == 0) just yields zeroes
if (!dir) {
memset(buf + done_cnt, 0, this_cnt);
goto trans_done;
}
/* prepare new memory segment for write */
this_first_page = alloc_pages(
GFP_KERNEL | __GFP_ZERO | __GFP_HIGHMEM,
SIMP_BLKDEV_DATASEGORDER);
if (!this_first_page) {
printk(KERN_ERR SIMP_BLKDEV_DISKNAME
": allocate page failed\n");
return -ENOMEM;
}
// remember the segment index in page->index (read back by free_diskmem())
this_first_page->index = (dsk_offset + done_cnt)
>> SIMP_BLKDEV_DATASEGSHIFT;
if (IS_ERR_VALUE(radix_tree_insert(&simp_blkdev_data,
this_first_page->index, this_first_page))) {
printk(KERN_ERR SIMP_BLKDEV_DISKNAME
": insert page to radix_tree failed"
" seg=%lu\n", this_first_page->index);
// the segment never entered the tree, so it is freed here
__free_pages(this_first_page,
SIMP_BLKDEV_DATASEGORDER);
return -EIO;
}
}
加锁解决数据访问冲突
在simp_blkdev_trans()函数中,用simp_blkdev_datalock给对基树simp_blkdev_data的访问加锁;free_diskmem()函数中可以不进行加锁,因为内核在对块设备进行操作时,会锁住这个设备对应的模块。
simp_blkdev.c最终代码
#include<linux/init.h>
#include<linux/module.h>
#include<linux/genhd.h>
#include<linux/fs.h>
#include<linux/blkdev.h>
#include<linux/bio.h>
#include<linux/version.h>
#include<linux/hdreg.h>
#define SIMP_BLKDEV_DATASEGORDER (2)
#define SIMP_BLKDEV_DATASEGSHIFT (PAGE_SHIFT + SIMP_BLKDEV_DATASEGORDER)
#define SIMP_BLKDEV_DATASEGSIZE (PAGE_SIZE << SIMP_BLKDEV_DATASEGORDER)
#define SIMP_BLKDEV_DATASEGMASK (~(SIMP_BLKDEV_DATASEGSIZE-1))
#define SIMP_BLKDEV_DISKNAME "simp_blkdev"
#define SIMP_BLKDEV_DEVICEMAJOR COMPAQ_SMART2_MAJOR
#define SIMP_BLKDEV_MAXPARTITIONS (64)
// constants for converting between sectors and bytes
#define SIMP_BLKDEV_SECTORSHIFT (9)
#define SIMP_BLKDEV_SECTORSIZE (1ULL<<SIMP_BLKDEV_SECTORSHIFT)
#define SIMP_BLKDEV_SECTORMASK (~(SIMP_BLKDEV_SECTORSIZE-1))
static DEFINE_SPINLOCK(rq_lock); // NOTE(review): not referenced by the code in this listing
static struct radix_tree_root simp_blkdev_data; // radix tree introduced to replace the data array
DEFINE_MUTEX(simp_blkdev_datalock); // lock that protects the data against conflicting accesses
static struct gendisk *simp_blkdev_disk;
static struct request_queue *simp_blkdev_queue; //device's request queue
static unsigned long long simp_blkdev_bytes; // device size in bytes, computed by getparam()
// lets the size be specified when the module is loaded
static char *simp_blkdev_param_size = "16M";
module_param_named(size, simp_blkdev_param_size, charp, S_IRUGO);
static int simp_blkdev_getgeo(struct block_device *bdev,
struct hd_geometry *geo)
{
/* 选择适当的物理结构信息装入struct hd_geometry *geo结构
* * capacity heads sectors cylinders
* * 0~16M 1 1 0~32768
* * 16M~512M 1 32 1024~32768
* * 512M~16G 32 32 1024~32768
* * 16G~... 255 63 2088~...
* */
if (simp_blkdev_bytes < 16 * 1024 * 1024) {
geo->heads = 1;
geo->sectors = 1;
} else if (simp_blkdev_bytes < 512 * 1024 * 1024) {
geo->heads = 1;
geo->sectors = 32;
} else if (simp_blkdev_bytes < 16ULL * 1024 * 1024 * 1024) {
geo->heads = 32;
geo->sectors = 32;
} else {
geo->heads = 255;
geo->sectors = 63;
}
geo->cylinders = simp_blkdev_bytes>> SIMP_BLKDEV_SECTORSHIFT/geo->heads/geo->sectors;
return 0;
}
// operations table of the device; getgeo reports the device geometry
struct block_device_operations simp_blkdev_fops = {
.owner = THIS_MODULE,
.getgeo = simp_blkdev_getgeo,
};
/*
 * Copies len bytes between buf and one data segment, mapping the segment's
 * pages one at a time with kmap() so that high-memory pages can be used.
 *
 * start_page: first page of the segment
 * offset:     byte offset of the transfer inside the segment
 * buf:        caller's buffer
 * len:        number of bytes to transfer
 * dir:        0 copies segment -> buf, non-zero copies buf -> segment
 *
 * Returns 0 on success or -ENOMEM when kmap() yields NULL.
 */
static int simp_blkdev_trans_oneseg(struct page *start_page,
		unsigned long offset, void *buf, unsigned int len, int dir)
{
	unsigned int copied;

	for (copied = 0; copied < len; ) {
		/* locate the page and the offset within it */
		struct page *pg = start_page + ((offset + copied) >> PAGE_SHIFT);
		unsigned int in_page = (offset + copied) & ~PAGE_MASK;
		/* never copy across the end of the mapped page */
		unsigned int chunk = min(len - copied,
				(unsigned int)PAGE_SIZE - in_page);
		void *mapped = kmap(pg);

		if (mapped == NULL) {
			printk(KERN_ERR SIMP_BLKDEV_DISKNAME
				": map device page failed: %p\n", pg);
			return -ENOMEM;
		}
		mapped += in_page;

		if (dir)
			memcpy(mapped, buf + copied, chunk);
		else
			memcpy(buf + copied, mapped, chunk);

		kunmap(pg);
		copied += chunk;
	}
	return 0;
}
/*
 * Transfers len bytes between buf and the device contents starting at byte
 * offset dsk_offset, one data segment at a time.
 *
 * dir == 0 reads from the device into buf; any other value writes buf to
 * the device. Segments are allocated lazily: reading a segment that is not
 * in the radix tree fills buf with zeroes, writing to one allocates it
 * first. Every lookup/allocation/copy is done with simp_blkdev_datalock
 * held, and each exit path releases the lock before leaving.
 *
 * Returns 0 on success, -ENOMEM when a segment cannot be allocated, or
 * -EIO when the radix-tree insert or the per-segment copy fails.
 */
static int simp_blkdev_trans(unsigned long long dsk_offset, void *buf,
unsigned int len, int dir)
{
unsigned int done_cnt;
struct page *this_first_page;
unsigned int this_off;
unsigned int this_cnt;
done_cnt = 0;
while (done_cnt < len) {
/* iterate each data segment */
/* offset inside the segment, and how much of it this pass may copy */
this_off = (dsk_offset + done_cnt) & ~SIMP_BLKDEV_DATASEGMASK;
this_cnt = min(len - done_cnt,
(unsigned int)SIMP_BLKDEV_DATASEGSIZE - this_off);
mutex_lock(&simp_blkdev_datalock);
this_first_page = radix_tree_lookup(&simp_blkdev_data,
(dsk_offset + done_cnt) >> SIMP_BLKDEV_DATASEGSHIFT);
if (!this_first_page) {
/* segment not allocated: a read just returns zeroes */
if (!dir) {
memset(buf + done_cnt, 0, this_cnt);
goto trans_done;
}
/* prepare new memory segment for write */
this_first_page = alloc_pages(
GFP_KERNEL | __GFP_ZERO | __GFP_HIGHMEM,
SIMP_BLKDEV_DATASEGORDER);
if (!this_first_page) {
printk(KERN_ERR SIMP_BLKDEV_DISKNAME
": allocate page failed\n");
mutex_unlock(&simp_blkdev_datalock);
return -ENOMEM;
}
/* page->index remembers the segment index; free_diskmem() reads it back */
this_first_page->index = (dsk_offset + done_cnt)
>> SIMP_BLKDEV_DATASEGSHIFT;
if (IS_ERR_VALUE(radix_tree_insert(&simp_blkdev_data,
this_first_page->index, this_first_page))) {
printk(KERN_ERR SIMP_BLKDEV_DISKNAME
": insert page to radix_tree failed"
" seg=%lu\n", this_first_page->index);
/* the segment never entered the tree, so free it here */
__free_pages(this_first_page,
SIMP_BLKDEV_DATASEGORDER);
mutex_unlock(&simp_blkdev_datalock);
return -EIO;
}
}
if (IS_ERR_VALUE(simp_blkdev_trans_oneseg(this_first_page,
this_off, buf + done_cnt, this_cnt, dir))) {
mutex_unlock(&simp_blkdev_datalock);
return -EIO;
}
/* common exit of one pass: drop the lock and advance */
trans_done:
mutex_unlock(&simp_blkdev_datalock);
done_cnt += this_cnt;
}
return 0;
}
/*
 * make_request handler: services a bio through simp_blkdev_trans().
 *
 * The direction is derived from bio_rw() (READ/READA -> 0, WRITE -> 1).
 * A bio with another bio_rw() value, or one that extends past
 * simp_blkdev_bytes, is completed with -EIO, as is a bio whose transfer
 * fails. Otherwise each segment's page is mapped with kmap(), transferred,
 * and unmapped again, and the bio is completed with 0.
 *
 * The function itself always returns 0; the outcome is reported through
 * bio_endio().
 */
static int simp_blkdev_make_request(struct request_queue *q, struct bio *bio)
{
	int dir;
	int err;
	unsigned long long dsk_offset;
	struct bio_vec *bvec;
	int i;
	void *iovec_mem;

	switch (bio_rw(bio)) {
	case READ:
	case READA:
		dir = 0;
		break;
	case WRITE:
		dir = 1;
		break;
	default:
		printk(KERN_ERR SIMP_BLKDEV_DISKNAME
			": unknown value of bio_rw: %lu\n", bio_rw(bio));
		goto bio_err;
	}

	if (((unsigned long long)bio->bi_sector << SIMP_BLKDEV_SECTORSHIFT)
			+ bio->bi_size > simp_blkdev_bytes) {
		printk(KERN_ERR SIMP_BLKDEV_DISKNAME
			": bad request: block=%llu, count=%u\n",
			(unsigned long long)bio->bi_sector, bio->bi_size);
		goto bio_err;
	}

	dsk_offset = (unsigned long long)bio->bi_sector
			<< SIMP_BLKDEV_SECTORSHIFT;

	bio_for_each_segment(bvec, bio, i) {
		/* test the mapping itself, before the offset is added */
		iovec_mem = kmap(bvec->bv_page);
		if (!iovec_mem) {
			printk(KERN_ERR SIMP_BLKDEV_DISKNAME
				": map iovec page failed: %p\n", bvec->bv_page);
			goto bio_err;
		}

		err = simp_blkdev_trans(dsk_offset,
				iovec_mem + bvec->bv_offset,
				bvec->bv_len, dir);

		/* unmap on the failure path too, so no mapping is leaked */
		kunmap(bvec->bv_page);
		if (IS_ERR_VALUE(err))
			goto bio_err;

		dsk_offset += bvec->bv_len;
	}

#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 24)
	bio_endio(bio, bio->bi_size, 0);
#else
	bio_endio(bio, 0);
#endif
	return 0;

bio_err:
#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 24)
	bio_endio(bio, 0, -EIO);
#else
	bio_endio(bio, -EIO);
#endif
	return 0;
}
/*
 * Releases every data segment stored in the radix tree simp_blkdev_data.
 *
 * The tree is emptied in batches of up to 64 entries obtained with
 * radix_tree_gang_lookup(); each entry's radix-tree index is read from
 * page->index, where simp_blkdev_trans() stored it.
 */
void free_diskmem(void)
{
	struct page *batch[64];
	unsigned long long cursor = 0;
	int got;
	int n;

	for (;;) {
		got = radix_tree_gang_lookup(&simp_blkdev_data,
			(void **)batch, cursor, ARRAY_SIZE(batch));

		for (n = 0; n < got; n++) {
			struct page *seg = batch[n];

			cursor = seg->index;
			radix_tree_delete(&simp_blkdev_data, cursor);
			__free_pages(seg, SIMP_BLKDEV_DATASEGORDER);
		}

		/* resume after the last index handled in this batch */
		cursor++;

		/* a batch that was not full means nothing is left */
		if (got != ARRAY_SIZE(batch))
			break;
	}
}
/*
 * Parses the "size" module parameter into simp_blkdev_bytes.
 * Called from simp_blkdev_init() so that the device size can be chosen
 * when the module is loaded.
 *
 * The parameter must be a non-zero decimal number followed by exactly one
 * unit character: g/G, m/M, k/K or b/B. The result is rounded up to a
 * whole number of sectors.
 *
 * Returns 0 on success, or -EINVAL when the string is malformed, the
 * number is zero, the unit is unknown, or the size does not fit into
 * 64 bits after scaling and rounding.
 */
int getparam(void)
{
	char unit;
	char tailc;
	unsigned int shift;

	/* exactly two conversions: a third one means trailing garbage */
	if (sscanf(simp_blkdev_param_size, "%llu%c%c", &simp_blkdev_bytes,
			&unit, &tailc) != 2) {
		return -EINVAL;
	}
	if (!simp_blkdev_bytes)
		return -EINVAL;

	switch (unit) {
	case 'g':
	case 'G':
		shift = 30;
		break;
	case 'm':
	case 'M':
		shift = 20;
		break;
	case 'k':
	case 'K':
		shift = 10;
		break;
	case 'b':
	case 'B':
		shift = 0;
		break;
	default:
		return -EINVAL;
	}

	/* reject values whose high bits would be shifted out */
	if (shift && (simp_blkdev_bytes >> (64 - shift)))
		return -EINVAL;
	simp_blkdev_bytes <<= shift;

	/* the round-up below must not wrap around either */
	if (simp_blkdev_bytes > ~0ULL - (SIMP_BLKDEV_SECTORSIZE - 1))
		return -EINVAL;

	/* make simp_blkdev_bytes fits sector's size */
	simp_blkdev_bytes = (simp_blkdev_bytes + SIMP_BLKDEV_SECTORSIZE - 1)
			& SIMP_BLKDEV_SECTORMASK;
	return 0;
}
/*
 * Module load entry point of the final driver.
 *
 * Parses the size parameter, allocates a queue and installs
 * simp_blkdev_make_request() on it, allocates a gendisk with
 * SIMP_BLKDEV_MAXPARTITIONS, initialises the radix tree that will hold the
 * data, fills in the gendisk and registers it with add_disk().
 *
 * Returns 0 on success, getparam()'s error when the parameter is rejected,
 * or -ENOMEM when the queue or the gendisk cannot be allocated (the queue
 * is cleaned up again in the latter case).
 */
static int __init simp_blkdev_init(void){
	struct gendisk *disk;
	int err;

	err = getparam();
	if (IS_ERR_VALUE(err))
		return err;

	simp_blkdev_queue = blk_alloc_queue(GFP_KERNEL);
	if (simp_blkdev_queue == NULL)
		return -ENOMEM;
	blk_queue_make_request(simp_blkdev_queue, simp_blkdev_make_request);

	/* allocate the gendisk */
	disk = alloc_disk(SIMP_BLKDEV_MAXPARTITIONS);
	simp_blkdev_disk = disk;
	if (disk == NULL) {
		blk_cleanup_queue(simp_blkdev_queue);
		return -ENOMEM;
	}

	INIT_RADIX_TREE(&simp_blkdev_data, GFP_KERNEL);

	/* fill in the gendisk */
	strcpy(disk->disk_name, SIMP_BLKDEV_DISKNAME);
	disk->major = SIMP_BLKDEV_DEVICEMAJOR;
	disk->first_minor = 0;
	disk->fops = &simp_blkdev_fops;
	disk->queue = simp_blkdev_queue;
	set_capacity(disk, simp_blkdev_bytes >> SIMP_BLKDEV_SECTORSHIFT);
	add_disk(disk);
	return 0;
}
static void simp_blkdev_exit(void){
del_gendisk(simp_blkdev_disk);
free_diskmem();
put_disk(simp_blkdev_disk);
blk_cleanup_queue(simp_blkdev_queue);
printk("module simp_blkdev romoved.\n");
}
module_init(simp_blkdev_init); //load the modules
module_exit(simp_blkdev_exit); //unload the modules