在字符设备发出读写请求的时候,实际上硬件IO就发生了;但是块设备不同,块设备利用一块系统内存作为缓冲区,当缓冲区能够满足用户进程的请求时,就直接返回请求的数据,如果不能满足,才调用请求函数来进行实际的IO操作。块设备是针对慢速设备设计的,可以防止消耗过多的CPU时间来等待。它与字符设备根本的区别在于是否能够被随机访问(即在访问设备的时候可以随意地从一个位置跳转到另一个位置);
简单来讲,块设备通过系统缓存进行读取,不直接和物理磁盘进行读取,他是将读写放入到队列中,优化之后再执行,而字符设备可以直接读取物理磁盘不经过系统缓存;
框架:
应用层:open read write
—————————————————— 文件的读写
文件系统:vfat ext2 ext3 yaffs2 (把文件的读写转换为扇区的读写)
———————ll_rw_block——————扇区的读写 ll_rw_block(1.把读写放入队列 2.调用队列的处理函数(优化/调整顺序/合并))
块设备驱动程序
——————————————————
硬件:硬盘、FLASH
ll_rw_block
/*
 * ll_rw_block - start block I/O on an array of buffer heads
 * @rw:  READ, WRITE or SWRITE (SWRITE = synchronous write: wait for the lock)
 * @nr:  number of entries in @bhs
 * @bhs: the buffers to read from / write to
 *
 * Kernel reference code (~2.6.x).  For each buffer that really needs I/O
 * (dirty for writes, not up to date for reads) a completion callback is
 * installed and the buffer is handed to submit_bh(); buffers that need
 * no I/O are simply unlocked and skipped.
 */
void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
{
	int i;

	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		if (rw == SWRITE)
			lock_buffer(bh);	/* synchronous: sleep until we own the lock */
		else if (test_set_buffer_locked(bh))
			continue;		/* already locked by someone else: skip it */

		if (rw == WRITE || rw == SWRITE) {
			if (test_clear_buffer_dirty(bh)) {	/* only write buffers that are dirty */
				bh->b_end_io = end_buffer_write_sync;
				get_bh(bh);	/* extra reference held until I/O completes */
				submit_bh(WRITE, bh);
				continue;
			}
		} else {
			if (!buffer_uptodate(bh)) {	/* only read buffers whose data is not yet valid */
				bh->b_end_io = end_buffer_read_sync;
				get_bh(bh);
				submit_bh(rw, bh);
				continue;
			}
		}
		unlock_buffer(bh);	/* no I/O needed: release the lock we took above */
	}
}
submit_bh函数
/*
 * submit_bh - build a bio from a buffer_head and submit it
 * @rw: READ or WRITE (upgraded to WRITE_BARRIER for ordered buffers)
 * @bh: locked, mapped buffer with a completion callback installed
 *
 * Wraps the single buffer in a one-segment struct bio and hands it to
 * submit_bio().  Returns 0, or -EOPNOTSUPP when the device rejected a
 * barrier request.
 */
int submit_bh(int rw, struct buffer_head * bh)
{
	struct bio *bio;	/* the bh is used to build a bio (block input/output) */
	int ret = 0;

	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(!bh->b_end_io);

	if (buffer_ordered(bh) && (rw == WRITE))
		rw = WRITE_BARRIER;

	/*
	 * Only clear out a write error when rewriting, should this
	 * include WRITE_SYNC as well?
	 */
	if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER))
		clear_buffer_write_io_error(bh);

	/*
	 * from here on down, it's all bio -- do the initial mapping,
	 * submit_bio -> generic_make_request may further map this bio around
	 */
	bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);	/* block number -> 512-byte sector */
	bio->bi_bdev = bh->b_bdev;
	bio->bi_io_vec[0].bv_page = bh->b_page;
	bio->bi_io_vec[0].bv_len = bh->b_size;
	bio->bi_io_vec[0].bv_offset = bh_offset(bh);

	bio->bi_vcnt = 1;	/* exactly one segment */
	bio->bi_idx = 0;
	bio->bi_size = bh->b_size;

	bio->bi_end_io = end_bio_bh_io_sync;
	bio->bi_private = bh;

	bio_get(bio);	/* keep the bio alive so its flags can be inspected below */
	submit_bio(rw, bio);

	if (bio_flagged(bio, BIO_EOPNOTSUPP))
		ret = -EOPNOTSUPP;

	bio_put(bio);
	return ret;
}
submit_bio函数:
/*
 * submit_bio - account for the I/O and pass the bio down to the block layer
 * @rw:  READ/WRITE flags, OR-ed into bio->bi_rw
 * @bio: fully initialised block I/O descriptor
 *
 * Updates the paging statistics, optionally logs the request when
 * block_dump is set, then calls generic_make_request().
 */
void submit_bio(int rw, struct bio *bio)
{
	int count = bio_sectors(bio);

	BIO_BUG_ON(!bio->bi_size);
	BIO_BUG_ON(!bio->bi_io_vec);
	bio->bi_rw |= rw;
	if (rw & WRITE) {
		count_vm_events(PGPGOUT, count);	/* pages paged out */
	} else {
		task_io_account_read(bio->bi_size);
		count_vm_events(PGPGIN, count);		/* pages paged in */
	}

	if (unlikely(block_dump)) {
		char b[BDEVNAME_SIZE];
		printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
			current->comm, current->pid,
			(rw & WRITE) ? "WRITE" : "READ",
			(unsigned long long)bio->bi_sector,
			bdevname(bio->bi_bdev,b));
	}

	generic_make_request(bio);	/* build a request from the bio and put it on the queue */
}
generic_make_request函数:
/*
 * generic_make_request - hand a bio to the block layer for queueing
 * @bio: the block I/O to submit (bio->bi_next must be NULL on entry)
 *
 * If a make_request_fn is already active on this task, the bio is only
 * appended to the per-task list and processed later by that active
 * invocation -- this turns recursion from stacking drivers into
 * iteration and keeps kernel stack usage bounded.
 *
 * Transcription fix: the original paste contained the mojibake
 * "¤t->bio_list" -- the HTML entity "&curren;" eaten out of
 * "&current->bio_list".  Restored below.
 */
void generic_make_request(struct bio *bio)
{
	if (current->bio_tail) {
		/* make_request is active */
		*(current->bio_tail) = bio;
		bio->bi_next = NULL;
		current->bio_tail = &bio->bi_next;
		return;
	}
	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to the next (which is NULL) and bio_tail
	 * to &bio_list, thus initialising the bio_list of new bios to be
	 * added. __generic_make_request may indeed add some more bios
	 * through a recursive call to generic_make_request. If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top. In this case we really did just take the bio
	 * of the top of the list (no pretending) and so fixup bio_list and
	 * bio_tail or bi_next, and call into __generic_make_request again.
	 *
	 * The loop was structured like this to make only one call to
	 * __generic_make_request (which is important as it is large and
	 * inlined) and to keep the structure simple.
	 */
	BUG_ON(bio->bi_next);
	do {
		current->bio_list = bio->bi_next;
		if (bio->bi_next == NULL)
			current->bio_tail = &current->bio_list;	/* fixed: source had garbled "¤t" */
		else
			bio->bi_next = NULL;
		__generic_make_request(bio);
		bio = current->bio_list;
	} while (bio);
	current->bio_tail = NULL; /* deactivate */
}
__generic_make_request函数
/*
 * __generic_make_request - validate, remap and queue one bio
 * @bio: block I/O to deliver to the driver's make_request_fn
 *
 * Checks the target sectors against the device/partition size, remaps
 * partition-relative sectors to absolute disk sectors, then calls
 * q->make_request_fn (by default __make_request) in a loop so stacking
 * drivers may redirect the bio to another device (non-zero return
 * means "resubmit to the new target").
 */
static inline void __generic_make_request(struct bio *bio)
{
	request_queue_t *q;
	sector_t maxsector;
	sector_t old_sector;
	int ret, nr_sectors = bio_sectors(bio);
	dev_t old_dev;

	might_sleep();

	/* Test device or partition size, when known. */
	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;	/* device size in 512-byte sectors */
	if (maxsector) {
		sector_t sector = bio->bi_sector;
		if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
			/*
			 * This may well happen - the kernel calls bread()
			 * without checking the size of the device, e.g., when
			 * mounting a device.
			 */
			handle_bad_sector(bio);
			goto end_io;
		}
	}

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
	 *
	 * NOTE: we don't repeat the blk_size check for each new device.
	 * Stacking drivers are expected to know what they are doing.
	 */
	old_sector = -1;
	old_dev = 0;
	do {
		char b[BDEVNAME_SIZE];

		q = bdev_get_queue(bio->bi_bdev);	/* fetch the device's request queue (request_queue_t) */
		if (!q) {
			printk(KERN_ERR
				"generic_make_request: Trying to access "
				"nonexistent block-device %s (%Lu)\n",
				bdevname(bio->bi_bdev, b),
				(long long) bio->bi_sector);
end_io:
			bio_endio(bio, bio->bi_size, -EIO);	/* complete the bio with an I/O error */
			break;
		}

		if (unlikely(bio_sectors(bio) > q->max_hw_sectors)) {
			printk("bio too big device %s (%u > %u)\n",
				bdevname(bio->bi_bdev, b),
				bio_sectors(bio),
				q->max_hw_sectors);
			goto end_io;
		}

		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
			goto end_io;

		if (should_fail_request(bio))	/* fault-injection hook */
			goto end_io;

		/*
		 * If this device has partitions, remap block n
		 * of partition p to block n+start(p) of the disk.
		 */
		blk_partition_remap(bio);

		if (old_sector != -1)
			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
					    old_sector);

		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);

		old_sector = bio->bi_sector;
		old_dev = bio->bi_bdev->bd_dev;

		/* Re-check the now-remapped sectors against the whole-disk size */
		maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
		if (maxsector) {
			sector_t sector = bio->bi_sector;
			if (maxsector < nr_sectors ||
			    maxsector - nr_sectors < sector) {
				/*
				 * This may well happen - partitions are not
				 * checked to make sure they are within the size
				 * of the whole device.
				 */
				handle_bad_sector(bio);
				goto end_io;
			}
		}

		ret = q->make_request_fn(q, bio);	/* call the queue's request-building function */
	} while (ret);
}
make_request_fn的默认请求函数是__make_request
/*
 * __make_request - default make_request_fn: merge the bio or queue it
 * @q:   the device's request queue
 * @bio: incoming block I/O
 *
 * First asks the elevator (I/O scheduler) whether the bio can be merged
 * into an already-queued request (front or back merge).  If not, a
 * fresh request is allocated, initialised from the bio and added to the
 * queue.  For synchronous bios the queue is unplugged immediately.
 */
static int __make_request(request_queue_t *q, struct bio *bio)
{
	struct request *req;
	int el_ret, nr_sectors, barrier, err;
	const unsigned short prio = bio_prio(bio);
	const int sync = bio_sync(bio);
	int rw_flags;

	nr_sectors = bio_sectors(bio);

	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	barrier = bio_barrier(bio);
	if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
		err = -EOPNOTSUPP;	/* device cannot honour ordered/barrier writes */
		goto end_io;
	}

	spin_lock_irq(q->queue_lock);

	if (unlikely(barrier) || elv_queue_empty(q))
		goto get_rq;	/* nothing to merge with: allocate a new request */

	el_ret = elv_merge(q, &req, bio);	/* elevator decides whether bio can merge into queue q */
	switch (el_ret) {
	case ELEVATOR_BACK_MERGE:	/* append the bio at the tail of request req */
		BUG_ON(!rq_mergeable(req));
		if (!ll_back_merge_fn(q, req, bio))
			break;
		blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
		req->biotail->bi_next = bio;
		req->biotail = bio;
		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
		req->ioprio = ioprio_best(req->ioprio, prio);
		drive_stat_acct(req, nr_sectors, 0);
		if (!attempt_back_merge(q, req))	/* the grown request may now merge with its neighbour */
			elv_merged_request(q, req, el_ret);
		goto out;
	case ELEVATOR_FRONT_MERGE:	/* prepend the bio at the head of request req */
		BUG_ON(!rq_mergeable(req));
		if (!ll_front_merge_fn(q, req, bio))
			break;
		blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
		bio->bi_next = req->bio;
		req->bio = bio;
		/*
		 * may not be valid. if the low level driver said
		 * it didn't need a bounce buffer then it better
		 * not touch req->buffer either...
		 */
		req->buffer = bio_data(bio);
		req->current_nr_sectors = bio_cur_sectors(bio);
		req->hard_cur_sectors = req->current_nr_sectors;
		req->sector = req->hard_sector = bio->bi_sector;
		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
		req->ioprio = ioprio_best(req->ioprio, prio);
		drive_stat_acct(req, nr_sectors, 0);
		if (!attempt_front_merge(q, req))
			elv_merged_request(q, req, el_ret);
		goto out;
	/* ELV_NO_MERGE: elevator says don't/can't merge. */
	default:
		;
	}

get_rq:
	/*
	 * This sync check and mask will be re-done in init_request_from_bio(),
	 * but we need to set it earlier to expose the sync flag to the
	 * rq allocator and io schedulers.
	 */
	rw_flags = bio_data_dir(bio);
	if (sync)
		rw_flags |= REQ_RW_SYNC;

	/*
	 * Grab a free request. This is might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	req = get_request_wait(q, rw_flags, bio);

	/*
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	init_request_from_bio(req, bio);	/* merge failed: build a fresh request from the bio */

	spin_lock_irq(q->queue_lock);
	if (elv_queue_empty(q))
		blk_plug_device(q);
	add_request(q, req);	/* could not merge: add the new request to the queue */
out:
	if (sync)
		__generic_unplug_device(q);	/* run the queue now; otherwise it runs on a later unplug */
	spin_unlock_irq(q->queue_lock);
	return 0;

end_io:
	bio_endio(bio, nr_sectors << 9, err);
	return 0;
}
电梯调度算法:目的是把相邻扇区的请求尽可能地合并到一次传输中,减少磁头的来回移动;
执行队列__generic_unplug_device函数:
/*
 * __generic_unplug_device - run the request queue
 * @q: the queue to unplug
 *
 * Removes the queue's "plug" and invokes the driver's request_fn so the
 * queued (merged/sorted) requests are finally executed.  Called with
 * the queue lock held (see the spin_lock_irq around the call in
 * __make_request above).
 */
void __generic_unplug_device(request_queue_t *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;		/* queue is stopped: do not run it */

	if (!blk_remove_plug(q))
		return;		/* queue was not plugged: nothing to do */

	q->request_fn(q);	/* call the queue's handler (e.g. do_xd_request) */
}
编写块设备驱动程序:
1.分配gendisk结构体:alloc_disk
2.设置结构体
2.1分配/设置队列:request_queue_t //提供读写能力
blk_init_queue
2.2设置gendisk其他信息 //提供其他属性:容量
3.注册结构体:add_disk
参考文件:
drivers/block/xd.c
drivers/block/z2ram.c
从驱动入口module_init(xd_init);
xd_init中驱动注册:
module_init(xd_init)的xd_init
/*
 * xd_init - module entry for the old XT hard-disk driver.  Reference
 * example of the block-driver pattern: register_blkdev ->
 * blk_init_queue -> alloc_disk / fill in gendisk -> add_disk,
 * with a goto-based unwind chain on failure.
 */
static int __init xd_init(void)
{
	u_char i,controller;
	unsigned int address;
	int err;

#ifdef MODULE
	{
		/* Shift the module parameters down one slot and record the count */
		u_char count = 0;
		for (i = 4; i > 0; i--)
			if (((xd[i] = xd[i-1]) >= 0) && !count)
				count = i;
		if ((xd[0] = count))
			do_xd_setup(xd);
	}
#endif

	init_timer (&xd_watchdog_int); xd_watchdog_int.function = xd_watchdog;

	if (!xd_dma_buffer)
		xd_dma_buffer = (char *)xd_dma_mem_alloc(xd_maxsectors * 0x200);
	if (!xd_dma_buffer) {
		printk(KERN_ERR "xd: Out of memory.\n");
		return -ENOMEM;
	}

	err = -EBUSY;
	if (register_blkdev(XT_DISK_MAJOR, "xd"))	/* register the block device driver (claim the major) */
		goto out1;

	err = -ENOMEM;
	xd_queue = blk_init_queue(do_xd_request, &xd_lock);	/* create the request queue; do_xd_request services it */
	if (!xd_queue)
		goto out1a;

	if (xd_detect(&controller,&address)) {
		printk("Detected a%s controller (type %d) at address %06x\n",
			xd_sigs[controller].name,controller,address);
		if (!request_region(xd_iobase,4,"xd")) {
			printk("xd: Ports at 0x%x are not available\n",
				xd_iobase);
			goto out2;
		}
		if (controller)
			xd_sigs[controller].init_controller(address);
		xd_drives = xd_initdrives(xd_sigs[controller].init_drive);
		printk("Detected %d hard drive%s (using IRQ%d & DMA%d)\n",
			xd_drives,xd_drives == 1 ? "" : "s",xd_irq,xd_dma);
	}

	err = -ENODEV;
	if (!xd_drives)
		goto out3;

	for (i = 0; i < xd_drives; i++) {
		XD_INFO *p = &xd_info[i];
		struct gendisk *disk = alloc_disk(64);	/* allocate a gendisk (64 minors per drive) */
		if (!disk)
			goto Enomem;
		p->unit = i;
		disk->major = XT_DISK_MAJOR;
		disk->first_minor = i<<6;	/* drive n owns minors n*64 .. n*64+63 */
		sprintf(disk->disk_name, "xd%c", i+'a');
		disk->fops = &xd_fops;
		disk->private_data = p;
		disk->queue = xd_queue;	/* every disk shares the single queue created above */
		set_capacity(disk, p->heads * p->cylinders * p->sectors);
		printk(" %s: CHS=%d/%d/%d\n", disk->disk_name,
			p->cylinders, p->heads, p->sectors);
		xd_gendisk[i] = disk;
	}

	err = -EBUSY;
	if (request_irq(xd_irq,xd_interrupt_handler, 0, "XT hard disk", NULL)) {
		printk("xd: unable to get IRQ%d\n",xd_irq);
		goto out4;
	}
	if (request_dma(xd_dma,"xd")) {
		printk("xd: unable to get DMA%d\n",xd_dma);
		goto out5;
	}

	/* xd_maxsectors depends on controller - so set after detection */
	blk_queue_max_sectors(xd_queue, xd_maxsectors);

	for (i = 0; i < xd_drives; i++)
		add_disk(xd_gendisk[i]);	/* register each disk with the block layer */

	return 0;

out5:	/* unwind in reverse order of acquisition */
	free_irq(xd_irq, NULL);
out4:
	for (i = 0; i < xd_drives; i++)
		put_disk(xd_gendisk[i]);
out3:
	release_region(xd_iobase,4);
out2:
	blk_cleanup_queue(xd_queue);
out1a:
	unregister_blkdev(XT_DISK_MAJOR, "xd");
out1:
	if (xd_dma_buffer)
		xd_dma_mem_free((unsigned long)xd_dma_buffer,
			xd_maxsectors * 0x200);
	return err;
Enomem:
	err = -ENOMEM;
	while (i--)	/* drop only the disks allocated so far */
		put_disk(xd_gendisk[i]);
	goto out3;
}
编写块驱动程序基本框架:
内存模拟块设备
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/mm.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/timer.h>
#include <linux/genhd.h>
#include <linux/hdreg.h>
#include <linux/ioport.h>
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/delay.h>
#include <linux/io.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/dma.h>
static struct gendisk *ramblock_disk;	/* the gendisk describing our virtual disk */
static request_queue_t *ramblock_queue;	/* request queue returned by blk_init_queue() */
static int major;			/* major number; dynamically allocated in ramblock_init() */
static DEFINE_SPINLOCK(ramblock_lock);	/* spinlock handed to blk_init_queue() to protect the queue */
#define RAMBLOCK_SIZE (1024*1024)	/* device capacity: 1 MiB */
static unsigned char *ramblock_buf;	/* the RAM buffer that stands in for the hardware */
/*
 * Report a fake CHS geometry so user-space tools (fdisk) can lay out a
 * partition table.  The invariant is:
 *     capacity = heads * cylinders * sectors * 512
 */
static int ramblock_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	geo->cylinders = 32;				/* cylinder count */
	geo->heads     = 2;				/* head count */
	geo->sectors   = RAMBLOCK_SIZE/2/32/512;	/* sectors per track, derived from capacity */
	return 0;
}
/* Block device operations: a RAM disk needs no open/release;
 * .getgeo is supplied only so that fdisk can partition the device. */
static struct block_device_operations ramblock_fops = {
	.owner = THIS_MODULE,
	.getgeo = ramblock_getgeo,	/* lets fdisk compute a partition table */
};
/*
 * Request-queue handler: transfer data between the request's buffer and
 * the RAM backing store.  The "hardware access" is just memcpy().
 * A transfer needs three things: source, destination and length.
 */
static void do_ramblock_request(request_queue_t *q)
{
	struct request *req;

	printk("do_ramblock_request\n");
	/* elv_next_request() pops requests in elevator (I/O scheduler) order */
	while ((req = elv_next_request(q)) != NULL) {
		unsigned long offset = req->sector * 512;		/* byte offset on the "disk" */
		unsigned long len = req->current_nr_sectors * 512;	/* bytes in this segment */

		/* Fix: reject requests beyond the device end -- without this
		 * check a stray request would overrun ramblock_buf. */
		if (offset + len > RAMBLOCK_SIZE) {
			printk(KERN_ERR "ramblock: request past end of device\n");
			end_request(req, 0);	/* 0 = fail */
			continue;
		}

		if (rq_data_dir(req) == READ)
			memcpy(req->buffer, ramblock_buf + offset, len);	/* "disk" -> request buffer */
		else
			memcpy(ramblock_buf + offset, req->buffer, len);	/* request buffer -> "disk" */

		end_request(req, 1);	/* 0 = fail, 1 = success */
	}
}
//入口函数
static int ramblock_init(void){
/*1.分配一个gendisk结构体*/
ramblock_disk = alloc_disk(16); //次设备号个数:分区个数+1,写16最多只能创建15个分区
/*2.设置*/
/*2.1 分配/设置一个队列 提供读写能力*/
ramblock_queue = blk_init_queue(do_ramblock_request,&ramblock_lock); //do_ramblock__request处理队列函数,ramblock_lock自旋锁
ramblock_disk->queue = ramblock_queue; //设置队列
/*2.2 设置其他属性,比如容量*/
major= register_blkdev(0,"ramblock"); //注册一个主设备号,与字符设备驱动相比少了fileoperation
ramblock_disk->major = major; //主设备号
ramblock_disk->first_minor = 0; //第一个次设备号
sprintf(ramblock_disk->disk_name, "ramblock"); //名字
ramblock_disk->fops = &ramblock_fops; //一个空的操作函数
set_capacity(ramblock_disk, RAMBLOCK_SIZE/512); //设置扇区容量,内核中认为扇区是512字节
/*3.硬件相关操作*/
ramblock_buf = kzalloc(RAMBLOCK_SIZE,GFP_KERNEL);
/*4.注册*/
add_disk(ramblock_disk);
return 0;
}
/*
 * Module exit: tear down in reverse order of ramblock_init().
 * Fix: remove the disk *first* so no new I/O can arrive, then release
 * the queue, the major number and finally the backing memory (the
 * original unregistered the major before deleting the gendisk).
 */
static void ramblock_exit(void)
{
	del_gendisk(ramblock_disk);		/* unregister the disk from the block layer */
	put_disk(ramblock_disk);		/* drop our gendisk reference */
	blk_cleanup_queue(ramblock_queue);	/* release the request queue */
	unregister_blkdev(major, "ramblock");	/* give back the major number */
	kfree(ramblock_buf);			/* free the backing RAM */
}
module_init(ramblock_init);	/* called on insmod */
module_exit(ramblock_exit);	/* called on rmmod */
MODULE_LICENSE("GPL");
修改Makefile,将驱动文件以模块的方式编译;
测试步骤:
1.insmod ramblock.ko
2.ls /dev/ramblock*
3.分区:fdisk /dev/ramblock
4.格式化:mkdosfs /dev/ramblock
5.挂接:mount /dev/ramblock /temp/ 挂接到temp目录中去
6.读写文件:cd /temp 在里面读写文件
7.umount /temp
8.cat /dev/ramblock > /mnt/ramblock.bin 将整个磁盘映像拷贝到/mnt/ramblock.bin
9.在pc上查看ramblock.bin:sudo mount -o loop ramblock.bin /mnt (-o loop 将普通文件当做块设备进行挂接)