IO hang on Linux: a method for simulating IO hang at the block device layer

Preface

An IO hang is fatal for a database or storage system, so the ability to simulate a reasonably realistic IO hang environment and test your system code against it matters a great deal. An IO hang can be simulated at many layers: on the simple end, using LD_PRELOAD to swap out user-space functions, or implementing a user-space file system with FUSE; somewhat deeper, hooking kernel system calls with systemtap; and there are various other kernel fault-injection tools. These approaches target different layers, but none of them reaches the bottom of the stack, which means none of them exercises the complete IO path. Usually, what we call an IO hang originates in the underlying physical device (the disk): for example, too many bad blocks driving GC latency through the roof, or an outright hardware failure that leaves the disk unable to serve reads and writes. So to get closest to the real scenario, we need to simulate the IO hang in the block device itself.
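To make the contrast concrete, here is roughly what the simplest of those options looks like: a hypothetical LD_PRELOAD shim (call it libhang.c) that delays every read() in user space. Note that it never enters the kernel IO path at all, which is exactly its limitation:

#define _GNU_SOURCE
#include <dlfcn.h>
#include <unistd.h>

/* hypothetical libhang.c: build with
 *   gcc -shared -fPIC -o libhang.so libhang.c -ldl
 * then run the victim program with LD_PRELOAD=./libhang.so */
ssize_t read(int fd, void *buf, size_t count)
{
	static ssize_t (*real_read)(int, void *, size_t);

	if (!real_read)
		real_read = (ssize_t (*)(int, void *, size_t))dlsym(RTLD_NEXT, "read");
	sleep(10);	/* pretend this IO hangs for 10 seconds */
	return real_read(fd, buf, count);
}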

So how do we emulate a block device? Simulating a disk is actually quite simple: just write a small block device driver (the sbull example in LDD3 is easy to follow). If you would rather not write one, the Linux kernel already ships one under drivers/block in the source tree — if you have ever used the ramdisk devices on Linux (usually /dev/ram0, /dev/ram1, and so on), this code will look familiar. Following the ramdisk approach not only gives you a complete block device; unlike a real disk, you also don't have to care about the device protocol (NVMe, AHCI, etc.), which is very convenient.

Here we simply reuse the ramdisk driver, with a few small modifications.

Block device driver

First, find the source matching your machine's kernel version, for example https://elixir.bootlin.com/linux/v3.10/source/drivers/block/brd.c . Declare a wait queue in the file with DECLARE_WAIT_QUEUE_HEAD(io_hang_q). Since we mainly want to test write/read hangs, add the following code to copy_to_brd (which copies data into the ramdisk) and copy_from_brd (which reads data back out of the ramdisk): if io_hang is non-zero, the task sleeps in TASK_UNINTERRUPTIBLE state for io_hang seconds (or until io_hang is set back to 0 and the wait queue is woken).

In copy_to_brd (the write path):

if (io_hang) {
	pr_info("Simulation of IO hang for write, will hang %d seconds\n", io_hang);
	wait_event_timeout(io_hang_q, io_hang == 0, msecs_to_jiffies(io_hang * 1000));
	pr_info("Simulation of IO hang for write finish\n");
}

In copy_from_brd (the read path):

if (io_hang) {
	pr_info("Simulation of IO hang for read, will hang %d seconds\n", io_hang);
	wait_event_timeout(io_hang_q, io_hang == 0, msecs_to_jiffies(io_hang * 1000));
	pr_info("Simulation of IO hang for read finish\n");
}

We also need a parameter to control the ramdisk's behavior (it must not hang initially, because we first have to make a file system on it, write the superblock, and so on), so we add a simple switch:

module_param(io_hang, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
MODULE_PARM_DESC(io_hang, "io hang switch");

Note the permission bits on the parameter: it must be writable from user space, because later we will change it through sysfs.
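One subtlety: wait_event_timeout only re-evaluates io_hang == 0 when the queue is woken, so writing 0 to the sysfs file will not end a sleep that is already in progress until its timeout expires. If you want the "set back to 0" path to take effect immediately, a minimal sketch (not part of the original change) is to register the parameter with a custom setter that wakes the queue:

/* Sketch: wake sleepers as soon as io_hang is cleared (assumes the
 * io_hang and io_hang_q declarations shown above). */
static int io_hang_set(const char *val, const struct kernel_param *kp)
{
	int ret = param_set_int(val, kp);	/* updates io_hang itself */

	if (!ret && io_hang == 0)
		wake_up_all(&io_hang_q);	/* force the wait condition re-check */
	return ret;
}

static const struct kernel_param_ops io_hang_ops = {
	.set = io_hang_set,
	.get = param_get_int,
};

/* this replaces the plain module_param() call above */
module_param_cb(io_hang, &io_hang_ops, &io_hang, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);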

With these changes done, write a simple Makefile to build the module (note: for clarity I renamed brd.c to ramdisk.c):

obj-m := ramdisk.o
SRC = /lib/modules/$(shell uname -r)/build

ko:
	make -C $(SRC) M=$(PWD) modules

clean:
	make -C $(SRC) M=$(PWD) clean

Then just run make to produce ramdisk.ko.

The resulting ramdisk.c looks like this:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/fs.h>
#include <linux/slab.h>
#ifdef CONFIG_BLK_DEV_RAM_DAX
#include <linux/pfn_t.h>
#endif
#include <linux/uaccess.h>

#define SECTOR_SHIFT		9
#define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
#define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)

DECLARE_WAIT_QUEUE_HEAD(io_hang_q);
static int io_hang = 0;

/*
 * Each block ramdisk device has a radix_tree brd_pages of pages that stores
 * the pages containing the block device's contents. A brd page's ->index is
 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
 * with, the kernel's pagecache or buffer cache (which sit above our block
 * device).
 */
struct brd_device {
	int			brd_number;
	struct request_queue	*brd_queue;
	struct gendisk		*brd_disk;
	struct list_head	brd_list;

	/*
	 * Backing store of pages and lock to protect it. This is the contents
	 * of the block device.
	 */
	spinlock_t		brd_lock;
	struct radix_tree_root	brd_pages;
};

/*
 * Look up and return a brd's page for a given sector.
 */
static DEFINE_MUTEX(brd_mutex);
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
	pgoff_t idx;
	struct page *page;

	/*
	 * The page lifetime is protected by the fact that we have opened the
	 * device node -- brd pages will never be deleted under us, so we
	 * don't need any further locking or refcounting.
	 *
	 * This is strictly true for the radix-tree nodes as well (ie. we
	 * don't actually need the rcu_read_lock()), however that is not a
	 * documented feature of the radix-tree API so it is better to be
	 * safe here (we don't have total exclusion from radix tree updates
	 * here, only deletes).
	 */
	rcu_read_lock();
	idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
	page = radix_tree_lookup(&brd->brd_pages, idx);
	rcu_read_unlock();

	BUG_ON(page && page->index != idx);

	return page;
}

/*
 * Look up and return a brd's page for a given sector.
 * If one does not exist, allocate an empty page, and insert that. Then
 * return it.
 */
static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
{
	pgoff_t idx;
	struct page *page;
	gfp_t gfp_flags;

	page = brd_lookup_page(brd, sector);
	if (page)
		return page;

	/*
	 * Must use NOIO because we don't want to recurse back into the
	 * block or filesystem layers from page reclaim.
	 *
	 * Cannot support DAX and highmem, because our ->direct_access
	 * routine for DAX must return memory that is always addressable.
	 * If DAX was reworked to use pfns and kmap throughout, this
	 * restriction might be able to be lifted.
	 */
	gfp_flags = GFP_NOIO | __GFP_ZERO;
#ifndef CONFIG_BLK_DEV_RAM_DAX
	gfp_flags |= __GFP_HIGHMEM;
#endif
	page = alloc_page(gfp_flags);
	if (!page)
		return NULL;

	if (radix_tree_preload(GFP_NOIO)) {
		__free_page(page);
		return NULL;
	}

	spin_lock(&brd->brd_lock);
	idx = sector >> PAGE_SECTORS_SHIFT;
	page->index = idx;
	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
		__free_page(page);
		page = radix_tree_lookup(&brd->brd_pages, idx);
		BUG_ON(!page);
		BUG_ON(page->index != idx);
	}
	spin_unlock(&brd->brd_lock);

	radix_tree_preload_end();

	return page;
}

static void brd_free_page(struct brd_device *brd, sector_t sector)
{
	struct page *page;
	pgoff_t idx;

	spin_lock(&brd->brd_lock);
	idx = sector >> PAGE_SECTORS_SHIFT;
	page = radix_tree_delete(&brd->brd_pages, idx);
	spin_unlock(&brd->brd_lock);
	if (page)
		__free_page(page);
}

static void brd_zero_page(struct brd_device *brd, sector_t sector)
{
	struct page *page;

	page = brd_lookup_page(brd, sector);
	if (page)
		clear_highpage(page);
}

/*
 * Free all backing store pages and radix tree. This must only be called when
 * there are no other users of the device.
 */
#define FREE_BATCH 16
static void brd_free_pages(struct brd_device *brd)
{
	unsigned long pos = 0;
	struct page *pages[FREE_BATCH];
	int nr_pages;

	do {
		int i;

		nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
				(void **)pages, pos, FREE_BATCH);

		for (i = 0; i < nr_pages; i++) {
			void *ret;

			BUG_ON(pages[i]->index < pos);
			pos = pages[i]->index;
			ret = radix_tree_delete(&brd->brd_pages, pos);
			BUG_ON(!ret || ret != pages[i]);
			__free_page(pages[i]);
		}

		pos++;

		/*
		 * This assumes radix_tree_gang_lookup always returns as
		 * many pages as possible. If the radix-tree code changes,
		 * so will this have to.
		 */
	} while (nr_pages == FREE_BATCH);
}

/*
 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
 */
static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
{
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	copy = min_t(size_t, n, PAGE_SIZE - offset);
	if (!brd_insert_page(brd, sector))
		return -ENOSPC;
	if (copy < n) {
		sector += copy >> SECTOR_SHIFT;
		if (!brd_insert_page(brd, sector))
			return -ENOSPC;
	}
	return 0;
}

static void discard_from_brd(struct brd_device *brd,
			sector_t sector, size_t n)
{
	while (n >= PAGE_SIZE) {
		/*
		 * Don't want to actually discard pages here because
		 * re-allocating the pages can result in writeback
		 * deadlocks under heavy load.
		 */
		if (0)
			brd_free_page(brd, sector);
		else
			brd_zero_page(brd, sector);
		sector += PAGE_SIZE >> SECTOR_SHIFT;
		n -= PAGE_SIZE;
	}
}

/*
 * Copy n bytes from src to the brd starting at sector. Does not sleep.
 */
static void copy_to_brd(struct brd_device *brd, const void *src,
			sector_t sector, size_t n)
{
	struct page *page;
	void *dst;
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	/* Simulation of IO hang */
	if (io_hang) {
		pr_info("Simulation of IO hang for write, will hang %d seconds\n", io_hang);
		wait_event_timeout(io_hang_q, io_hang == 0, msecs_to_jiffies(io_hang * 1000));
		pr_info("Simulation of IO hang for write finish\n");
	}

	copy = min_t(size_t, n, PAGE_SIZE - offset);
	page = brd_lookup_page(brd, sector);
	BUG_ON(!page);

	dst = kmap_atomic(page);
	memcpy(dst + offset, src, copy);
	kunmap_atomic(dst);

	if (copy < n) {
		src += copy;
		sector += copy >> SECTOR_SHIFT;
		copy = n - copy;
		page = brd_lookup_page(brd, sector);
		BUG_ON(!page);

		dst = kmap_atomic(page);
		memcpy(dst, src, copy);
		kunmap_atomic(dst);
	}
}

/*
 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
 */
static void copy_from_brd(void *dst, struct brd_device *brd,
			sector_t sector, size_t n)
{
	struct page *page;
	void *src;
	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
	size_t copy;

	/* Simulation of IO hang */
	if (io_hang) {
		pr_info("Simulation of IO hang for read, will hang %d seconds\n", io_hang);
		wait_event_timeout(io_hang_q, io_hang == 0, msecs_to_jiffies(io_hang * 1000));
		pr_info("Simulation of IO hang for read finish\n");
	}

	copy = min_t(size_t, n, PAGE_SIZE - offset);
	page = brd_lookup_page(brd, sector);
	if (page) {
		src = kmap_atomic(page);
		memcpy(dst, src + offset, copy);
		kunmap_atomic(src);
	} else
		memset(dst, 0, copy);

	if (copy < n) {
		dst += copy;
		sector += copy >> SECTOR_SHIFT;
		copy = n - copy;
		page = brd_lookup_page(brd, sector);
		if (page) {
			src = kmap_atomic(page);
			memcpy(dst, src, copy);
			kunmap_atomic(src);
		} else
			memset(dst, 0, copy);
	}
}

/*
 * Process a single bvec of a bio.
 */
static int brd_do_bvec(struct brd_device *brd, struct page *page,
			unsigned int len, unsigned int off, bool is_write,
			sector_t sector)
{
	void *mem;
	int err = 0;

	if (is_write) {
		err = copy_to_brd_setup(brd, sector, len);
		if (err)
			goto out;
	}

	mem = kmap_atomic(page);
	if (!is_write) {
		copy_from_brd(mem + off, brd, sector, len);
		flush_dcache_page(page);
	} else {
		flush_dcache_page(page);
		copy_to_brd(brd, mem + off, sector, len);
	}
	kunmap_atomic(mem);

out:
	return err;
}

static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio)
{
	struct block_device *bdev = bio->bi_bdev;
	struct brd_device *brd = bdev->bd_disk->private_data;
	struct bio_vec bvec;
	sector_t sector;
	struct bvec_iter iter;

	sector = bio->bi_iter.bi_sector;
	if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
		goto io_error;

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
		if (sector & ((PAGE_SIZE >> SECTOR_SHIFT) - 1) ||
		    bio->bi_iter.bi_size & ~PAGE_MASK)
			goto io_error;
		discard_from_brd(brd, sector, bio->bi_iter.bi_size);
		goto out;
	}

	bio_for_each_segment(bvec, bio, iter) {
		unsigned int len = bvec.bv_len;
		int err;

		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
					op_is_write(bio_op(bio)), sector);
		if (err)
			goto io_error;
		sector += len >> SECTOR_SHIFT;
	}

out:
	bio_endio(bio);
	return BLK_QC_T_NONE;
io_error:
	bio_io_error(bio);
	return BLK_QC_T_NONE;
}

static int brd_rw_page(struct block_device *bdev, sector_t sector,
		       struct page *page, bool is_write)
{
	struct brd_device *brd = bdev->bd_disk->private_data;
	int err = brd_do_bvec(brd, page, PAGE_SIZE, 0, is_write, sector);
	page_endio(page, is_write, err);
	return err;
}

#ifdef CONFIG_BLK_DEV_RAM_DAX
static long brd_direct_access(struct block_device *bdev, sector_t sector,
			void **kaddr, pfn_t *pfn, long size)
{
	struct brd_device *brd = bdev->bd_disk->private_data;
	struct page *page;

	if (!brd)
		return -ENODEV;
	page = brd_insert_page(brd, sector);
	if (!page)
		return -ENOSPC;
	*kaddr = page_address(page);
	*pfn = page_to_pfn_t(page);

	return PAGE_SIZE;
}
#else
#define brd_direct_access NULL
#endif

static int brd_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	int error;
	struct brd_device *brd = bdev->bd_disk->private_data;

	if (cmd != BLKFLSBUF)
		return -ENOTTY;

	/*
	 * ram device BLKFLSBUF has special semantics, we want to actually
	 * release and destroy the ramdisk data.
	 */
	mutex_lock(&brd_mutex);
	mutex_lock(&bdev->bd_mutex);
	error = -EBUSY;
	if (bdev->bd_openers <= 1) {
		/*
		 * Kill the cache first, so it isn't written back to the
		 * device.
		 *
		 * Another thread might instantiate more buffercache here,
		 * but there is not much we can do to close that race.
		 */
		kill_bdev(bdev);
		brd_free_pages(brd);
		error = 0;
	}
	mutex_unlock(&bdev->bd_mutex);
	mutex_unlock(&brd_mutex);

	return error;
}

static const struct block_device_operations brd_fops = {
	.owner =		THIS_MODULE,
	.rw_page =		brd_rw_page,
	.ioctl =		brd_ioctl,
	.direct_access =	brd_direct_access,
};

/*
 * And now the modules code and kernel interface.
 */
static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
module_param(rd_nr, int, S_IRUGO);
MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");

int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
module_param(rd_size, int, S_IRUGO);
MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");

static int max_part = 1;
module_param(max_part, int, S_IRUGO);
MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");

module_param(io_hang, int, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
MODULE_PARM_DESC(io_hang, "io hang switch");

MODULE_LICENSE("GPL");
MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
MODULE_ALIAS("rd");

#ifndef MODULE
/* Legacy boot options - nonmodular */
static int __init ramdisk_size(char *str)
{
	rd_size = simple_strtol(str, NULL, 0);
	return 1;
}
__setup("ramdisk_size=", ramdisk_size);
#endif

/*
 * The device scheme is derived from loop.c. Keep them in synch where possible
 * (should share code eventually).
 */
static LIST_HEAD(brd_devices);
static DEFINE_MUTEX(brd_devices_mutex);

static struct brd_device *brd_alloc(int i)
{
	struct brd_device *brd;
	struct gendisk *disk;

	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
	if (!brd)
		goto out;
	brd->brd_number		= i;
	spin_lock_init(&brd->brd_lock);
	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);

	brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
	if (!brd->brd_queue)
		goto out_free_dev;

	blk_queue_make_request(brd->brd_queue, brd_make_request);
	blk_queue_max_hw_sectors(brd->brd_queue, 1024);
	blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);

	/* This is so fdisk will align partitions on 4k, because of
	 * direct_access API needing 4k alignment, returning a PFN
	 * (This is only a problem on very small devices <= 4M,
	 *  otherwise fdisk will align on 1M. Regardless this call
	 *  is harmless)
	 */
	blk_queue_physical_block_size(brd->brd_queue, PAGE_SIZE);

	brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
	blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX);
	brd->brd_queue->limits.discard_zeroes_data = 1;
	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
#ifdef CONFIG_BLK_DEV_RAM_DAX
	queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
#endif
	disk = brd->brd_disk = alloc_disk(max_part);
	if (!disk)
		goto out_free_queue;
	disk->major		= RAMDISK_MAJOR;
	disk->first_minor	= i * max_part;
	disk->fops		= &brd_fops;
	disk->private_data	= brd;
	disk->queue		= brd->brd_queue;
	disk->flags		= GENHD_FL_EXT_DEVT;
	sprintf(disk->disk_name, "ramdisk%d", i);
	set_capacity(disk, rd_size * 2);

	return brd;

out_free_queue:
	blk_cleanup_queue(brd->brd_queue);
out_free_dev:
	kfree(brd);
out:
	return NULL;
}

static void brd_free(struct brd_device *brd)
{
	put_disk(brd->brd_disk);
	blk_cleanup_queue(brd->brd_queue);
	brd_free_pages(brd);
	kfree(brd);
}

static struct brd_device *brd_init_one(int i, bool *new)
{
	struct brd_device *brd;

	*new = false;
	list_for_each_entry(brd, &brd_devices, brd_list) {
		if (brd->brd_number == i)
			goto out;
	}

	brd = brd_alloc(i);
	if (brd) {
		add_disk(brd->brd_disk);
		list_add_tail(&brd->brd_list, &brd_devices);
	}
	*new = true;
out:
	return brd;
}

static void brd_del_one(struct brd_device *brd)
{
	list_del(&brd->brd_list);
	del_gendisk(brd->brd_disk);
	brd_free(brd);
}

static struct kobject *brd_probe(dev_t dev, int *part, void *data)
{
	struct brd_device *brd;
	struct kobject *kobj;
	bool new;

	mutex_lock(&brd_devices_mutex);
	brd = brd_init_one(MINOR(dev) / max_part, &new);
	kobj = brd ? get_disk(brd->brd_disk) : NULL;
	mutex_unlock(&brd_devices_mutex);

	if (new)
		*part = 0;

	return kobj;
}

static int __init brd_init(void)
{
	struct brd_device *brd, *next;
	int i;

	/*
	 * brd module now has a feature to instantiate underlying device
	 * structure on-demand, provided that there is an access dev node.
	 *
	 * (1) if rd_nr is specified, create that many upfront. else
	 *     it defaults to CONFIG_BLK_DEV_RAM_COUNT
	 * (2) User can further extend brd devices by create dev node themselves
	 *     and have kernel automatically instantiate actual device
	 *     on-demand. Example:
	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
	 *		fdisk -l /path/devnod_name
	 *	If (X / max_part) was not already created it will be created
	 *	dynamically.
	 */

	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
		return -EIO;

	if (unlikely(!max_part))
		max_part = 1;

	for (i = 0; i < rd_nr; i++) {
		brd = brd_alloc(i);
		if (!brd)
			goto out_free;
		list_add_tail(&brd->brd_list, &brd_devices);
	}

	/* point of no return */

	list_for_each_entry(brd, &brd_devices, brd_list)
		add_disk(brd->brd_disk);

	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS,
				  THIS_MODULE, brd_probe, NULL, NULL);

	pr_info("brd: module loaded\n");
	return 0;

out_free:
	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
		list_del(&brd->brd_list);
		brd_free(brd);
	}
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

	pr_info("brd: module NOT loaded !!!\n");
	return -ENOMEM;
}

static void __exit brd_exit(void)
{
	struct brd_device *brd, *next;

	list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
		brd_del_one(brd);

	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), 1UL << MINORBITS);
	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");

	pr_info("brd: module unloaded\n");
}

module_init(brd_init);
module_exit(brd_exit);

Format and mount

We now have a block device driver built as ramdisk.ko, so we can load it:

sudo insmod ramdisk.ko


You can see that a number of ramdisk devices now appear under /dev. They are all block devices: you can treat each one like a real disk, except that they are memory-backed.
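The count and size of these devices come from the rd_nr and rd_size module parameters declared in the source (rd_size is in kbytes), so you can override them at load time, for example:

sudo insmod ramdisk.ko rd_nr=1 rd_size=1048576   # one device of 1 GiB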

I use ramdisk0 for the tests here. First, format it:

mkfs -t ext4 /dev/ramdisk0

Then mount it on a test directory of your choice, e.g. io_hang_test:

sudo mount -t ext4 /dev/ramdisk0 /mnt/io_hang_test

After mounting, run the mount command to verify the result (see the last line of its output).


Control

Good — the /mnt/io_hang_test directory is now backed by our ramdisk0: every IO issued under /mnt/io_hang_test goes through ext4 and is turned into reads and writes against ramdisk0. You can try creating a file there; just remember the data lives in memory and is lost on reboot.
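For example, a quick smoke test (any file name works):

echo hello | sudo tee /mnt/io_hang_test/hello.txt
cat /mnt/io_hang_test/hello.txt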

If everything so far works, we can start simulating the IO hang. Remember the io_hang module parameter? It is still 0, so no IO hangs yet. Now change it:

sudo bash -c "echo 10 > /sys/module/ramdisk/parameters/io_hang"

This sets io_hang to 10, so from the next IO onward the ramdisk driver will hang each read/write for (roughly) 10 seconds. To turn the hang off again, run:

sudo bash -c "echo 0 > /sys/module/ramdisk/parameters/io_hang"
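As a quick check that the switch works, you can time a single direct read against the raw device; with io_hang set to 10 it should take roughly ten seconds:

time sudo dd if=/dev/ramdisk0 of=/dev/null bs=4096 count=1 iflag=direct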

Test

To exercise both read and write I wrote a small test program. To avoid page-cache effects and to make the IO synchronous, the file is opened with O_DIRECT and O_SYNC.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>

#define BUF_SIZE 1024

int main(){
	int fd, size, len;
	int ret = 0;
	char *write_buf, *read_buf;

	ret = posix_memalign((void **)&write_buf, 512, BUF_SIZE);
	if (ret) {
		perror("posix_memalign:");
		exit(1);
	}
	ret = posix_memalign((void **)&read_buf, 512, BUF_SIZE);
	if (ret) {
		perror("posix_memalign:");
		exit(1);
	}

	strcpy(write_buf, "direct osync mode");
	len = strlen("direct osync mode");

	if ((fd = open("hello.c", O_CREAT | O_TRUNC | O_RDWR | O_DIRECT | O_SYNC, 0666)) < 0) {
		perror("open:");
		exit(1);
	}
	if ((size = write(fd, write_buf, BUF_SIZE)) < 0) {
		perror("write:");
		exit(1);
	}
	lseek(fd, 0, SEEK_SET);
	if ((size = read(fd, read_buf, BUF_SIZE)) < 0) {
		perror("read:");
		exit(1);
	}
	if (strncmp(read_buf, write_buf, len) != 0) {
		perror("strncmp:");
		exit(1);
	}
	if (close(fd) < 0) {
		perror("close:");
		exit(1);
	}
}

Assuming the test code is saved as test.c, a simple shell script makes repeated testing convenient:

#!/bin/sh
mkdir io_hang_test
gcc -o test test.c
sudo umount io_hang_test
sudo rmmod ramdisk.ko
make
sudo insmod ramdisk.ko
sudo mkfs -t ext4 /dev/ramdisk0
sudo mount -t ext4 /dev/ramdisk0 io_hang_test
sudo cp test io_hang_test/

Then run sudo ./test from the io_hang_test directory and observe the test process sitting in the D (uninterruptible sleep) state.
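One way to watch the state (the exact column layout depends on your ps version) is:

ps -o pid,stat,wchan:32,cmd -C test

The STAT column should show D while the process is stuck in the driver's wait.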


Meanwhile, watch the kernel's ongoing output with sudo tail -f /var/log/kern.log.


Summary

This article presents just one relatively simple way to simulate IO hang at the block device layer. The advantages: it does not depend on a real disk and does not disturb IO to the system's real disks (the disk size is configurable); only IO under the test mount point hangs; and after umount the test files are cleaned up automatically, leaving nothing behind. On top of this you can extend the driver with your own IO hang policies (this article only shows a simple delay policy), which gives you a lot of control.
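To give one example of an alternative policy: instead of delaying, you could fail IO outright. A sketch, assuming a hypothetical second switch io_fail declared the same way as io_hang, dropped into brd_make_request, which already has an io_error path:

	/* hypothetical io_fail switch: fail every bio instead of delaying it */
	if (io_fail)
		goto io_error;	/* bio_io_error(bio) completes the bio with -EIO */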

The reason for not writing a block device driver from scratch is that there are many kernel versions, and a hand-written driver would need porting whenever you debug on a different kernel. If you use ramdisk directly, every kernel version ships the matching source (3.x and 5.x differ quite a lot), and you only need to add a little code of your own to build and use it. Very convenient.

If you do want to write a simpler ramdisk of your own, the code below can serve as a reference (based on LDD's sbull, modified to support blk-mq):

#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/kernel.h>	/* printk() */
#include <linux/slab.h>		/* kmalloc() */
#include <linux/fs.h>		/* everything... */
#include <linux/errno.h>	/* error codes */
#include <linux/types.h>	/* size_t */
#include <linux/fcntl.h>	/* O_ACCMODE */
#include <linux/hdreg.h>	/* HDIO_GETGEO */
#include <linux/kdev_t.h>
#include <linux/vmalloc.h>
#include <linux/genhd.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>

#ifndef BLK_STS_OK
typedef int blk_status_t;
#define BLK_STS_OK 0
#define OLDER_KERNEL 1
#endif

#ifndef BLK_STS_IOERR
#define BLK_STS_IOERR 10
#endif

#ifndef SECTOR_SHIFT
#define SECTOR_SHIFT 9
#endif

#define size_to_sectors(size) ((size) >> SECTOR_SHIFT)
#define sectors_to_size(size) ((size) << SECTOR_SHIFT)

MODULE_LICENSE("Dual BSD/GPL");

static int sbull_major;
module_param(sbull_major, int, 0);
static int logical_block_size = 512;
module_param(logical_block_size, int, 0);
static char *disk_size = "256M";
module_param(disk_size, charp, 0);
static int ndevices = 1;
module_param(ndevices, int, 0);
static bool debug = false;
module_param(debug, bool, false);

/*
 * The different "request modes" we can use.
 */
enum {
	RM_SIMPLE  = 0,	/* The extra-simple request function */
	RM_FULL    = 1,	/* The full-blown version */
	RM_NOQUEUE = 2,	/* Use make_request */
};

/*
 * Minor number and partition management.
 */
#define SBULL_MINORS	16

/*
 * We can tweak our hardware sector size, but the kernel talks to us
 * in terms of small sectors, always.
 */
#define KERNEL_SECTOR_SIZE	512

/*
 * The internal representation of our device.
 */
struct sbull_dev {
	int size;			/* Device size in sectors */
	u8 *data;			/* The data array */
	spinlock_t lock;		/* For mutual exclusion */
	struct request_queue *queue;	/* The device request queue */
	struct gendisk *gd;		/* The gendisk structure */
	struct blk_mq_tag_set tag_set;
};

static struct sbull_dev *Devices;

/* Handle an I/O request */
static blk_status_t sbull_transfer(struct sbull_dev *dev, unsigned long sector,
		unsigned long nsect, char *buffer, int op)
{
	unsigned long offset = sectors_to_size(sector);
	unsigned long nbytes = sectors_to_size(nsect);

	if ((offset + nbytes) > dev->size) {
		pr_notice("Beyond-end write (%ld %ld)\n", offset, nbytes);
		return BLK_STS_IOERR;
	}

	if (debug)
		pr_info("%s: %s, sector: %ld, nsectors: %ld, offset: %ld,"
			" nbytes: %ld",
			dev->gd->disk_name,
			op == REQ_OP_WRITE ? "WRITE" : "READ", sector, nsect,
			offset, nbytes);

	/* will be only REQ_OP_READ or REQ_OP_WRITE */
	if (op == REQ_OP_WRITE)
		memcpy(dev->data + offset, buffer, nbytes);
	else
		memcpy(buffer, dev->data + offset, nbytes);

	return BLK_STS_OK;
}

static blk_status_t sbull_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct request *req = bd->rq;
	struct sbull_dev *dev = req->rq_disk->private_data;
	int op = req_op(req);
	blk_status_t ret;

	blk_mq_start_request(req);
	spin_lock(&dev->lock);
	if (op != REQ_OP_READ && op != REQ_OP_WRITE) {
		pr_notice("Skip non-fs request\n");
		blk_mq_end_request(req, BLK_STS_IOERR);
		spin_unlock(&dev->lock);
		return BLK_STS_IOERR;
	}
	ret = sbull_transfer(dev, blk_rq_pos(req),
			blk_rq_cur_sectors(req),
			bio_data(req->bio), op);
	blk_mq_end_request(req, ret);
	spin_unlock(&dev->lock);
	return ret;
}

/*
 * The device operations structure.
 */
static const struct block_device_operations sbull_ops = {
	.owner = THIS_MODULE,
};

static const struct blk_mq_ops sbull_mq_ops = {
	.queue_rq = sbull_queue_rq,
};

static struct request_queue *create_req_queue(struct blk_mq_tag_set *set)
{
	struct request_queue *q;

#ifndef OLDER_KERNEL
	q = blk_mq_init_sq_queue(set, &sbull_mq_ops,
			2, BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);
#else
	int ret;

	memset(set, 0, sizeof(*set));
	set->ops = &sbull_mq_ops;
	set->nr_hw_queues = 1;
	/* set->nr_maps = 1; */
	set->queue_depth = 2;
	set->numa_node = NUMA_NO_NODE;
	set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;

	ret = blk_mq_alloc_tag_set(set);
	if (ret)
		return ERR_PTR(ret);

	q = blk_mq_init_queue(set);
	if (IS_ERR(q)) {
		blk_mq_free_tag_set(set);
		return q;
	}
#endif
	return q;
}

/*
 * Set up our internal device.
 */
static void setup_device(struct sbull_dev *dev, int which)
{
	long long sbull_size = memparse(disk_size, NULL);

	memset(dev, 0, sizeof(struct sbull_dev));
	dev->size = sbull_size;
	dev->data = vzalloc(dev->size);
	if (dev->data == NULL) {
		pr_notice("vmalloc failure.\n");
		return;
	}
	spin_lock_init(&dev->lock);

	dev->queue = create_req_queue(&dev->tag_set);
	if (IS_ERR(dev->queue))
		goto out_vfree;

	blk_queue_logical_block_size(dev->queue, logical_block_size);
	dev->queue->queuedata = dev;

	/*
	 * And the gendisk structure.
	 */
	dev->gd = alloc_disk(SBULL_MINORS);
	if (!dev->gd) {
		pr_notice("alloc_disk failure\n");
		goto out_vfree;
	}
	dev->gd->major = sbull_major;
	dev->gd->first_minor = which*SBULL_MINORS;
	dev->gd->fops = &sbull_ops;
	dev->gd->queue = dev->queue;
	dev->gd->private_data = dev;
	snprintf(dev->gd->disk_name, 32, "myramdisk%c", which + 'a');
	set_capacity(dev->gd, size_to_sectors(sbull_size));
	add_disk(dev->gd);
	return;

out_vfree:
	if (dev->data)
		vfree(dev->data);
}

static int __init sbull_init(void)
{
	int i;
	/*
	 * Get registered.
	 */
	sbull_major = register_blkdev(sbull_major, "myramdisk");
	if (sbull_major <= 0) {
		pr_warn("myramdisk: unable to get major number\n");
		return -EBUSY;
	}
	/*
	 * Allocate the device array, and initialize each one.
	 */
	Devices = kmalloc(ndevices * sizeof(struct sbull_dev), GFP_KERNEL);
	if (Devices == NULL)
		goto out_unregister;
	for (i = 0; i < ndevices; i++)
		setup_device(Devices + i, i);

	return 0;

out_unregister:
	unregister_blkdev(sbull_major, "myramdisk");
	return -ENOMEM;
}

static void sbull_exit(void)
{
	int i;

	for (i = 0; i < ndevices; i++) {
		struct sbull_dev *dev = Devices + i;

		if (dev->gd) {
			del_gendisk(dev->gd);
			put_disk(dev->gd);
		}
		if (dev->queue)
			blk_cleanup_queue(dev->queue);
		if (dev->data)
			vfree(dev->data);
	}
	unregister_blkdev(sbull_major, "myramdisk");
	kfree(Devices);
}

module_init(sbull_init);
module_exit(sbull_exit);
