前段时间看了Linux Block Layber的相关代码,主要看了Linux最简单的IO调度器NOOP的实现。

接下来总结下Linux BLOCK层work flow的机制。


继续把SCSI 探测设备这一部分的内容添加进来。

Block driver分析:

从makefile入手

#
# Makefile for the kernel block layer
#

obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \
                        blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
                        blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
                        blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \
                        blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
                        genhd.o scsi_ioctl.o partition-generic.o partitions/

obj-$(CONFIG_BLK_DEV_BSG)       += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB)    += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP)        += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING)        += blk-throttle.o
obj-$(CONFIG_IOSCHED_NOOP)      += noop-iosched.o
obj-$(CONFIG_IOSCHED_DEADLINE)  += deadline-iosched.o
obj-$(CONFIG_IOSCHED_CFQ)       += cfq-iosched.o

obj-$(CONFIG_BLOCK_COMPAT)      += compat_ioctl.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o

genhd.c genhd.h 开始 gendisk handing

定义了一些通用块设备的属性。通过sysfs可以查看如discard_alignment,size等。

static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);

static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);

static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);

static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);

static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);

static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);

static DEVICE_ATTR(discard_alignment, S_IRUGO, disk_discard_alignment_show,

  NULL);

static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);

static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);

static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL);



//disk_get_part - get partition
struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
//disk_map_sector_rcu - map sector to partition
struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
//add_disk - add partitioning information to kernel list
void add_disk(struct gendisk *disk)
struct gendisk *alloc_disk(int minors);
extern struct gendisk *get_gendisk(dev_t dev, int *partno);
extern struct block_device *bdget_disk(struct gendisk *disk, int partno);

#define dev_to_disk(device)	container_of((device), struct gendisk, part0.__dev)
#define dev_to_part(device)	container_of((device), struct hd_struct, __dev)
#define disk_to_dev(disk)	(&(disk)->part0.__dev)
#define part_to_dev(part)	(&((part)->__dev))
实现了从struct device、struct gendisk、struct hd_struct之间结构体互相获取的函数。


//register_blkdev - register a new block device
int register_blkdev(unsigned int major, const char *name)
把主设备号存入到一个全局的数组中去,以及驱动name也放进去。major最大是255,如果大于255,就对255取余。
static struct blk_major_name {
	struct blk_major_name *next;
	int major;
	char name[16];
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
/**
 *	init_sd - entry point for this driver (both when built in or when
 *	a module).
 *
 *	Note: this function registers this driver with the scsi mid-level.
 **/
static int __init init_sd(void)
{
	int majors = 0, i, err;
	for (i = 0; i < SD_MAJORS; i++)
		if (register_blkdev(sd_major(i), "sd") == 0)
			majors++;
	err = class_register(&sd_disk_class);
	向scsi总线注册"sd" device_driver;
	err = scsi_register_driver(&sd_template.gendrv);

	return 0;

}

int scsi_register_driver(struct device_driver *drv)
{
	drv->bus = &scsi_bus_type;

	return driver_register(drv);
}

struct bus_type scsi_bus_type = {
        .name		= "scsi",
        .match		= scsi_bus_match,
	.uevent		= scsi_bus_uevent,
#ifdef CONFIG_PM
	.pm		= &scsi_bus_pm_ops,
#endif
};

static struct scsi_driver sd_template = {
	.owner			= THIS_MODULE,
	.gendrv = {
		.name		= "sd",
		.probe		= sd_probe,
		.remove		= sd_remove,
		.shutdown	= sd_shutdown,
		.pm		= &sd_pm_ops,
	},
	.rescan			= sd_rescan,
	.init_command		= sd_init_command,
	.uninit_command		= sd_uninit_command,
	.done			= sd_done,
	.eh_action		= sd_eh_action,
};
struct scsi_driver {
	struct module		*owner;
	struct device_driver	gendrv;

	void (*rescan)(struct device *);
	int (*init_command)(struct scsi_cmnd *);
	void (*uninit_command)(struct scsi_cmnd *);
	int (*done)(struct scsi_cmnd *);
	int (*eh_action)(struct scsi_cmnd *, int);

	int (*scsi_mq_reserved1)(struct scsi_cmnd *);
	void (*scsi_mq_reserved2)(struct scsi_cmnd *);
	void (*rh_reserved)(void);
};
通过device_driver获取scsi_driver;
#define to_scsi_driver(drv) \
	container_of((drv), struct scsi_driver, gendrv)
	

/**
 * driver_register - register driver with bus
 * @drv: driver to register
 *
 * We pass off most of the work to the bus_add_driver() call,
 * since most of the things we have to do deal with the bus
 * structures.
 */
int driver_register(struct device_driver *drv)
{
	int ret;
	struct device_driver *other;

	BUG_ON(!drv->bus->p);

	if ((drv->bus->probe && drv->probe) ||
	    (drv->bus->remove && drv->remove) ||
	    (drv->bus->shutdown && drv->shutdown))
		printk(KERN_WARNING "Driver '%s' needs updating - please use "
			"bus_type methods\n", drv->name);

	other = driver_find(drv->name, drv->bus);
	if (other) {
		printk(KERN_ERR "Error: Driver '%s' is already registered, "
			"aborting...\n", drv->name);
		return -EBUSY;
	}
        将driver和bus做了bind。
	ret = bus_add_driver(drv);
	if (ret)
		return ret;
	ret = driver_add_groups(drv, drv->groups);
	if (ret) {
		bus_remove_driver(drv);
		return ret;
	}
	kobject_uevent(&drv->p->kobj, KOBJ_ADD);

	return ret;
}

这样device driver “sd”就和bus “scsi”绑定在一起了。
static int __init nvme_init(void)
{
	int result;

	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
	if (IS_ERR(nvme_thread))
		return PTR_ERR(nvme_thread);

	result = -ENOMEM;
	nvme_workq = create_singlethread_workqueue("nvme");
	if (!nvme_workq)
		goto kill_kthread;

	result = register_blkdev(nvme_major, "nvme");

	result = pci_register_driver(&nvme_driver);

	return 0;
}


struct block_device {

struct block_device *bd_contains;

unsignedbd_block_size;

struct hd_struct *bd_part;

/* number of times partitions within this device have been opened. */

unsignedbd_part_count;

intbd_invalidated;

struct gendisk *bd_disk;

struct request_queue *  bd_queue;

struct list_headbd_list;

};


创建sysfs相关的文件信息。
static int __init genhd_device_init(void)
{
	int error;

	block_class.dev_kobj = sysfs_dev_block_kobj;
	error = class_register(&block_class);
	if (unlikely(error))
		return error;
	bdev_map = kobj_map_init(base_probe, &block_class_lock);
	blk_dev_init();//产生一个内核线程kbockd
        .....kblockd_workqueue = alloc_workqueue("kblockd",
	.....				    WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
        .....
	register_blkdev(BLOCK_EXT_MAJOR, "blkext");

	/* create top-level block dir */
	if (!sysfs_deprecated)
		block_depr = kobject_create_and_add("block", NULL);
	return 0;
}

subsys_initcall(genhd_device_init);
通过procfs创建显示文件信息
static int __init proc_genhd_init(void)
{
	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
	proc_create("partitions", 0, NULL, &proc_partitions_operations);
	return 0;
}
module_init(proc_genhd_init);

SCSI设备初始化

scsi_mod-y                      += scsi.o hosts.o scsi_ioctl.o constants.o \
                                   scsicam.o scsi_error.o scsi_lib.o
scsi_mod-$(CONFIG_SCSI_DMA)     += scsi_lib_dma.o
scsi_mod-y                      += scsi_scan.o scsi_sysfs.o scsi_devinfo.o
scsi_mod-$(CONFIG_SCSI_NETLINK) += scsi_netlink.o
scsi_mod-$(CONFIG_SYSCTL)       += scsi_sysctl.o
scsi_mod-$(CONFIG_SCSI_PROC_FS) += scsi_proc.o
scsi_mod-y                      += scsi_trace.o
scsi_mod-$(CONFIG_PM)           += scsi_pm.o
sd_mod-objs     := sd.o

Documentation/scsi/scsi.txt

Notes on using modules in the SCSI subsystem

============================================

The scsi support in the linux kernel can be modularized in a number of

different ways depending upon the needs of the end user.  To understand

your options, we should first define a few terms.


The scsi-core (also known as the "mid level") contains the core of scsi

support.  Without it you can do nothing with any of the other scsi drivers.

The scsi core support can be a module (scsi_mod.o), or it can be built into

the kernel. If the core is a module, it must be the first scsi module

loaded, and if you unload the modules, it will have to be the last one

unloaded.  In practice the modprobe and rmmod commands (and "autoclean")

will enforce the correct ordering of loading and unloading modules in

the SCSI subsystem.


The individual upper and lower level drivers can be loaded in any order

once the scsi core is present in the kernel (either compiled in or loaded

as a module).  The disk driver (sd_mod.o), cdrom driver (sr_mod.o),

tape driver ** (st.o) and scsi generics driver (sg.o) represent the upper

level drivers to support the various assorted devices which can be

controlled.  You can for example load the tape driver to use the tape drive,

and then unload it once you have no further need for the driver (and release

the associated memory).


The lower level drivers are the ones that support the individual cards that

are supported for the hardware platform that you are running under. Those

individual cards are often called Host Bus Adapters (HBAs). For example the

aic7xxx.o driver is used to control all recent SCSI controller cards from

Adaptec. Almost all lower level drivers can be built either as modules or

built into the kernel.



sd.c Linux scsi disk driver

================================================================

     HBA PROBE: assume 2 SCSI devices found in scan

LLD                   mid level                    LLD

===-------------------=========--------------------===------

scsi_host_alloc()  -->

scsi_add_host()  ---->

scsi_scan_host()  -------+



====================================================================

设备检测过程

当scsi总线上有设备加载时,会根据主设备号进行驱动匹配,如果是sd设备,那么就会调用sd_probe.

/**
 *	sd_probe - called during driver initialization and whenever a
 *	new scsi device is attached to the system. It is called once
 *	for each scsi device (not just disks) present.
 *	@dev: pointer to device object
 *	Note: this function is invoked from the scsi mid-level.
 *	This function sets up the mapping between a given 
 *	<host,channel,id,lun> (found in sdp) and new device name 
 *	(e.g. /dev/sda). More precisely it is the block device major 
 *	and minor number that is chosen here.
 **/
static int sd_probe(struct device *dev)
{
	struct scsi_device *sdp = to_scsi_device(dev);
	struct scsi_disk *sdkp;
	struct gendisk *gd;
	int index;
	int error;
        分配gendisk结构
	gd = alloc_disk(SD_MINORS);
	if (!gd)
		goto out_free;
        获取一个目前可用的最小的字符,用于生成disk name
	do {
		if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
			goto out_put;

		spin_lock(&sd_index_lock);
		error = ida_get_new(&sd_index_ida, &index);
		spin_unlock(&sd_index_lock);
	} while (error == -EAGAIN);

	error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
	生成scsi_disk结构体
	sdkp->device = sdp;
	sdkp->driver = &sd_template;
	sdkp->disk = gd;
	sdkp->index = index;
	atomic_set(&sdkp->openers, 0);
	atomic_set(&sdkp->device->ioerr_cnt, 0);
        设置scsi_device里面的request_queue->rq_timeout。
	if (!sdp->request_queue->rq_timeout) {
		if (sdp->type != TYPE_MOD)
			blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT);
		else
			blk_queue_rq_timeout(sdp->request_queue,
					     SD_MOD_TIMEOUT);
	}

	device_initialize(&sdkp->dev);
	sdkp->dev.parent = dev;
	sdkp->dev.class = &sd_disk_class;
	dev_set_name(&sdkp->dev, "%s", dev_name(dev));

	if (device_add(&sdkp->dev))
		goto out_free_index;

	get_device(dev);
	dev_set_drvdata(dev, sdkp);

	get_device(&sdkp->dev);	/* prevent release before async_schedule */
	async_schedule_domain(sd_probe_async, sdkp, &scsi_sd_probe_domain);

	return 0;
}


/*
 * The asynchronous part of sd_probe
 */
static void sd_probe_async(void *data, async_cookie_t cookie)
{
	struct scsi_disk *sdkp = data;
	struct scsi_device *sdp;
	struct gendisk *gd;
	u32 index;
	struct device *dev;

	sdp = sdkp->device;
	gd = sdkp->disk;
	index = sdkp->index;
	dev = &sdp->sdev_gendev;

	gd->major = sd_major((index & 0xf0) >> 4);
	gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
	gd->minors = SD_MINORS;

	//为gendisk 的block_device_operations 赋值
	gd->fops = &sd_fops;
	gendisk的private_data即为scsi_driver.
	gd->private_data = &sdkp->driver;
	gd->queue = sdkp->device->request_queue;

	/* defaults, until the device tells us otherwise */
	sdp->sector_size = 512;
	sdkp->capacity = 0;
	sdkp->media_present = 1;
	sdkp->write_prot = 0;
	sdkp->cache_override = 0;
	sdkp->WCE = 0;
	sdkp->RCD = 0;
	sdkp->ATO = 0;
	sdkp->first_scan = 1;
	sdkp->max_medium_access_timeouts = SD_MAX_MEDIUM_TIMEOUTS;

	sd_revalidate_disk(gd);

	gd->driverfs_dev = &sdp->sdev_gendev;
	gd->flags = GENHD_FL_EXT_DEVT;
	if (sdp->removable) {
		gd->flags |= GENHD_FL_REMOVABLE;
		gd->events |= DISK_EVENT_MEDIA_CHANGE;
	}

	blk_pm_runtime_init(sdp->request_queue, dev);
	调用add_disk,将设备加入到内核list中去。
	add_disk(gd);
	if (sdkp->capacity)
		sd_dif_config_host(sdkp);

	sd_revalidate_disk(gd);

	sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
		  sdp->removable ? "removable " : "");
	scsi_autopm_put_device(sdp);
	put_device(&sdkp->dev);
}

static const struct block_device_operations sd_fops = {
	.owner			= THIS_MODULE,
	.open			= sd_open,
	.release		= sd_release,
	.ioctl			= sd_ioctl,
	.getgeo			= sd_getgeo,
#ifdef CONFIG_COMPAT
	.compat_ioctl		= sd_compat_ioctl,
#endif
	.check_events		= sd_check_events,
	.revalidate_disk	= sd_revalidate_disk,
	.unlock_native_capacity	= sd_unlock_native_capacity,
};


===============================


介绍了设备如何添加到block层之后,下面看block如何处理request的。

=====

每个 块设备有一个request_queue,一个queue可以选择调度器去调度request。Linux实现了多种调度器,其中NOOP是最简单的。下面看一下noop是怎么工作的。


先看elevator的结构,struct elevator_ops;struct elevator_queue;struct elevator_type;以上3个数据结构是与elevator有关系的。

/*
 * identifies an elevator type, such as AS or deadline
 */
struct elevator_type
{
	/* managed by elevator core */
	struct kmem_cache *icq_cache;

	/* fields provided by elevator implementation */
	struct elevator_ops ops;
	size_t icq_size;	/* see iocontext.h */
	size_t icq_align;	/* ditto */
	struct elv_fs_entry *elevator_attrs;
	char elevator_name[ELV_NAME_MAX];
	struct module *elevator_owner;

	/* managed by elevator core */
	char icq_cache_name[ELV_NAME_MAX + 5];	/* elvname + "_io_cq" */
	struct list_head list;
};

/*
 * each queue has an elevator_queue associated with it
 */
struct elevator_queue
{
	struct elevator_type *type;
	void *elevator_data;
	struct kobject kobj;
	struct mutex sysfs_lock;
	unsigned int registered:1;
	DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};

elevator_ops是重要的结构,其定义了相关的操作,

struct elevator_ops
{
	elevator_merge_fn *elevator_merge_fn;
	elevator_merged_fn *elevator_merged_fn;
	elevator_merge_req_fn *elevator_merge_req_fn;
	elevator_allow_merge_fn *elevator_allow_merge_fn;
	elevator_bio_merged_fn *elevator_bio_merged_fn;

	elevator_dispatch_fn *elevator_dispatch_fn;
	elevator_add_req_fn *elevator_add_req_fn;
	elevator_activate_req_fn *elevator_activate_req_fn;
	elevator_deactivate_req_fn *elevator_deactivate_req_fn;

	elevator_completed_req_fn *elevator_completed_req_fn;

	elevator_request_list_fn *elevator_former_req_fn;
	elevator_request_list_fn *elevator_latter_req_fn;

	elevator_init_icq_fn *elevator_init_icq_fn;	/* see iocontext.h */
	elevator_exit_icq_fn *elevator_exit_icq_fn;	/* ditto */

	elevator_set_req_fn *elevator_set_req_fn;
	elevator_put_req_fn *elevator_put_req_fn;

	elevator_may_queue_fn *elevator_may_queue_fn;

	elevator_init_fn *elevator_init_fn;
	elevator_exit_fn *elevator_exit_fn;
};

elevator_merge_fn查询一个request,用于将bio并入

elevator_merge_req_fn将两个合并后的请求中多余的那个给删除

*elevator_dispatch_fn将调度器的队列最前面的元素取出,分派给request_queue中的请求队列以等候响应*

*elevator_add_req_fn将一个新的request添加进调度器的队列

elevator_queue_empty_fn检查调度器的队列是否为空

elevator_set_req_fn和elevator_put_req_fn分别在创建新请求和将请求所占的空间释放到内存时调用

*elevator_init_fn用于初始化调度器实例


typedef void (request_fn_proc) (struct request_queue *q);

使用typedef来简化函数定义。

request_fn_proc *request_fn等价为void (*request_fn) (struct request_queue *q)

int elevator_init(struct request_queue *q, char *name)
{
	struct elevator_type *e = NULL;
	int err;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	lockdep_assert_held(&q->sysfs_lock);

	if (unlikely(q->elevator))
		return 0;

        初始化调度器时,把清空队列。
	INIT_LIST_HEAD(&q->queue_head);
	q->last_merge = NULL;
	q->end_sector = 0;
	q->boundary_rq = NULL;

	if (name) {
		e = elevator_get(name, true);
		if (!e)
			return -EINVAL;
	}

	/*
	 * Use the default elevator specified by config boot param or
	 * config option.  Don't try to load modules as we could be running
	 * off async and request_module() isn't allowed from async.
	 */
	if (!e && *chosen_elevator) {
		e = elevator_get(chosen_elevator, false);
		if (!e)
			printk(KERN_ERR "I/O scheduler %s not found\n",
							chosen_elevator);
	}

	选用默认的调度器,如果默认的不存在,使用noop;
	if (!e) {
		e = elevator_get(CONFIG_DEFAULT_IOSCHED, false);
		if (!e) {
			printk(KERN_ERR
				"Default I/O scheduler not found. " \
				"Using noop.\n");
			e = elevator_get("noop", false);
		}
	}
        调用调度器的初始化函数。
	err = e->ops.elevator_init_fn(q, e);
	return 0;
}
static struct elevator_type elevator_noop = {
	.ops = {
		.elevator_merge_req_fn		= noop_merged_requests,
		.elevator_dispatch_fn		= noop_dispatch,
		.elevator_add_req_fn		= noop_add_request,
		.elevator_former_req_fn		= noop_former_request,
		.elevator_latter_req_fn		= noop_latter_request,
		.elevator_init_fn		= noop_init_queue,
		.elevator_exit_fn		= noop_exit_queue,
	},
	.elevator_name = "noop",
	.elevator_owner = THIS_MODULE,
};
static int noop_init_queue(struct request_queue *q, struct elevator_type *e)
{
	struct noop_data *nd;
	struct elevator_queue *eq;

	在这里将elevator_queue的type进行赋值。
	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node);
	if (!nd) {
		kobject_put(&eq->kobj);
		return -ENOMEM;
	}
	eq->elevator_data = nd;

	INIT_LIST_HEAD(&nd->queue);

	spin_lock_irq(q->queue_lock);
	将elevator_queue与request_queue绑定在一起。
	q->elevator = eq;
	spin_unlock_irq(q->queue_lock);
	return 0;
}
struct noop_data {
	struct list_head queue;
};

从这里可以看到noop调度器里也有一个queue。

struct request_queue->struct elevator_queue->void *elevator_data;;

noop添加request到其内部的队列时,加载自己队列的末尾。
static void noop_add_request(struct request_queue *q, struct request *rq)
{
	struct noop_data *nd = q->elevator->elevator_data;

	list_add_tail(&rq->queuelist, &nd->queue);
}
发送一个request;从自己内部的queue去取一个request,然后调用elv_dispatch_sort;

其实noop是没有自己的queue。

static int noop_dispatch(struct request_queue *q, int force)
{
	struct noop_data *nd = q->elevator->elevator_data;

	if (!list_empty(&nd->queue)) {
		struct request *rq;
		rq = list_entry(nd->queue.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		
		elv_dispatch_sort(q, rq);
		return 1;
	}
	return 0;
}
从struct request_queue从取一个request出来,并把这个request从链表中移除,之后调用elv_dispatch_sort(将rq插入request_queue  );

一个请求在创建到销毁的过程遵循下面三种流程

 set_req_fn ->
 i.   add_req_fn -> (merged_fn ->)* -> dispatch_fn -> activate_req_fn -> (deactivate_req_fn -> activate_req_fn ->)* -> completed_req_fn

ii.  add_req_fn -> (merged_fn ->)* -> merge_req_fn
 iii. [none]
 -> put_req_fn

/*
 * Function:    scsi_request_fn()
 *
 * Purpose:     Main strategy routine for SCSI.
 *
 * Arguments:   q       - Pointer to actual queue.
 *
 * Returns:     Nothing
 *
 * Lock status: IO request lock assumed to be held when called.
 */
static void scsi_request_fn(struct request_queue *q)
{
	struct scsi_device *sdev = q->queuedata;
	struct Scsi_Host *shost;
	struct scsi_cmnd *cmd;
	struct request *req;

	if(!get_device(&sdev->sdev_gendev))
		/* We must be tearing the block queue down already */
		return;

	/*
	 * To start with, we keep looping until the queue is empty, or until
	 * the host is no longer able to accept any more requests.
	 */
	shost = sdev->host;
	for (;;) {
		int rtn;
		/*
		 * get next queueable request.  We do this early to make sure
		 * that the request is fully prepared even if we cannot 
		 * accept it.
		 */
		req = blk_peek_request(q);
		if (!req || !scsi_dev_queue_ready(q, sdev))
			break;

		if (unlikely(!scsi_device_online(sdev))) {
			sdev_printk(KERN_ERR, sdev,
				    "rejecting I/O to offline device\n");
			scsi_kill_request(req, q);
			continue;
		}


		/*
		 * Remove the request from the request list.
		 */
		if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
			blk_start_request(req);
		sdev->device_busy++;

		spin_unlock(q->queue_lock);
		cmd = req->special;
		if (unlikely(cmd == NULL)) {
			printk(KERN_CRIT "impossible request in %s.\n"
					 "please mail a stack trace to "
					 "linux-scsi@vger.kernel.org\n",
					 __func__);
			blk_dump_rq_flags(req, "foo");
			BUG();
		}
		spin_lock(shost->host_lock);

		/*
		 * We hit this when the driver is using a host wide
		 * tag map. For device level tag maps the queue_depth check
		 * in the device ready fn would prevent us from trying
		 * to allocate a tag. Since the map is a shared host resource
		 * we add the dev to the starved list so it eventually gets
		 * a run when a tag is freed.
		 */
		if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
			if (list_empty(&sdev->starved_entry))
				list_add_tail(&sdev->starved_entry,
					      &shost->starved_list);
			goto not_ready;
		}

		if (!scsi_target_queue_ready(shost, sdev))
			goto not_ready;

		if (!scsi_host_queue_ready(q, shost, sdev))
			goto not_ready;

		scsi_target(sdev)->target_busy++;
		shost->host_busy++;

		/*
		 * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
		 *		take the lock again.
		 */
		spin_unlock_irq(shost->host_lock);

		/*
		 * Finally, initialize any error handling parameters, and set up
		 * the timers for timeouts.
		 */
		scsi_init_cmd_errh(cmd);

		/*
		 * Dispatch the command to the low-level driver.
		 */
		rtn = scsi_dispatch_cmd(cmd);
		spin_lock_irq(q->queue_lock);
		if (rtn)
			goto out_delay;
	}

	goto out;

 not_ready:
	spin_unlock_irq(shost->host_lock);

	/*
	 * lock q, handle tag, requeue req, and decrement device_busy. We
	 * must return with queue_lock held.
	 *
	 * Decrementing device_busy without checking it is OK, as all such
	 * cases (host limits or settings) should run the queue at some
	 * later time.
	 */
	spin_lock_irq(q->queue_lock);
	blk_requeue_request(q, req);
	sdev->device_busy--;
out_delay:
	if (sdev->device_busy == 0)
		blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:
	/* must be careful here...if we trigger the ->remove() function
	 * we cannot be holding the q lock */
	spin_unlock_irq(q->queue_lock);
	put_device(&sdev->sdev_gendev);
	spin_lock_irq(q->queue_lock);
}
void blk_queue_bio(struct request_queue *q, struct bio *bio)
{
	const bool sync = !!(bio->bi_rw & REQ_SYNC);
	struct blk_plug *plug;
	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
	struct request *req;
	unsigned int request_count = 0;

	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
		spin_lock_irq(q->queue_lock);
		where = ELEVATOR_INSERT_FLUSH;
		goto get_rq;
	}

	/*
	 * Check if we can merge with the plugged list before grabbing
	 * any locks.
	 */
	if (blk_attempt_plug_merge(q, bio, &request_count))
		return;

	spin_lock_irq(q->queue_lock);

	el_ret = elv_merge(q, &req, bio);
	if (el_ret == ELEVATOR_BACK_MERGE) {
		if (bio_attempt_back_merge(q, req, bio)) {
			elv_bio_merged(q, req, bio);
			if (!attempt_back_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out_unlock;
		}
	} else if (el_ret == ELEVATOR_FRONT_MERGE) {
		if (bio_attempt_front_merge(q, req, bio)) {
			elv_bio_merged(q, req, bio);
			if (!attempt_front_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out_unlock;
		}
	}

get_rq:
	/*
	 * This sync check and mask will be re-done in init_request_from_bio(),
	 * but we need to set it earlier to expose the sync flag to the
	 * rq allocator and io schedulers.
	 */
	rw_flags = bio_data_dir(bio);
	if (sync)
		rw_flags |= REQ_SYNC;

	/*
	 * Grab a free request. This is might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	req = get_request(q, rw_flags, bio, GFP_NOIO);
	if (unlikely(!req)) {
		bio_endio(bio, -ENODEV);	/* @q is dead */
		goto out_unlock;
	}

	/*
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	init_request_from_bio(req, bio);

	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags))
		req->cpu = raw_smp_processor_id();

	plug = current->plug;
	if (plug) {
		/*
		 * If this is the first request added after a plug, fire
		 * of a plug trace. If others have been added before, check
		 * if we have multiple devices in this plug. If so, make a
		 * note to sort the list before dispatch.
		 */
		if (list_empty(&plug->list))
			trace_block_plug(q);
		else {
			if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
		}
		list_add_tail(&req->queuelist, &plug->list);
		blk_account_io_start(req, true);
	} else {
		spin_lock_irq(q->queue_lock);
		add_acct_request(q, req, where);
		__blk_run_queue(q);
out_unlock:
		spin_unlock_irq(q->queue_lock);
	}
}