1. 内核层文件读写的函数调用关系
sys _read
vfs_read
do_sync_read
f_op->aio_read
generic_file_aio_read
do_generic_file_read
mpage_readpage
do_mpage_readpage
mpage_bio_submit
submit_bio
generic_make_request
__generic_make_request
q->make_request_fn
__make_request
q->request_fn
end_request
mpage_end_io_read
光读一个文件就需要这么长的函数调用关系,而且后面还有设备驱动,如果是U盘,还涉及scsi和usb框架,这复杂度真是有点大。这也充分说明了内核中分层的思想。今天就从submit_bio说起。
void submit_bio(int rw, struct bio *bio)
{
int count = bio_sectors(bio);
bio->bi_rw |= rw;
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
*/
if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
if (rw & WRITE) {
count_vm_events(PGPGOUT, count);
} else {
task_io_account_read(bio->bi_size);
count_vm_events(PGPGIN, count);
}
if (unlikely(block_dump)) {
char b[BDEVNAME_SIZE];
printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
current->comm, task_pid_nr(current),
(rw & WRITE) ? "WRITE" : "READ",
(unsigned long long)bio->bi_sector,
bdevname(bio->bi_bdev, b),
count);
}
}
generic_make_request(bio);
}
void generic_make_request(struct bio *bio)
{
struct bio_list bio_list_on_stack;
if (current->bio_list) {
/* make_request is active */ //如果当前进程已经处于make_request状态,添加到待处理链表
bio_list_add(current->bio_list, bio);
return;
}
/* following loop may be a bit non-obvious, and so deserves some
* explanation.
* Before entering the loop, bio->bi_next is NULL (as all callers
* ensure that) so we have a list with a single bio.
* We pretend that we have just taken it off a longer list, so
* we assign bio_list to a pointer to the bio_list_on_stack,
* thus initialising the bio_list of new bios to be
* added. __generic_make_request may indeed add some more bios
* through a recursive call to generic_make_request. If it
* did, we find a non-NULL value in bio_list and re-enter the loop
* from the top. In this case we really did just take the bio
* of the top of the list (no pretending) and so remove it from
* bio_list, and call into __generic_make_request again.
*
* The loop was structured like this to make only one call to
* __generic_make_request (which is important as it is large and
* inlined) and to keep the structure simple.
*/
BUG_ON(bio->bi_next); //必须保证bio->bi_next是NULL
bio_list_init(&bio_list_on_stack);
current->bio_list = &bio_list_on_stack;
do {
__generic_make_request(bio);
bio = bio_list_pop(current->bio_list);
} while (bio);
current->bio_list = NULL; /* deactivate */
}
static inline void __generic_make_request(struct bio *bio)
{
struct request_queue *q;
sector_t old_sector;
int ret, nr_sectors = bio_sectors(bio);
dev_t old_dev;
int err = -EIO;
might_sleep();
if (bio_check_eod(bio, nr_sectors))
goto end_io;
/*
* Resolve the mapping until finished. (drivers are
* still free to implement/resolve their own stacking
* by explicitly returning 0)
*
* NOTE: we don't repeat the blk_size check for each new device.
* Stacking drivers are expected to know what they are doing.
*/
old_sector = -1;
old_dev = 0;
do {
char b[BDEVNAME_SIZE];
struct hd_struct *part;
q = bdev_get_queue(bio->bi_bdev);
if (unlikely(!q)) {
printk(KERN_ERR
"generic_make_request: Trying to access "
"nonexistent block-device %s (%Lu)\n",
bdevname(bio->bi_bdev, b),
(long long) bio->bi_sector);
goto end_io;
}
if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
nr_sectors > queue_max_hw_sectors(q))) {
printk(KERN_ERR "bio too big device %s (%u > %u)\n",
bdevname(bio->bi_bdev, b),
bio_sectors(bio),
queue_max_hw_sectors(q));
goto end_io;
}
if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
goto end_io;
part = bio->bi_bdev->bd_part;
if (should_fail_request(part, bio->bi_size) ||
should_fail_request(&part_to_disk(part)->part0,
bio->bi_size))
goto end_io;
/*
* If this device has partitions, remap block n
* of partition p to block n+start(p) of the disk.
*/
blk_partition_remap(bio); //如果是分区映射到磁盘
if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
goto end_io;
if (old_sector != -1)
trace_block_bio_remap(q, bio, old_dev, old_sector);
old_sector = bio->bi_sector;
old_dev = bio->bi_bdev->bd_dev;
if (bio_check_eod(bio, nr_sectors))
goto end_io;
/*
* Filter flush bio's early so that make_request based
* drivers without flush support don't have to worry
* about them.
*/
if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
if (!nr_sectors) {
err = 0;
goto end_io;
}
}
if ((bio->bi_rw & REQ_DISCARD) &&
(!blk_queue_discard(q) ||
((bio->bi_rw & REQ_SECURE) &&
!blk_queue_secdiscard(q)))) {
err = -EOPNOTSUPP;
goto end_io;
}
if (blk_throtl_bio(q, &bio))
goto end_io;
/*
* If bio = NULL, bio has been throttled and will be submitted
* later.
*/
if (!bio)
break;
trace_block_bio_queue(q, bio);
ret = q->make_request_fn(q, bio); //调用请求队列的make_request_fn回调函数
} while (ret);
return;
end_io:
bio_endio(bio, err);
}
对于scsi设备的request_queue的钩子函数在哪初始化的呢:
scsi_alloc_sdev->scsi_alloc_queue->blk_init_queue
把make_request_fn回调函数初始化为__make_request,request_fn初始化为scsi_request_fn。
对于mtdblock设备的request_queue的钩子函数在哪初始化的呢:
add_mtd_blktrans_dev->blk_init_queue
把make_request_fn回调函数初始化为__make_request,request_fn初始化为mtd_blktrans_request。
对于mmcblock设备的requet_queue的钩子函数在哪初始化的呢:
mmc_blk_alloc_req->mmc_init_queue->blk_init_queue
把make_request_fn回调函数初始化为__make_request,request_fn初始化为mmc_request。
接下来分析__make_request如何把bio排序、合并后加入到request结构中。
static int __make_request(struct request_queue *q, struct bio *bio)
{
const bool sync = !!(bio->bi_rw & REQ_SYNC);
struct blk_plug *plug;
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
/*
* low level driver can indicate that it wants pages above a
* certain limit bounced to low memory (ie for highmem, or even
* ISA dma in theory)
*/
blk_queue_bounce(q, &bio); //创建反弹缓冲区
if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
spin_lock_irq(q->queue_lock);
where = ELEVATOR_INSERT_FLUSH;
goto get_rq;
}
/*
* Check if we can merge with the plugged list before grabbing
* any locks.
*/
if (attempt_plug_merge(current, q, bio, &request_count))
goto out;
spin_lock_irq(q->queue_lock);
el_ret = elv_merge(q, &req, bio); //交给IO调度器排序合并
if (el_ret == ELEVATOR_BACK_MERGE) { //可以合并到某个request后面
if (bio_attempt_back_merge(q, req, bio)) {
if (!attempt_back_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out_unlock;
}
} else if (el_ret == ELEVATOR_FRONT_MERGE) { //可以合并到某个request前面
if (bio_attempt_front_merge(q, req, bio)) {
if (!attempt_front_merge(q, req))
elv_merged_request(q, req, el_ret);
goto out_unlock;
}
}
//在请求队列中没有找到可以合并的request
get_rq:
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
* rq allocator and io schedulers.
*/
rw_flags = bio_data_dir(bio);
if (sync)
rw_flags |= REQ_SYNC;
/*
* Grab a free request. This is might sleep but can not fail.
* Returns with the queue unlocked.
*/
req = get_request_wait(q, rw_flags, bio); //分配一个 request
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
* We don't worry about that case for efficiency. It won't happen
* often, and the elevators are able to handle it.
*/
init_request_from_bio(req, bio); //根据bio初始化request
if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
bio_flagged(bio, BIO_CPU_AFFINE))
req->cpu = raw_smp_processor_id();
plug = current->plug;
if (plug) {
/*
* If this is the first request added after a plug, fire
* of a plug trace. If others have been added before, check
* if we have multiple devices in this plug. If so, make a
* note to sort the list before dispatch.
*/
if (list_empty(&plug->list))
trace_block_plug(q);
else if (!plug->should_sort) {
struct request *__rq;
__rq = list_entry_rq(plug->list.prev);
if (__rq->q != q)
plug->should_sort = 1;
}
if (request_count >= BLK_MAX_REQUEST_COUNT)
blk_flush_plug_list(plug, false); //请求数量达到上限,进行unplug操作
list_add_tail(&req->queuelist, &plug->list);
drive_stat_acct(req, 1);
} else {
spin_lock_irq(q->queue_lock);
add_acct_request(q, req, where); //添加到io调度器中
__blk_run_queue(q); //执行unplug操作
out_unlock:
spin_unlock_irq(q->queue_lock);
}
out:
return 0;
}
void __blk_run_queue(struct request_queue *q)
{
if (unlikely(blk_queue_stopped(q)))
return;
q->request_fn(q);
}
接下来以scsi设备为例来分析q->request_fn函数
static void scsi_request_fn(struct request_queue *q)
{
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost;
struct scsi_cmnd *cmd;
struct request *req;
if (!sdev) {
printk("scsi: killing requests for dead queue\n");
while ((req = blk_peek_request(q)) != NULL)
scsi_kill_request(req, q);
return;
}
if(!get_device(&sdev->sdev_gendev))
/* We must be tearing the block queue down already */
return;
/*
* To start with, we keep looping until the queue is empty, or until
* the host is no longer able to accept any more requests.
*/
shost = sdev->host;
for (;;) {
int rtn;
/*
* get next queueable request. We do this early to make sure
* that the request is fully prepared even if we cannot
* accept it.
*/
req = blk_peek_request(q); //获得下一个请求
if (!req || !scsi_dev_queue_ready(q, sdev))
break;
if (unlikely(!scsi_device_online(sdev))) { //设备离线
sdev_printk(KERN_ERR, sdev,
"rejecting I/O to offline device\n");
scsi_kill_request(req, q);
continue;
}
/*
* Remove the request from the request list.
*/
if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
blk_start_request(req);
sdev->device_busy++;
spin_unlock(q->queue_lock);
cmd = req->special;
if (unlikely(cmd == NULL)) {
printk(KERN_CRIT "impossible request in %s.\n"
"please mail a stack trace to "
"linux-scsi@vger.kernel.org\n",
__func__);
blk_dump_rq_flags(req, "foo");
BUG();
}
spin_lock(shost->host_lock);
/*
* We hit this when the driver is using a host wide
* tag map. For device level tag maps the queue_depth check
* in the device ready fn would prevent us from trying
* to allocate a tag. Since the map is a shared host resource
* we add the dev to the starved list so it eventually gets
* a run when a tag is freed.
*/
if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
if (list_empty(&sdev->starved_entry))
list_add_tail(&sdev->starved_entry,
&shost->starved_list);
goto not_ready;
}
if (!scsi_target_queue_ready(shost, sdev)) //是否可以发送命令到目标节点
goto not_ready;
if (!scsi_host_queue_ready(q, shost, sdev)) //是否可以发送到主机适配器
goto not_ready;
scsi_target(sdev)->target_busy++;
shost->host_busy++;
/*
* XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
* take the lock again.
*/
spin_unlock_irq(shost->host_lock);
/*
* Finally, initialize any error handling parameters, and set up
* the timers for timeouts.
*/
scsi_init_cmd_errh(cmd);
/*
* Dispatch the command to the low-level driver.
*/
rtn = scsi_dispatch_cmd(cmd); //派发命令到底层驱动
spin_lock_irq(q->queue_lock);
if (rtn)
goto out_delay;
}
goto out;
not_ready:
spin_unlock_irq(shost->host_lock);
/*
* lock q, handle tag, requeue req, and decrement device_busy. We
* must return with queue_lock held.
*
* Decrementing device_busy without checking it is OK, as all such
* cases (host limits or settings) should run the queue at some
* later time.
*/
spin_lock_irq(q->queue_lock);
blk_requeue_request(q, req);
sdev->device_busy--;
out_delay:
if (sdev->device_busy == 0)
blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:
/* must be careful here...if we trigger the ->remove() function
* we cannot be holding the q lock */
spin_unlock_irq(q->queue_lock);
put_device(&sdev->sdev_gendev);
spin_lock_irq(q->queue_lock);
}
blk_peek_request主要调用了__elv_next_request和q->prep_rq_fn(q, rq),而prep_rq_fn被赋值为sd_prep_fn,主要作用是为请求构造scsi命令。
int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
struct Scsi_Host *host = cmd->device->host;
unsigned long timeout;
int rtn = 0;
atomic_inc(&cmd->device->iorequest_cnt);
/* check if the device is still usable */
if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
/* in SDEV_DEL we error all commands. DID_NO_CONNECT
* returns an immediate error upwards, and signals
* that the device is no longer present */
cmd->result = DID_NO_CONNECT << 16;
scsi_done(cmd);
/* return 0 (because the command has been processed) */
goto out;
}
/* Check to see if the scsi lld made this device blocked. */
if (unlikely(scsi_device_blocked(cmd->device))) {
/*
* in blocked state, the command is just put back on
* the device queue. The suspend state has already
* blocked the queue so future requests should not
* occur until the device transitions out of the
* suspend state.
*/
scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);
SCSI_LOG_MLQUEUE(3, printk("queuecommand : device blocked \n"));
/*
* NOTE: rtn is still zero here because we don't need the
* queue to be plugged on return (it's already stopped)
*/
goto out;
}
/*
* If SCSI-2 or lower, store the LUN value in cmnd.
*/
if (cmd->device->scsi_level <= SCSI_2 &&
cmd->device->scsi_level != SCSI_UNKNOWN) {
cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
(cmd->device->lun << 5 & 0xe0);
}
/*
* We will wait MIN_RESET_DELAY clock ticks after the last reset so
* we can avoid the drive not being ready.
*/
timeout = host->last_reset + MIN_RESET_DELAY;
if (host->resetting && time_before(jiffies, timeout)) {
int ticks_remaining = timeout - jiffies;
/*
* NOTE: This may be executed from within an interrupt
* handler! This is bad, but for now, it'll do. The irq
* level of the interrupt handler has been masked out by the
* platform dependent interrupt handling code already, so the
* sti() here will not cause another call to the SCSI host's
* interrupt handler (assuming there is one irq-level per
* host).
*/
while (--ticks_remaining >= 0)
mdelay(1 + 999 / HZ);
host->resetting = 0;
}
scsi_log_send(cmd);
/*
* Before we queue this command, check if the command
* length exceeds what the host adapter can handle.
*/
if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
SCSI_LOG_MLQUEUE(3,
printk("queuecommand : command too long. "
"cdb_size=%d host->max_cmd_len=%d\n",
cmd->cmd_len, cmd->device->host->max_cmd_len));
cmd->result = (DID_ABORT << 16);
scsi_done(cmd);
goto out;
}
if (unlikely(host->shost_state == SHOST_DEL)) {
cmd->result = (DID_NO_CONNECT << 16);
scsi_done(cmd);
} else {
trace_scsi_dispatch_cmd_start(cmd);
cmd->scsi_done = scsi_done;
rtn = host->hostt->queuecommand(host, cmd); //调用主机适配器的queuecommand回调函数
}
if (rtn) {
trace_scsi_dispatch_cmd_error(cmd, rtn);
if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
rtn != SCSI_MLQUEUE_TARGET_BUSY)
rtn = SCSI_MLQUEUE_HOST_BUSY;
scsi_queue_insert(cmd, rtn);
SCSI_LOG_MLQUEUE(3,
printk("queuecommand : request rejected\n"));
}
out:
SCSI_LOG_MLQUEUE(3, printk("leaving scsi_dispatch_cmnd()\n"));
return rtn;
}