block IO层框架分析2

1. 内核层文件读写的函数调用关系

sys _read

vfs_read

do_sync_read

f_op->aio_read

generic_file_aio_read

do_generic_file_read

mpage_readpage

do_mpage_readpage

mpage_bio_submit

submit_bio

generic_make_request

__generic_make_request

q->make_request_fn

__make_request

q->request_fn

end_request

mpage_end_io_read

光读一个文件就需要这么长的函数调用关系,而且后面还有设备驱动,如果是U盘,还涉及scsi和usb框架,这复杂度真是有点大。这也充分说明了内核中分层的思想。今天就从submit_bio说起。

void submit_bio(int rw, struct bio *bio)
{
	int count = bio_sectors(bio);

	bio->bi_rw |= rw;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
		if (rw & WRITE) {
			count_vm_events(PGPGOUT, count);
		} else {
			task_io_account_read(bio->bi_size);
			count_vm_events(PGPGIN, count);
		}

		if (unlikely(block_dump)) {
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
			current->comm, task_pid_nr(current),
				(rw & WRITE) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_sector,
				bdevname(bio->bi_bdev, b),
				count);
		}
	}

	generic_make_request(bio);
}
void generic_make_request(struct bio *bio)
{
	struct bio_list bio_list_on_stack;

	if (current->bio_list) {
		/* make_request is active */	//如果当前进程已经处于make_request状态,添加到待处理链表
		bio_list_add(current->bio_list, bio);
		return;
	}
	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to a pointer to the bio_list_on_stack,
	 * thus initialising the bio_list of new bios to be
	 * added.  __generic_make_request may indeed add some more bios
	 * through a recursive call to generic_make_request.  If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top.  In this case we really did just take the bio
	 * of the top of the list (no pretending) and so remove it from
	 * bio_list, and call into __generic_make_request again.
	 *
	 * The loop was structured like this to make only one call to
	 * __generic_make_request (which is important as it is large and
	 * inlined) and to keep the structure simple.
	 */
	BUG_ON(bio->bi_next);	//必须保证bio->bi_next是NULL
	bio_list_init(&bio_list_on_stack);
	current->bio_list = &bio_list_on_stack;
	do {
		__generic_make_request(bio);
		bio = bio_list_pop(current->bio_list);
	} while (bio);
	current->bio_list = NULL; /* deactivate */
}
static inline void __generic_make_request(struct bio *bio)
{
	struct request_queue *q;
	sector_t old_sector;
	int ret, nr_sectors = bio_sectors(bio);
	dev_t old_dev;
	int err = -EIO;

	might_sleep();

	if (bio_check_eod(bio, nr_sectors))
		goto end_io;

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
	 *
	 * NOTE: we don't repeat the blk_size check for each new device.
	 * Stacking drivers are expected to know what they are doing.
	 */
	old_sector = -1;
	old_dev = 0;
	do {
		char b[BDEVNAME_SIZE];
		struct hd_struct *part;

		q = bdev_get_queue(bio->bi_bdev);
		if (unlikely(!q)) {
			printk(KERN_ERR
			       "generic_make_request: Trying to access "
				"nonexistent block-device %s (%Lu)\n",
				bdevname(bio->bi_bdev, b),
				(long long) bio->bi_sector);
			goto end_io;
		}

		if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
			     nr_sectors > queue_max_hw_sectors(q))) {
			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
			       bdevname(bio->bi_bdev, b),
			       bio_sectors(bio),
			       queue_max_hw_sectors(q));
			goto end_io;
		}

		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
			goto end_io;

		part = bio->bi_bdev->bd_part;
		if (should_fail_request(part, bio->bi_size) ||
		    should_fail_request(&part_to_disk(part)->part0,
					bio->bi_size))
			goto end_io;

		/*
		 * If this device has partitions, remap block n
		 * of partition p to block n+start(p) of the disk.
		 */
		blk_partition_remap(bio);	//如果是分区映射到磁盘

		if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
			goto end_io;

		if (old_sector != -1)
			trace_block_bio_remap(q, bio, old_dev, old_sector);

		old_sector = bio->bi_sector;
		old_dev = bio->bi_bdev->bd_dev;

		if (bio_check_eod(bio, nr_sectors))
			goto end_io;

		/*
		 * Filter flush bio's early so that make_request based
		 * drivers without flush support don't have to worry
		 * about them.
		 */
		if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
			bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
			if (!nr_sectors) {
				err = 0;
				goto end_io;
			}
		}

		if ((bio->bi_rw & REQ_DISCARD) &&
		    (!blk_queue_discard(q) ||
		     ((bio->bi_rw & REQ_SECURE) &&
		      !blk_queue_secdiscard(q)))) {
			err = -EOPNOTSUPP;
			goto end_io;
		}

		if (blk_throtl_bio(q, &bio))
			goto end_io;

		/*
		 * If bio = NULL, bio has been throttled and will be submitted
		 * later.
		 */
		if (!bio)
			break;

		trace_block_bio_queue(q, bio);

		ret = q->make_request_fn(q, bio);	//调用请求队列的make_request_fn回调函数
	} while (ret);

	return;

end_io:
	bio_endio(bio, err);
}

对于scsi设备的request_queue的钩子函数在哪初始化的呢:

scsi_alloc_sdev->scsi_alloc_queue->blk_init_queue

把make_request_fn回调函数初始化为__make_request,request_fn初始化为scsi_request_fn。


对于mtdblock设备的request_queue的钩子函数在哪初始化的呢:

add_mtd_blktrans_dev->blk_init_queue

把make_request_fn回调函数初始化为__make_request,request_fn初始化为mtd_blktrans_request。


对于mmcblock设备的requet_queue的钩子函数在哪初始化的呢:

mmc_blk_alloc_req->mmc_init_queue->blk_init_queue

把make_request_fn回调函数初始化为__make_request,request_fn初始化为mmc_request。


接下来分析__make_request如何把bio排序、合并后加入到request结构中。

static int __make_request(struct request_queue *q, struct bio *bio)
{
	const bool sync = !!(bio->bi_rw & REQ_SYNC);
	struct blk_plug *plug;
	int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
	struct request *req;
	unsigned int request_count = 0;

	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);	//创建反弹缓冲区

	if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
		spin_lock_irq(q->queue_lock);
		where = ELEVATOR_INSERT_FLUSH;
		goto get_rq;
	}

	/*
	 * Check if we can merge with the plugged list before grabbing
	 * any locks.
	 */
	if (attempt_plug_merge(current, q, bio, &request_count))
		goto out;

	spin_lock_irq(q->queue_lock);

	el_ret = elv_merge(q, &req, bio);	//交给IO调度器排序合并
	if (el_ret == ELEVATOR_BACK_MERGE) {	//可以合并到某个request后面
		if (bio_attempt_back_merge(q, req, bio)) {
			if (!attempt_back_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out_unlock;
		}
	} else if (el_ret == ELEVATOR_FRONT_MERGE) {	//可以合并到某个request前面
		if (bio_attempt_front_merge(q, req, bio)) {
			if (!attempt_front_merge(q, req))
				elv_merged_request(q, req, el_ret);
			goto out_unlock;
		}
	}
	//在请求队列中没有找到可以合并的request
get_rq:
	/*
	 * This sync check and mask will be re-done in init_request_from_bio(),
	 * but we need to set it earlier to expose the sync flag to the
	 * rq allocator and io schedulers.
	 */
	rw_flags = bio_data_dir(bio);
	if (sync)
		rw_flags |= REQ_SYNC;

	/*
	 * Grab a free request. This is might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	req = get_request_wait(q, rw_flags, bio);	//分配一个 request

	/*
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	init_request_from_bio(req, bio);	//根据bio初始化request

	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
	    bio_flagged(bio, BIO_CPU_AFFINE))
		req->cpu = raw_smp_processor_id();

	plug = current->plug;
	if (plug) {
		/*
		 * If this is the first request added after a plug, fire
		 * of a plug trace. If others have been added before, check
		 * if we have multiple devices in this plug. If so, make a
		 * note to sort the list before dispatch.
		 */
		if (list_empty(&plug->list))
			trace_block_plug(q);
		else if (!plug->should_sort) {
			struct request *__rq;

			__rq = list_entry_rq(plug->list.prev);
			if (__rq->q != q)
				plug->should_sort = 1;
		}
		if (request_count >= BLK_MAX_REQUEST_COUNT)
			blk_flush_plug_list(plug, false);	//请求数量达到上限,进行unplug操作
		list_add_tail(&req->queuelist, &plug->list);
		drive_stat_acct(req, 1);
	} else {
		spin_lock_irq(q->queue_lock);
		add_acct_request(q, req, where);	//添加到io调度器中
		__blk_run_queue(q);	//执行unplug操作
out_unlock:
		spin_unlock_irq(q->queue_lock);
	}
out:
	return 0;
}
void __blk_run_queue(struct request_queue *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;

	q->request_fn(q);
}

接下来以scsi设备为例来分析q->request_fn函数

static void scsi_request_fn(struct request_queue *q)
{
	struct scsi_device *sdev = q->queuedata;
	struct Scsi_Host *shost;
	struct scsi_cmnd *cmd;
	struct request *req;

	if (!sdev) {
		printk("scsi: killing requests for dead queue\n");
		while ((req = blk_peek_request(q)) != NULL)
			scsi_kill_request(req, q);
		return;
	}

	if(!get_device(&sdev->sdev_gendev))
		/* We must be tearing the block queue down already */
		return;

	/*
	 * To start with, we keep looping until the queue is empty, or until
	 * the host is no longer able to accept any more requests.
	 */
	shost = sdev->host;
	for (;;) {
		int rtn;
		/*
		 * get next queueable request.  We do this early to make sure
		 * that the request is fully prepared even if we cannot 
		 * accept it.
		 */
		req = blk_peek_request(q);	//获得下一个请求
		if (!req || !scsi_dev_queue_ready(q, sdev))
			break;

		if (unlikely(!scsi_device_online(sdev))) {	//设备离线
			sdev_printk(KERN_ERR, sdev,
				    "rejecting I/O to offline device\n");
			scsi_kill_request(req, q);
			continue;
		}


		/*
		 * Remove the request from the request list.
		 */
		if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
			blk_start_request(req);
		sdev->device_busy++;

		spin_unlock(q->queue_lock);
		cmd = req->special;
		if (unlikely(cmd == NULL)) {
			printk(KERN_CRIT "impossible request in %s.\n"
					 "please mail a stack trace to "
					 "linux-scsi@vger.kernel.org\n",
					 __func__);
			blk_dump_rq_flags(req, "foo");
			BUG();
		}
		spin_lock(shost->host_lock);

		/*
		 * We hit this when the driver is using a host wide
		 * tag map. For device level tag maps the queue_depth check
		 * in the device ready fn would prevent us from trying
		 * to allocate a tag. Since the map is a shared host resource
		 * we add the dev to the starved list so it eventually gets
		 * a run when a tag is freed.
		 */
		if (blk_queue_tagged(q) && !blk_rq_tagged(req)) {
			if (list_empty(&sdev->starved_entry))
				list_add_tail(&sdev->starved_entry,
					      &shost->starved_list);
			goto not_ready;
		}

		if (!scsi_target_queue_ready(shost, sdev))	//是否可以发送命令到目标节点
			goto not_ready;

		if (!scsi_host_queue_ready(q, shost, sdev))		//是否可以发送到主机适配器
			goto not_ready;

		scsi_target(sdev)->target_busy++;
		shost->host_busy++;

		/*
		 * XXX(hch): This is rather suboptimal, scsi_dispatch_cmd will
		 *		take the lock again.
		 */
		spin_unlock_irq(shost->host_lock);

		/*
		 * Finally, initialize any error handling parameters, and set up
		 * the timers for timeouts.
		 */
		scsi_init_cmd_errh(cmd);

		/*
		 * Dispatch the command to the low-level driver.
		 */
		rtn = scsi_dispatch_cmd(cmd);	//派发命令到底层驱动
		spin_lock_irq(q->queue_lock);
		if (rtn)
			goto out_delay;
	}

	goto out;

 not_ready:
	spin_unlock_irq(shost->host_lock);

	/*
	 * lock q, handle tag, requeue req, and decrement device_busy. We
	 * must return with queue_lock held.
	 *
	 * Decrementing device_busy without checking it is OK, as all such
	 * cases (host limits or settings) should run the queue at some
	 * later time.
	 */
	spin_lock_irq(q->queue_lock);
	blk_requeue_request(q, req);
	sdev->device_busy--;
out_delay:
	if (sdev->device_busy == 0)
		blk_delay_queue(q, SCSI_QUEUE_DELAY);
out:
	/* must be careful here...if we trigger the ->remove() function
	 * we cannot be holding the q lock */
	spin_unlock_irq(q->queue_lock);
	put_device(&sdev->sdev_gendev);
	spin_lock_irq(q->queue_lock);
}
blk_peek_request主要调用了__elv_next_request和q->prep_rq_fn(q, rq),而prep_rq_fn被赋值为sd_prep_fn,主要作用是为请求构造scsi命令。

int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
	struct Scsi_Host *host = cmd->device->host;
	unsigned long timeout;
	int rtn = 0;

	atomic_inc(&cmd->device->iorequest_cnt);

	/* check if the device is still usable */
	if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
		/* in SDEV_DEL we error all commands. DID_NO_CONNECT
		 * returns an immediate error upwards, and signals
		 * that the device is no longer present */
		cmd->result = DID_NO_CONNECT << 16;
		scsi_done(cmd);
		/* return 0 (because the command has been processed) */
		goto out;
	}

	/* Check to see if the scsi lld made this device blocked. */
	if (unlikely(scsi_device_blocked(cmd->device))) {
		/* 
		 * in blocked state, the command is just put back on
		 * the device queue.  The suspend state has already
		 * blocked the queue so future requests should not
		 * occur until the device transitions out of the
		 * suspend state.
		 */

		scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY);

		SCSI_LOG_MLQUEUE(3, printk("queuecommand : device blocked \n"));

		/*
		 * NOTE: rtn is still zero here because we don't need the
		 * queue to be plugged on return (it's already stopped)
		 */
		goto out;
	}

	/* 
	 * If SCSI-2 or lower, store the LUN value in cmnd.
	 */
	if (cmd->device->scsi_level <= SCSI_2 &&
	    cmd->device->scsi_level != SCSI_UNKNOWN) {
		cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
			       (cmd->device->lun << 5 & 0xe0);
	}

	/*
	 * We will wait MIN_RESET_DELAY clock ticks after the last reset so
	 * we can avoid the drive not being ready.
	 */
	timeout = host->last_reset + MIN_RESET_DELAY;

	if (host->resetting && time_before(jiffies, timeout)) {
		int ticks_remaining = timeout - jiffies;
		/*
		 * NOTE: This may be executed from within an interrupt
		 * handler!  This is bad, but for now, it'll do.  The irq
		 * level of the interrupt handler has been masked out by the
		 * platform dependent interrupt handling code already, so the
		 * sti() here will not cause another call to the SCSI host's
		 * interrupt handler (assuming there is one irq-level per
		 * host).
		 */
		while (--ticks_remaining >= 0)
			mdelay(1 + 999 / HZ);
		host->resetting = 0;
	}

	scsi_log_send(cmd);

	/*
	 * Before we queue this command, check if the command
	 * length exceeds what the host adapter can handle.
	 */
	if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
		SCSI_LOG_MLQUEUE(3,
			printk("queuecommand : command too long. "
			       "cdb_size=%d host->max_cmd_len=%d\n",
			       cmd->cmd_len, cmd->device->host->max_cmd_len));
		cmd->result = (DID_ABORT << 16);

		scsi_done(cmd);
		goto out;
	}

	if (unlikely(host->shost_state == SHOST_DEL)) {
		cmd->result = (DID_NO_CONNECT << 16);
		scsi_done(cmd);
	} else {
		trace_scsi_dispatch_cmd_start(cmd);
		cmd->scsi_done = scsi_done;
		rtn = host->hostt->queuecommand(host, cmd);	//调用主机适配器的queuecommand回调函数
	}

	if (rtn) {
		trace_scsi_dispatch_cmd_error(cmd, rtn);
		if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
		    rtn != SCSI_MLQUEUE_TARGET_BUSY)
			rtn = SCSI_MLQUEUE_HOST_BUSY;

		scsi_queue_insert(cmd, rtn);

		SCSI_LOG_MLQUEUE(3,
		    printk("queuecommand : request rejected\n"));
	}

 out:
	SCSI_LOG_MLQUEUE(3, printk("leaving scsi_dispatch_cmnd()\n"));
	return rtn;
}

  • 2
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值