上文中,我们对调用submit_bio之后的流程进行了分析。梳理下来,一个bio在经过层层转化、最终进入硬件派发队列hctx时,会调用该队列所绑定的驱动提供的queue_rq函数,这个函数是每一个硬件派发队列对应的驱动都必须提供的。
我们之前所研究的virtio部分是挂在pci总线上面的,然而在多数情况下,硬盘是通过scsi子系统被发现并管理的,磁盘驱动为scsi子系统中的一种高级驱动sd。
因此,根据我们现有的知识,在mq体系下,scsi一定向上层提供了对应的硬件派发队列以及对应的queue_rq函数。
我们通过简单地搜索,就能找到scsi所绑定的blk_mq_ops:
//common/drivers/scsi/scsi_lib.c
/*
 * blk-mq callback table defined by the SCSI core in scsi_lib.c.
 * Requests are submitted through .queue_rq (scsi_queue_rq) and finished
 * through .complete (scsi_complete).
 */
static const struct blk_mq_ops scsi_mq_ops = {
	/* submission and completion */
	.queue_rq		= scsi_queue_rq,
	.commit_rqs		= scsi_commit_rqs,
	.complete		= scsi_complete,
	.timeout		= scsi_timeout,
	.poll			= scsi_mq_poll,
	.busy			= scsi_mq_lld_busy,

	/* budget handling */
	.get_budget		= scsi_mq_get_budget,
	.put_budget		= scsi_mq_put_budget,
	.set_rq_budget_token	= scsi_mq_set_rq_budget_token,
	.get_rq_budget_token	= scsi_mq_get_rq_budget_token,

	/* per-request and per-queue setup/teardown */
	.init_request		= scsi_mq_init_request,
	.exit_request		= scsi_mq_exit_request,
	.cleanup_rq		= scsi_cleanup_rq,
	.init_hctx		= scsi_init_hctx,
	.map_queues		= scsi_map_queues,

#ifdef CONFIG_BLK_DEBUG_FS
	/* only present when block-layer debugfs support is configured */
	.show_rq		= scsi_show_rq,
#endif
};
找到scsi_queue_rq函数:
//common/drivers/scsi/scsi_lib.c
/*
 * scsi_queue_rq - blk-mq .queue_rq callback of the SCSI core.
 *
 * Takes the request in bd->rq, runs the device/target/host readiness
 * checks, prepares the scsi_cmnd stored in the request PDU (skipped when
 * RQF_DONTPREP is already set) and passes it to scsi_dispatch_cmd().
 *
 * @hctx: hardware dispatch queue; not referenced in this function body.
 * @bd:   carries the request (bd->rq); bd->last is mirrored into SCMD_LAST.
 *
 * Returns BLK_STS_OK when scsi_dispatch_cmd() returned 0. Every other path
 * jumps to one of the unwind labels and returns the status selected before
 * the jump, after the adjustments made in the final switch.
 *
 * NOTE(review): iorequest_cnt is incremented here after a successful
 * dispatch, while scsi_dispatch_cmd() as quoted elsewhere in this document
 * also increments it on entry. The two excerpts look like they come from
 * different kernel versions - confirm which tree is being described.
 */
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
struct request *req = bd->rq;
struct request_queue *q = req->q;
struct scsi_device *sdev = q->queuedata;
struct Scsi_Host *shost = sdev->host;
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
blk_status_t ret;
int reason;
/* A negative budget token at this point is unexpected; warn once. */
WARN_ON_ONCE(cmd->budget_token < 0);
/*
* If the device is not in running state we will reject some or all
* commands.
*/
if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
ret = scsi_device_state_check(sdev, req);
if (ret != BLK_STS_OK)
goto out_put_budget;
}
/* Default status for the readiness checks that follow. */
ret = BLK_STS_RESOURCE;
if (!scsi_target_queue_ready(shost, sdev))
goto out_put_budget;
/*
 * Host in recovery: bail out with BLK_STS_RESOURCE, or with
 * BLK_STS_OFFLINE when the command carries SCMD_FAIL_IF_RECOVERING.
 */
if (unlikely(scsi_host_in_recovery(shost))) {
if (cmd->flags & SCMD_FAIL_IF_RECOVERING)
ret = BLK_STS_OFFLINE;
goto out_dec_target_busy;
}
if (!scsi_host_queue_ready(q, shost, sdev, cmd))
goto out_dec_host_busy;
/*
 * Prepare the command only if RQF_DONTPREP is not set yet, then set the
 * flag. For a request that already has it, only the COMPLETE state bit
 * is cleared.
 */
if (!(req->rq_flags & RQF_DONTPREP)) {
ret = scsi_prepare_cmd(req);
if (ret != BLK_STS_OK)
goto out_dec_host_busy;
req->rq_flags |= RQF_DONTPREP;
} else {
clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
}
/* Keep only the preserved flag bits, then set TAGGED/LAST as needed. */
cmd->flags &= SCMD_PRESERVED_FLAGS;
if (sdev->simple_tags)
cmd->flags |= SCMD_TAGGED;
if (bd->last)
cmd->flags |= SCMD_LAST;
scsi_set_resid(cmd, 0);
memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
cmd->submitter = SUBMITTED_BY_BLOCK_LAYER;
blk_mq_start_request(req);
/* Non-zero means the command was not accepted; record why and unwind. */
reason = scsi_dispatch_cmd(cmd);
if (reason) {
scsi_set_blocked(cmd, reason);
ret = BLK_STS_RESOURCE;
goto out_dec_host_busy;
}
atomic_inc(&cmd->device->iorequest_cnt);
return BLK_STS_OK;
/*
 * Unwind path: each label falls through to the ones below it, so a later
 * failure undoes the host-busy count, the target-busy count and the budget.
 */
out_dec_host_busy:
scsi_dec_host_busy(shost, cmd);
out_dec_target_busy:
if (scsi_target(sdev)->can_queue > 0)
atomic_dec(&scsi_target(sdev)->target_busy);
out_put_budget:
scsi_mq_put_budget(q, cmd->budget_token);
cmd->budget_token = -1;
/* Adjust the status and the command result according to the failure. */
switch (ret) {
case BLK_STS_OK:
break;
case BLK_STS_RESOURCE:
case BLK_STS_ZONE_RESOURCE:
/* Report BLK_STS_DEV_RESOURCE instead when the device is blocked. */
if (scsi_device_blocked(sdev))
ret = BLK_STS_DEV_RESOURCE;
break;
case BLK_STS_AGAIN:
cmd->result = DID_BUS_BUSY << 16;
if (req->rq_flags & RQF_DONTPREP)
scsi_mq_uninit_cmd(cmd);
break;
default:
if (unlikely(!scsi_device_online(sdev)))
cmd->result = DID_NO_CONNECT << 16;
else
cmd->result = DID_ERROR << 16;
/*
* Make sure to release all allocated resources when
* we hit an error, as we will never see this command
* again.
*/
if (req->rq_flags & RQF_DONTPREP)
scsi_mq_uninit_cmd(cmd);
scsi_run_queue_async(sdev);
break;
}
return ret;
}
这里面虽然没有和zone有关的内容,但是不要慌张,浅浅地回忆一下,进入到这个函数之后大致要完成的工作是,把队列中的request再转化为对硬件的command,接着下发command到硬件,完成io。
也就是说,对于request的解析,一定是在command生成之前的。
在上面的代码中,调用scsi_prepare_cmd之前的部分都是在做一些必要的检查,确保队列、硬件处于正常工作的状态;接着,在`if (!(req->rq_flags & RQF_DONTPREP))`分支中出现了一个关键的函数scsi_prepare_cmd,顾名思义,command可能会在这个函数中进行初始化。
所以我们进入到这个函数:
//common/drivers/scsi/scsi_lib.c
/*
 * scsi_prepare_cmd - reset the scsi_cmnd embedded in @req and have its
 * contents filled in.
 *
 * Passthrough requests are completed by scsi_setup_scsi_cmnd(). Every other
 * request first goes through the optional device-handler prep_fn() and is
 * then handed to the init_command() hook of the driver returned by
 * scsi_cmd_to_driver().
 *
 * Returns the blk_status_t of whichever of those callees ran last.
 */
static blk_status_t scsi_prepare_cmd(struct request *req)
{
	struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
	struct scsi_device *sdev = req->q->queuedata;
	struct Scsi_Host *shost = sdev->host;
	/* Sampled up front because cmd->state is cleared further down. */
	bool was_in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);

	scsi_init_command(sdev, cmd);

	/* Reset the per-command bookkeeping fields. */
	cmd->state = 0;
	if (was_in_flight)
		__set_bit(SCMD_STATE_INFLIGHT, &cmd->state);
	cmd->result = 0;
	cmd->submitter = 0;
	cmd->eh_eflags = 0;
	cmd->host_scribble = NULL;
	cmd->underflow = 0;
	cmd->transfersize = 0;
	cmd->extra_len = 0;
	cmd->prot_type = 0;
	cmd->prot_flags = 0;
	cmd->prot_op = SCSI_PROT_NORMAL;
	memset(&cmd->sdb, 0, sizeof(cmd->sdb));

	/*
	 * Only clear the driver-private command data if the LLD does not supply
	 * a function to initialize that data.
	 */
	if (!shost->hostt->init_cmd_priv)
		memset(cmd + 1, 0, shost->hostt->cmd_size);

	cmd->sc_data_direction = blk_rq_bytes(req) ? rq_dma_dir(req) : DMA_NONE;

	/* The data scatterlist starts right behind the driver-private area. */
	cmd->sdb.table.sgl = (void *)cmd + sizeof(struct scsi_cmnd) +
			     shost->hostt->cmd_size;

	if (scsi_host_get_prot(shost)) {
		memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));
		cmd->prot_sdb->table.sgl =
			(struct scatterlist *)(cmd->prot_sdb + 1);
	}

	/* Passthrough commands don't go to the ULP at all. */
	if (blk_rq_is_passthrough(req))
		return scsi_setup_scsi_cmnd(sdev, req);

	if (sdev->handler && sdev->handler->prep_fn) {
		blk_status_t status = sdev->handler->prep_fn(sdev, req);

		if (status != BLK_STS_OK)
			return status;
	}

	/* Usually overridden by the ULP */
	cmd->allowed = 0;
	memset(cmd->cmnd, 0, sizeof(cmd->cmnd));
	return scsi_cmd_to_driver(cmd)->init_command(cmd);
}
要知道scsi层里面,高级驱动可不止sd一个,因此,我们可以猜测这个函数只是在做一些通用性的命令初始化,对于特异性的初始化,一定会转交sd驱动处理。所以直接看函数的最后一行:它调用了该cmd所属驱动的init_command函数。
让我们看看sd是如何处理的:
//common/drivers/scsi/sd.c
/*
 * scsi_driver description of the sd driver (sd.c): the embedded generic
 * driver entry plus the sd-specific callbacks. .init_command builds the
 * command for a request and .done is the driver's completion hook.
 */
static struct scsi_driver sd_template = {
	.gendrv = {
		.owner		= THIS_MODULE,
		.name		= "sd",
		.probe_type	= PROBE_PREFER_ASYNCHRONOUS,
		.probe		= sd_probe,
		.remove		= sd_remove,
		.shutdown	= sd_shutdown,
		.pm		= &sd_pm_ops,
	},

	/* command set-up, tear-down and completion */
	.init_command		= sd_init_command,
	.uninit_command		= sd_uninit_command,
	.done			= sd_done,

	/* rescan and error-handling hooks */
	.rescan			= sd_rescan,
	.eh_action		= sd_eh_action,
	.eh_reset		= sd_eh_reset,
};
上面的结构体描述了一个scsi_driver的一系列回调函数,关于scsi我们之后再系统地学习,这里我们先去了解它的init_command函数:
//common/drivers/scsi/sd.c
/*
 * Build the command for a REQ_OP_DISCARD request according to the disk's
 * provisioning_mode. A mode without a matching set-up routine yields
 * BLK_STS_TARGET.
 */
static blk_status_t sd_init_discard_command(struct scsi_cmnd *cmd,
					     struct request *req)
{
	switch (scsi_disk(req->q->disk)->provisioning_mode) {
	case SD_LBP_UNMAP:
		return sd_setup_unmap_cmnd(cmd);
	case SD_LBP_WS16:
		return sd_setup_write_same16_cmnd(cmd, true);
	case SD_LBP_WS10:
		return sd_setup_write_same10_cmnd(cmd, true);
	case SD_LBP_ZERO:
		return sd_setup_write_same10_cmnd(cmd, false);
	default:
		return BLK_STS_TARGET;
	}
}

/*
 * sd_init_command - sd's init_command hook.
 *
 * Selects the command set-up routine from the operation of the request that
 * owns @cmd and returns that routine's status. An operation that is not
 * listed triggers a one-time warning and returns BLK_STS_NOTSUPP.
 */
static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
{
	struct request *req = scsi_cmd_to_rq(cmd);

	switch (req_op(req)) {
	case REQ_OP_DISCARD:
		return sd_init_discard_command(cmd, req);

	case REQ_OP_WRITE_ZEROES:
		return sd_setup_write_zeroes_cmnd(cmd);

	case REQ_OP_FLUSH:
		return sd_setup_flush_cmnd(cmd);

	/* Plain reads/writes and zone append share one set-up routine. */
	case REQ_OP_READ:
	case REQ_OP_WRITE:
	case REQ_OP_ZONE_APPEND:
		return sd_setup_read_write_cmnd(cmd);

	/* Zone management: the last argument is true only for "reset all". */
	case REQ_OP_ZONE_RESET:
		return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
						   false);
	case REQ_OP_ZONE_RESET_ALL:
		return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
						   true);
	case REQ_OP_ZONE_OPEN:
		return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false);
	case REQ_OP_ZONE_CLOSE:
		return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false);
	case REQ_OP_ZONE_FINISH:
		return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false);

	default:
		WARN_ON_ONCE(1);
		return BLK_STS_NOTSUPP;
	}
}
到这里之后,就完成了将request转化为command的操作。在此处,REQ_OP的作用是区分不同的操作类型,从而为每一种操作生成正确的command。
之后,系统会在scsi_dispatch_cmd函数中,将准备好的command下发到低层驱动。
/*
 * scsi_dispatch_cmd - hand a prepared command to the host template's
 * queuecommand() hook.
 *
 * Returns 0 when queuecommand() returned 0, and also when the command is
 * failed locally through the "done" label (cmd->result is set and
 * cmd->scsi_done() is called before returning). Otherwise returns
 * SCSI_MLQUEUE_DEVICE_BUSY, SCSI_MLQUEUE_TARGET_BUSY or
 * SCSI_MLQUEUE_HOST_BUSY.
 *
 * NOTE(review): iorequest_cnt is incremented on entry here, while
 * scsi_queue_rq() as quoted elsewhere in this document increments it again
 * after a successful dispatch. The two excerpts look like they come from
 * different kernel versions - confirm which tree is being described.
 */
static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
struct Scsi_Host *host = cmd->device->host;
int rtn = 0;
/* Incremented unconditionally, before any of the checks below. */
atomic_inc(&cmd->device->iorequest_cnt);
/* check if the device is still usable */
if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
/* in SDEV_DEL we error all commands. DID_NO_CONNECT
* returns an immediate error upwards, and signals
* that the device is no longer present */
cmd->result = DID_NO_CONNECT << 16;
goto done;
}
/* Check to see if the scsi lld made this device blocked. */
if (unlikely(scsi_device_blocked(cmd->device))) {
/*
* in blocked state, the command is just put back on
* the device queue. The suspend state has already
* blocked the queue so future requests should not
* occur until the device transitions out of the
* suspend state.
*/
SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
"queuecommand : device blocked\n"));
return SCSI_MLQUEUE_DEVICE_BUSY;
}
/* Store the LUN value in cmnd, if needed. */
/* The low 3 bits of the LUN go into the top 3 bits of CDB byte 1. */
if (cmd->device->lun_in_cdb)
cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
(cmd->device->lun << 5 & 0xe0);
scsi_log_send(cmd);
/*
* Before we queue this command, check if the command
* length exceeds what the host adapter can handle.
*/
if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
"queuecommand : command too long. "
"cdb_size=%d host->max_cmd_len=%d\n",
cmd->cmd_len, cmd->device->host->max_cmd_len));
cmd->result = (DID_ABORT << 16);
goto done;
}
/* A host in SHOST_DEL state gets the same result as a deleted device. */
if (unlikely(host->shost_state == SHOST_DEL)) {
cmd->result = (DID_NO_CONNECT << 16);
goto done;
}
trace_scsi_dispatch_cmd_start(cmd);
rtn = host->hostt->queuecommand(host, cmd);
if (rtn) {
trace_scsi_dispatch_cmd_error(cmd, rtn);
/* Any return other than DEVICE_BUSY/TARGET_BUSY becomes HOST_BUSY. */
if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
rtn != SCSI_MLQUEUE_TARGET_BUSY)
rtn = SCSI_MLQUEUE_HOST_BUSY;
SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
"queuecommand : request rejected\n"));
}
return rtn;
/* Local failure: report the result via scsi_done() and return 0. */
done:
cmd->scsi_done(cmd);
return 0;
}
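queuecommand是低层驱动中一个最关键的函数,将在之后学习scsi的时候再细说。命令处理完成后(或者像上面done标签处那样在下发前就失败时),cmd->scsi_done函数会被调用,向上层报告结果。这个函数是在scsi_queue_rq函数中被设置为scsi_mq_done的。需要注意的是,上文引用的scsi_queue_rq中并没有这一行赋值(对应位置是cmd->submitter = SUBMITTED_BY_BLOCK_LAYER),两段代码应当来自不同的内核版本;下面这行赋值出现在仍保留cmd->scsi_done回调的版本中: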
cmd->scsi_done = scsi_mq_done;
//common/drivers/scsi/scsi_lib.c
static void scsi_mq_done(struct scsi_cmnd *cmd)
{
if (unlikely(blk_should_fake_timeout(scsi_cmd_to_rq(cmd)->q)))
return;
if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
return;
trace_scsi_dispatch_cmd_done(cmd);
blk_mq_complete_request(scsi_cmd_to_rq(cmd));
}
//common/block/blk-mq.c
/*
 * blk_mq_complete_request - complete @rq.
 *
 * When blk_mq_complete_request_remote() returns true nothing more is done
 * here; otherwise the queue's ->complete callback is invoked directly.
 */
void blk_mq_complete_request(struct request *rq)
{
	if (blk_mq_complete_request_remote(rq))
		return;

	rq->q->mq_ops->complete(rq);
}
最终调用了mq_ops里的complete函数。
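对scsi而言,这个complete函数就是前面scsi_mq_ops中注册的scsi_complete;它在处理完成的命令时会调用高层驱动的.done回调(sd对应的是上面sd_template中的sd_done),而对于zoned设备,sd_done会进一步调用sd_zbc.c中提供的sd_zbc_complete: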
//common/drivers/scsi/sd_zbc.c
/*
 * sd_zbc_complete - zone-specific processing for a finished command.
 * @cmd:        the command being completed
 * @good_bytes: byte count supplied by the caller
 * @sshdr:      sense header; only examined for a failed zone management
 *              request
 *
 * Returns @good_bytes, or the value produced by sd_zbc_zone_wp_update()
 * when the write pointer cache had to be updated.
 */
unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
			     struct scsi_sense_hdr *sshdr)
{
	struct request *rq = scsi_cmd_to_rq(cmd);
	int result = cmd->result;

	if (!op_is_zone_mgmt(req_op(rq)) || !result ||
	    sshdr->sense_key != ILLEGAL_REQUEST || sshdr->asc != 0x24) {
		/* Normal case: refresh the cached write pointer if required. */
		if (sd_zbc_need_zone_wp_update(rq))
			good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes);
	} else {
		/*
		 * INVALID FIELD IN CDB error: a zone management command was
		 * attempted on a conventional zone. Nothing to worry about,
		 * so be quiet about the error.
		 */
		rq->rq_flags |= RQF_QUIET;
	}

	/* Zone append additionally releases the zone write lock. */
	if (req_op(rq) == REQ_OP_ZONE_APPEND)
		blk_req_zone_write_unlock(rq);

	return good_bytes;
}
完成函数中,根据下层返回的结果,移动写指针到正确的位置上。
/*
 * sd_zbc_zone_wp_update - update the cached write pointer offset of the
 * zone targeted by a finished command.
 * @cmd:        the command whose result is being processed
 * @good_bytes: byte count supplied by the caller
 *
 * All accesses to sdkp->zones_wp_offset[] in this function happen with
 * zones_wp_offset_lock held.
 *
 * Returns @good_bytes, except for a failed zone append where 0 is returned.
 */
static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
unsigned int good_bytes)
{
int result = cmd->result;
struct request *rq = scsi_cmd_to_rq(cmd);
struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
unsigned int zno = blk_rq_zone_no(rq);
enum req_opf op = req_op(rq);
unsigned long flags;
/*
* If we got an error for a command that needs updating the write
* pointer offset cache, we must mark the zone wp offset entry as
* invalid to force an update from disk the next time a zone append
* command is issued.
*/
spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);
/* A failed "reset all" is not handled here; it reaches the switch below. */
if (result && op != REQ_OP_ZONE_RESET_ALL) {
if (op == REQ_OP_ZONE_APPEND) {
/* Force complete completion (no retry) */
good_bytes = 0;
scsi_set_resid(cmd, blk_rq_bytes(rq));
}
/*
* Force an update of the zone write pointer offset on
* the next zone append access.
*/
if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST;
goto unlock_wp_offset;
}
switch (op) {
case REQ_OP_ZONE_APPEND:
/* Add the cached offset to the request's sector before advancing it. */
rq->__sector += sdkp->zones_wp_offset[zno];
fallthrough;
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
case REQ_OP_WRITE:
/* Advance by the sectors written, unless the offset is already at zone size. */
if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp))
sdkp->zones_wp_offset[zno] +=
good_bytes >> SECTOR_SHIFT;
break;
case REQ_OP_ZONE_RESET:
sdkp->zones_wp_offset[zno] = 0;
break;
case REQ_OP_ZONE_FINISH:
/* A finished zone has its offset set to the full zone size. */
sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp);
break;
case REQ_OP_ZONE_RESET_ALL:
/* Clear the cached offset of every zone. */
memset(sdkp->zones_wp_offset, 0,
sdkp->nr_zones * sizeof(unsigned int));
break;
default:
break;
}
unlock_wp_offset:
spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);
return good_bytes;
}
移动指针的函数并不难理解,根据执行的指令进行计算,修改写指针偏移即可。
写到这里,我们可以得出一个zone相关命令的完整流程。
所以,根据以上流程,如果一个设备支持zone且已经被scsi正确适配,要使其处理bio层下发的request,以下回调函数不可或缺:
scsi低层驱动提供的.queuecommand函数,它要能接受zone相关命令,并实际完成读写后向上层返回结果;
对应驱动的.init_command函数,它要能正确地生成低层设备能够处理的命令;
mq_ops中的.complete函数,以及由它触发的高层驱动.done回调(sd中即sd_done及其调用的sd_zbc_complete),它们要能在低层设备处理完成后,成功移动写指针到对应位置。