Linux kernel | scsi层是如何处理zone相关request的

上文中,我们分析了调用submit_bio提交bio之后的流程。经过梳理可知,一个bio在经过层层转化、最后进入到硬件派发队列hctx时,会调用该队列所绑定的驱动提供的queue_rq函数,这个函数是每一个硬件派发队列对应的驱动都必须提供的。

outside_default.png

我们之前所研究的virtio部分是挂在pci总线上面的,然而在多数情况下,硬盘是通过scsi子系统被发现并管理的,磁盘驱动为scsi子系统中的一种高级驱动sd。

因此,根据我们现有的知识,在mq体系下,scsi一定向上层提供了对应的硬件派发队列以及对应的queue_rq函数。

我们通过简单地搜索,就能找到scsi所绑定的blk_mq_ops:

//common/drivers/scsi/scsi_lib.c
/*
 * blk-mq operations table for SCSI request queues. Every callback the
 * block layer's multiqueue code may invoke on a SCSI queue is wired up
 * here; .queue_rq is the entry point used to dispatch a request.
 */
static const struct blk_mq_ops scsi_mq_ops = {
  /* request dispatch, completion and timeout */
  .queue_rq = scsi_queue_rq,
  .commit_rqs = scsi_commit_rqs,
  .complete = scsi_complete,
  .timeout = scsi_timeout,
  .poll = scsi_mq_poll,
  .busy = scsi_mq_lld_busy,

  /* budget handling */
  .get_budget = scsi_mq_get_budget,
  .put_budget = scsi_mq_put_budget,
  .set_rq_budget_token = scsi_mq_set_rq_budget_token,
  .get_rq_budget_token = scsi_mq_get_rq_budget_token,

  /* request and hardware-context setup / teardown */
  .init_request = scsi_mq_init_request,
  .exit_request = scsi_mq_exit_request,
  .cleanup_rq = scsi_cleanup_rq,
  .init_hctx = scsi_init_hctx,
  .map_queues = scsi_map_queues,

#ifdef CONFIG_BLK_DEBUG_FS
  /* only built when block-layer debugfs support is configured */
  .show_rq = scsi_show_rq,
#endif
};

找到scsi_queue_rq函数:

//common/drivers/scsi/scsi_lib.c
/*
 * scsi_queue_rq - the .queue_rq callback registered in scsi_mq_ops.
 * @hctx: hardware dispatch context the request was queued on.
 * @bd:   queue data; bd->rq is the request to dispatch and bd->last is
 *        used to set SCMD_LAST on the command.
 *
 * Checks that the device, target and host can accept a command, prepares
 * the scsi_cmnd stored in the request PDU (once per request, guarded by
 * RQF_DONTPREP) and passes it to scsi_dispatch_cmd().
 *
 * Return: BLK_STS_OK when scsi_dispatch_cmd() accepted the command.
 * On any failure the goto ladder at the bottom undoes the host-busy,
 * target-busy and budget accounting in reverse order of acquisition and
 * the final switch may rewrite the status before it is returned.
 */
static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
       const struct blk_mq_queue_data *bd)
{
  struct request *req = bd->rq;
  struct request_queue *q = req->q;
  struct scsi_device *sdev = q->queuedata;
  struct Scsi_Host *shost = sdev->host;
  struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
  blk_status_t ret;
  int reason;


  /* A negative budget token is not expected at this point. */
  WARN_ON_ONCE(cmd->budget_token < 0);


  /*
   * If the device is not in running state we will reject some or all
   * commands.
   */
  if (unlikely(sdev->sdev_state != SDEV_RUNNING)) {
    ret = scsi_device_state_check(sdev, req);
    if (ret != BLK_STS_OK)
      goto out_put_budget;
  }


  /* Status reported if any of the readiness checks below fails. */
  ret = BLK_STS_RESOURCE;
  if (!scsi_target_queue_ready(shost, sdev))
    goto out_put_budget;
  if (unlikely(scsi_host_in_recovery(shost))) {
    /* Commands flagged SCMD_FAIL_IF_RECOVERING get BLK_STS_OFFLINE. */
    if (cmd->flags & SCMD_FAIL_IF_RECOVERING)
      ret = BLK_STS_OFFLINE;
    goto out_dec_target_busy;
  }
  if (!scsi_host_queue_ready(q, shost, sdev, cmd))
    goto out_dec_host_busy_placeholder_never_used;


  /*
   * Prepare the command only the first time the request is seen;
   * RQF_DONTPREP marks a request that has already been prepared, in
   * which case only the COMPLETE state bit is cleared.
   */
  if (!(req->rq_flags & RQF_DONTPREP)) {
    ret = scsi_prepare_cmd(req);
    if (ret != BLK_STS_OK)
      goto out_dec_host_busy;
    req->rq_flags |= RQF_DONTPREP;
  } else {
    clear_bit(SCMD_STATE_COMPLETE, &cmd->state);
  }


  /* Keep only the preserved flags, then set the per-dispatch ones. */
  cmd->flags &= SCMD_PRESERVED_FLAGS;
  if (sdev->simple_tags)
    cmd->flags |= SCMD_TAGGED;
  if (bd->last)
    cmd->flags |= SCMD_LAST;


  /* Reset residual count and sense data before (re)issuing. */
  scsi_set_resid(cmd, 0);
  memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
  cmd->submitter = SUBMITTED_BY_BLOCK_LAYER;


  blk_mq_start_request(req);
  reason = scsi_dispatch_cmd(cmd);
  if (reason) {
    /* Non-zero means the command was not accepted; record why. */
    scsi_set_blocked(cmd, reason);
    ret = BLK_STS_RESOURCE;
    goto out_dec_host_busy;
  }


  atomic_inc(&cmd->device->iorequest_cnt);
  return BLK_STS_OK;


out_dec_host_busy:
  scsi_dec_host_busy(shost, cmd);
out_dec_target_busy:
  /* target_busy is only decremented for targets with can_queue > 0. */
  if (scsi_target(sdev)->can_queue > 0)
    atomic_dec(&scsi_target(sdev)->target_busy);
out_put_budget:
  scsi_mq_put_budget(q, cmd->budget_token);
  cmd->budget_token = -1;
  switch (ret) {
  case BLK_STS_OK:
    break;
  case BLK_STS_RESOURCE:
  case BLK_STS_ZONE_RESOURCE:
    /* Report a device-level resource shortage if the device is blocked. */
    if (scsi_device_blocked(sdev))
      ret = BLK_STS_DEV_RESOURCE;
    break;
  case BLK_STS_AGAIN:
    cmd->result = DID_BUS_BUSY << 16;
    /* Release what scsi_prepare_cmd() set up, if it ran. */
    if (req->rq_flags & RQF_DONTPREP)
      scsi_mq_uninit_cmd(cmd);
    break;
  default:
    if (unlikely(!scsi_device_online(sdev)))
      cmd->result = DID_NO_CONNECT << 16;
    else
      cmd->result = DID_ERROR << 16;
    /*
     * Make sure to release all allocated resources when
     * we hit an error, as we will never see this command
     * again.
     */
    if (req->rq_flags & RQF_DONTPREP)
      scsi_mq_uninit_cmd(cmd);
    scsi_run_queue_async(sdev);
    break;
  }
  return ret;
}

这里面虽然没有和zone有关的内容,但是不要慌张,浅浅地回忆一下,进入到这个函数之后大致要完成的工作是,把队列中的request再转化为对硬件的command,接着下发command到硬件,完成io。

也就是说,对于request的解析,一定是在command生成之前的。

在上面的代码中,调用scsi_prepare_cmd之前的部分都是在做一些必要的检查,确保设备、target和host处于可以接收命令的状态;接着在`if (!(req->rq_flags & RQF_DONTPREP))`分支里,出现了一个关键的函数scsi_prepare_cmd,顾名思义,command很可能就是在这个函数中进行初始化的。

所以我们进入到这个函数:

//common/drivers/scsi/scsi_lib.c
/*
 * scsi_prepare_cmd - initialise the scsi_cmnd of a request.
 * @req: request whose PDU holds the scsi_cmnd to set up.
 *
 * Resets the generic per-command fields, points the data (and, when the
 * host reports protection support, the protection) scatterlist at its
 * storage, and then hands over to the type-specific setup: passthrough
 * requests go to scsi_setup_scsi_cmnd(), everything else optionally
 * passes through the device handler's prep_fn and finally the
 * init_command callback of the command's driver.
 *
 * Return: BLK_STS_OK on success, otherwise the status returned by the
 * setup routine that failed.
 */
static blk_status_t scsi_prepare_cmd(struct request *req)
{
  struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
  struct scsi_device *sdev = req->q->queuedata;
  struct Scsi_Host *shost = sdev->host;
  /* Sample the INFLIGHT bit now; cmd->state is zeroed further down. */
  bool in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state);
  struct scatterlist *sg;


  scsi_init_command(sdev, cmd);


  cmd->eh_eflags = 0;
  cmd->prot_type = 0;
  cmd->prot_flags = 0;
  cmd->submitter = 0;
  memset(&cmd->sdb, 0, sizeof(cmd->sdb));
  cmd->underflow = 0;
  cmd->transfersize = 0;
  cmd->host_scribble = NULL;
  cmd->result = 0;
  cmd->extra_len = 0;
  cmd->state = 0;
  /* Restore the INFLIGHT bit that was saved before the reset. */
  if (in_flight)
    __set_bit(SCMD_STATE_INFLIGHT, &cmd->state);


  /*
   * Only clear the driver-private command data if the LLD does not supply
   * a function to initialize that data.
   */
  if (!shost->hostt->init_cmd_priv)
    memset(cmd + 1, 0, shost->hostt->cmd_size);


  cmd->prot_op = SCSI_PROT_NORMAL;
  /* Requests that carry no bytes have no DMA direction. */
  if (blk_rq_bytes(req))
    cmd->sc_data_direction = rq_dma_dir(req);
  else
    cmd->sc_data_direction = DMA_NONE;


  /*
   * The data scatterlist is located right behind the scsi_cmnd and the
   * cmd_size bytes of driver-private data that follow it.
   */
  sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size;
  cmd->sdb.table.sgl = sg;


  if (scsi_host_get_prot(shost)) {
    memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer));


    /* The protection scatterlist directly follows the prot_sdb. */
    cmd->prot_sdb->table.sgl =
      (struct scatterlist *)(cmd->prot_sdb + 1);
  }


  /*
   * Special handling for passthrough commands, which don't go to the ULP
   * at all:
   */
  if (blk_rq_is_passthrough(req))
    return scsi_setup_scsi_cmnd(sdev, req);


  /* Give an attached device handler the chance to prepare or reject. */
  if (sdev->handler && sdev->handler->prep_fn) {
    blk_status_t ret = sdev->handler->prep_fn(sdev, req);


    if (ret != BLK_STS_OK)
      return ret;
  }


  /* Usually overridden by the ULP */
  cmd->allowed = 0;
  memset(cmd->cmnd, 0, sizeof(cmd->cmnd));
  /* Driver-specific command construction (sd_init_command for sd). */
  return scsi_cmd_to_driver(cmd)->init_command(cmd);
}

要知道scsi层里面,高级驱动可不止sd一个,因此,我们可以猜测这个函数只是在做一些通用性的命令初始化,对于特异性的初始化,一定会转交sd驱动处理。所以直接看函数的最后一行:它通过scsi_cmd_to_driver(cmd)找到cmd所属的驱动,并调用该驱动的init_command函数。

让我们看看sd是如何处理的:

//common/drivers/scsi/sd.c
/*
 * Driver description for the SCSI disk ("sd") driver: the generic
 * driver-core part in .gendrv plus the SCSI-specific callbacks, among
 * them .init_command, which builds the command for a request.
 */
static struct scsi_driver sd_template = {
  /* SCSI-specific callbacks */
  .init_command = sd_init_command,
  .uninit_command = sd_uninit_command,
  .done = sd_done,
  .rescan = sd_rescan,
  .eh_action = sd_eh_action,
  .eh_reset = sd_eh_reset,

  /* generic driver-core part */
  .gendrv = {
    .name = "sd",
    .owner = THIS_MODULE,
    .probe_type = PROBE_PREFER_ASYNCHRONOUS,
    .probe = sd_probe,
    .remove = sd_remove,
    .shutdown = sd_shutdown,
    .pm = &sd_pm_ops,
  },
};

上面的结构体描述了一个scsi_driver的一系列回调函数,关于scsi我们之后再系统地学习,这里我们先去了解它的init_command函数:

//common/drivers/scsi/sd.c
static blk_status_t sd_init_command(struct scsi_cmnd *cmd)
{
  struct request *rq = scsi_cmd_to_rq(cmd);


  switch (req_op(rq)) {
  case REQ_OP_DISCARD:
    switch (scsi_disk(rq->q->disk)->provisioning_mode) {
    case SD_LBP_UNMAP:
      return sd_setup_unmap_cmnd(cmd);
    case SD_LBP_WS16:
      return sd_setup_write_same16_cmnd(cmd, true);
    case SD_LBP_WS10:
      return sd_setup_write_same10_cmnd(cmd, true);
    case SD_LBP_ZERO:
      return sd_setup_write_same10_cmnd(cmd, false);
    default:
      return BLK_STS_TARGET;
    }
  case REQ_OP_WRITE_ZEROES:
    return sd_setup_write_zeroes_cmnd(cmd);
  case REQ_OP_FLUSH:
    return sd_setup_flush_cmnd(cmd);
  case REQ_OP_READ:
  case REQ_OP_WRITE:
  case REQ_OP_ZONE_APPEND:
    return sd_setup_read_write_cmnd(cmd);
  case REQ_OP_ZONE_RESET:
    return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
               false);
  case REQ_OP_ZONE_RESET_ALL:
    return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_RESET_WRITE_POINTER,
               true);
  case REQ_OP_ZONE_OPEN:
    return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_OPEN_ZONE, false);
  case REQ_OP_ZONE_CLOSE:
    return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_CLOSE_ZONE, false);
  case REQ_OP_ZONE_FINISH:
    return sd_zbc_setup_zone_mgmt_cmnd(cmd, ZO_FINISH_ZONE, false);
  default:
    WARN_ON_ONCE(1);
    return BLK_STS_NOTSUPP;
  }
}

到这里,就完成了将request转化为command的操作。REQ_OP在这里的作用,是区分不同的操作类型,从而为每一种操作生成对应的command。

outside_default.png

之后,系统会在scsi_dispatch_cmd函数中,将准备好的command下发到低层驱动。

static int scsi_dispatch_cmd(struct scsi_cmnd *cmd)
{
  struct Scsi_Host *host = cmd->device->host;
  int rtn = 0;


  atomic_inc(&cmd->device->iorequest_cnt);


  /* check if the device is still usable */
  if (unlikely(cmd->device->sdev_state == SDEV_DEL)) {
    /* in SDEV_DEL we error all commands. DID_NO_CONNECT
     * returns an immediate error upwards, and signals
     * that the device is no longer present */
    cmd->result = DID_NO_CONNECT << 16;
    goto done;
  }


  /* Check to see if the scsi lld made this device blocked. */
  if (unlikely(scsi_device_blocked(cmd->device))) {
    /*
     * in blocked state, the command is just put back on
     * the device queue.  The suspend state has already
     * blocked the queue so future requests should not
     * occur until the device transitions out of the
     * suspend state.
     */
    SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
      "queuecommand : device blocked\n"));
    return SCSI_MLQUEUE_DEVICE_BUSY;
  }


  /* Store the LUN value in cmnd, if needed. */
  if (cmd->device->lun_in_cdb)
    cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) |
             (cmd->device->lun << 5 & 0xe0);


  scsi_log_send(cmd);


  /*
   * Before we queue this command, check if the command
   * length exceeds what the host adapter can handle.
   */
  if (cmd->cmd_len > cmd->device->host->max_cmd_len) {
    SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
             "queuecommand : command too long. "
             "cdb_size=%d host->max_cmd_len=%d\n",
             cmd->cmd_len, cmd->device->host->max_cmd_len));
    cmd->result = (DID_ABORT << 16);
    goto done;
  }


  if (unlikely(host->shost_state == SHOST_DEL)) {
    cmd->result = (DID_NO_CONNECT << 16);
    goto done;


  }


  trace_scsi_dispatch_cmd_start(cmd);
  rtn = host->hostt->queuecommand(host, cmd);
  if (rtn) {
    trace_scsi_dispatch_cmd_error(cmd, rtn);
    if (rtn != SCSI_MLQUEUE_DEVICE_BUSY &&
        rtn != SCSI_MLQUEUE_TARGET_BUSY)
      rtn = SCSI_MLQUEUE_HOST_BUSY;


    SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd,
      "queuecommand : request rejected\n"));
  }


  return rtn;
 done:
  cmd->scsi_done(cmd);
  return 0;
}

queuecommand是低层驱动中一个最关键的函数,将在之后学习scsi的时候再细说。低层驱动处理完一个command之后,会调用cmd->scsi_done向上层报告结果(上面代码中done标签处的出错路径也是直接调用它)。这个回调是在scsi_queue_rq函数中被赋值为scsi_mq_done的;需要注意,前文所贴的scsi_queue_rq并不包含这一行赋值,两段代码可能来自不同的内核版本:

cmd->scsi_done = scsi_mq_done;
//common/drivers/scsi/scsi_lib.c
static void scsi_mq_done(struct scsi_cmnd *cmd)
{
  if (unlikely(blk_should_fake_timeout(scsi_cmd_to_rq(cmd)->q)))
    return;
  if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state)))
    return;
  trace_scsi_dispatch_cmd_done(cmd);
  blk_mq_complete_request(scsi_cmd_to_rq(cmd));
}
//common/block/blk-mq.c
/*
 * blk_mq_complete_request - complete a request.
 * @rq: request to complete.
 *
 * If blk_mq_complete_request_remote() reports that it took care of the
 * completion, nothing more is done here; otherwise the queue's
 * ->complete callback is invoked directly.
 */
void blk_mq_complete_request(struct request *rq)
{
  if (blk_mq_complete_request_remote(rq))
    return;

  rq->q->mq_ops->complete(rq);
}

最终调用了mq_ops里的complete函数,对scsi来说就是前面scsi_mq_ops中注册的scsi_complete。

对于zoned设备,sd_zbc.c里也提供了完成阶段的处理函数sd_zbc_complete:

//common/drivers/scsi/sd_zbc.c
unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes,
         struct scsi_sense_hdr *sshdr)
{
  int result = cmd->result;
  struct request *rq = scsi_cmd_to_rq(cmd);


  if (op_is_zone_mgmt(req_op(rq)) &&
      result &&
      sshdr->sense_key == ILLEGAL_REQUEST &&
      sshdr->asc == 0x24) {
    /*
     * INVALID FIELD IN CDB error: a zone management command was
     * attempted on a conventional zone. Nothing to worry about,
     * so be quiet about the error. 允许区块管理命令操作传统区块
     */
    rq->rq_flags |= RQF_QUIET;
  } else if (sd_zbc_need_zone_wp_update(rq))
    good_bytes = sd_zbc_zone_wp_update(cmd, good_bytes);
    // 更新块指针
  if (req_op(rq) == REQ_OP_ZONE_APPEND)
    blk_req_zone_write_unlock(rq);
    // 对于写追加操作的特殊处理
  return good_bytes;
}

完成函数中,根据下层返回的结果,移动写指针到正确的位置上。

/*
 * sd_zbc_zone_wp_update - update the cached zone write pointer offset.
 * @cmd:        the completed command.
 * @good_bytes: number of bytes considered successfully transferred.
 *
 * Adjusts sdkp->zones_wp_offset[] for the zone the request targets,
 * according to the request operation and the command result. All
 * accesses to the array happen under zones_wp_offset_lock.
 *
 * Return: @good_bytes, or 0 for a zone append that failed.
 *
 * NOTE(review): this snippet reads the disk through rq->rq_disk while
 * sd_init_command in this file uses rq->q->disk; the two snippets may
 * come from different kernel versions - confirm before reusing.
 */
static unsigned int sd_zbc_zone_wp_update(struct scsi_cmnd *cmd,
            unsigned int good_bytes)
{
  int result = cmd->result;
  struct request *rq = scsi_cmd_to_rq(cmd);
  struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
  unsigned int zno = blk_rq_zone_no(rq);
  enum req_opf op = req_op(rq);
  unsigned long flags;


  /*
   * If we got an error for a command that needs updating the write
   * pointer offset cache, we must mark the zone wp offset entry as
   * invalid to force an update from disk the next time a zone append
   * command is issued.
   */
  spin_lock_irqsave(&sdkp->zones_wp_offset_lock, flags);


  /* Error path; a failed "reset all" is excluded and handled below. */
  if (result && op != REQ_OP_ZONE_RESET_ALL) {
    if (op == REQ_OP_ZONE_APPEND) {
      /* Force complete completion (no retry) */
      good_bytes = 0;
      scsi_set_resid(cmd, blk_rq_bytes(rq));
    }


    /*
     * Force an update of the zone write pointer offset on
     * the next zone append access.
     */
    if (sdkp->zones_wp_offset[zno] != SD_ZBC_UPDATING_WP_OFST)
      sdkp->zones_wp_offset[zno] = SD_ZBC_INVALID_WP_OFST;
    goto unlock_wp_offset;
  }


  switch (op) {
  case REQ_OP_ZONE_APPEND:
    /*
     * Add the cached write pointer offset to the request's sector,
     * then advance the offset exactly like a regular write.
     */
    rq->__sector += sdkp->zones_wp_offset[zno];
    fallthrough;
  case REQ_OP_WRITE_ZEROES:
  case REQ_OP_WRITE_SAME:
  case REQ_OP_WRITE:
    /* Advance by the sectors written, unless already at zone size. */
    if (sdkp->zones_wp_offset[zno] < sd_zbc_zone_sectors(sdkp))
      sdkp->zones_wp_offset[zno] +=
            good_bytes >> SECTOR_SHIFT;
    break;
  case REQ_OP_ZONE_RESET:
    /* Reset: write pointer back to the start of the zone. */
    sdkp->zones_wp_offset[zno] = 0;
    break;
  case REQ_OP_ZONE_FINISH:
    /* Finish: write pointer offset set to the full zone size. */
    sdkp->zones_wp_offset[zno] = sd_zbc_zone_sectors(sdkp);
    break;
  case REQ_OP_ZONE_RESET_ALL:
    /* Reset all: clear the cached offset of every zone. */
    memset(sdkp->zones_wp_offset, 0,
           sdkp->nr_zones * sizeof(unsigned int));
    break;
  default:
    break;
  }


unlock_wp_offset:
  spin_unlock_irqrestore(&sdkp->zones_wp_offset_lock, flags);


  return good_bytes;
}

移动指针的函数并不难理解:根据所执行的操作类型进行计算,修改对应zone的写指针偏移即可。

写到这里,我们可以得出一个zone相关命令的完整流程。

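(原文此处为一张流程图,图片缺失。按照上文的分析,流程大致为:submit_bio → 硬件派发队列hctx → scsi_queue_rq → scsi_prepare_cmd → sd_init_command(生成command)→ scsi_dispatch_cmd → 低层驱动的queuecommand → scsi_done(scsi_mq_done)→ blk_mq_complete_request → mq_ops->complete → sd_zbc_complete → sd_zbc_zone_wp_update(更新写指针偏移)。)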

所以,根据以上流程,如果一个设备支持zone且已经被scsi正确适配,要使其处理bio层下发的request,以下回调函数不可或缺:

  1. 低层驱动(host template)提供的.queuecommand函数,它要能接受zone相关命令,并实际完成读写后向上层返回结果;

  2. 对应驱动的.init_command函数,它要能正确地生成低层设备能够处理的命令;

  3. mq_ops中的.complete函数,它要能在低层设备处理完成后,成功移动写指针到对应位置。

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值