Linux 5.0 NVMe Driver, Annotated in Detail

This article walks through the main functions of the NVMe driver in Linux 5.0, with detailed comments on nearly every function. Everything is presented as code plus annotations, which keeps it clear and easy to follow. If you find it useful, stay tuned: the next article will cover the data structures used in the NVMe driver in detail.

// 1. nvme_probe
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
    int node, result = -ENOMEM;
    struct nvme_dev *dev;
    unsigned long quirks = id->driver_data;
    size_t alloc_size;
    node = dev_to_node(&pdev->dev);
    if (node == NUMA_NO_NODE)
        set_dev_node(&pdev->dev, first_memory_node);
    // Allocate the nvme_dev structure on the device's NUMA node
    dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
    if (!dev)
        return -ENOMEM;
    // Allocate space for the I/O queues plus the admin queue
    dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
                    GFP_KERNEL, node);
    if (!dev->queues)
        goto free;
    dev->dev = get_device(&pdev->dev); // Take a reference on pdev->dev
    pci_set_drvdata(pdev, dev);
    
    // Map the BAR: initializes dev->bar (NVMe register base), dev->bar_mapped_size (8192), and dev->dbs (doorbell register base)
    result = nvme_dev_map(dev);
    if (result)
        goto put_pci;
    // Initialize the reset and remove work items
    INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
    INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
    mutex_init(&dev->shutdown_lock);
    // Create DMA pools for 256-byte and page-sized allocations; initializes dev->prp_page_pool and dev->prp_small_pool
    result = nvme_setup_prp_pools(dev);
    if (result)
        goto unmap;
    quirks |= check_vendor_combination_bug(pdev);
    /*
     * Double check that our mempool alloc size will cover the biggest
     * command we support.
     */
    alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
                        NVME_MAX_SEGS, true);
    WARN_ON_ONCE(alloc_size > PAGE_SIZE);
    // Create the dev->iod_mempool mempool as an emergency fallback allocation
    dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
                        mempool_kfree,
                        (void *) alloc_size,
                        GFP_KERNEL, node);
    if (!dev->iod_mempool) {
        result = -ENOMEM;
        goto release_pools;
    }
    // Initialize the nvme_ctrl (controller) structure
    result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
            quirks);
    if (result)
        goto release_mempool;
    dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
    nvme_get_ctrl(&dev->ctrl);
    // Asynchronously run nvme_async_probe, which kicks off reset_work (nvme_reset_work)
    async_schedule(nvme_async_probe, dev);
    return 0;
 release_mempool:
    mempool_destroy(dev->iod_mempool);
 release_pools:
    nvme_release_prp_pools(dev);
 unmap:
    nvme_dev_unmap(dev);
 put_pci:
    put_device(dev->dev);
 free:
    kfree(dev->queues);
    kfree(dev);
    return result;
}
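nvme_async_probe itself is short. Roughly, in this kernel version it looks like the simplified sketch below (see drivers/nvme/host/pci.c): it runs the reset work synchronously, waits for the initial namespace scan, and then drops the reference taken by nvme_get_ctrl above.

// Simplified sketch of the async probe callback in Linux 5.0 (drivers/nvme/host/pci.c)
static void nvme_async_probe(void *data, async_cookie_t cookie)
{
    struct nvme_dev *dev = data;

    nvme_reset_ctrl_sync(&dev->ctrl);  // queue ctrl.reset_work (nvme_reset_work) and wait for it
    flush_work(&dev->ctrl.scan_work);  // wait for the initial namespace scan
    nvme_put_ctrl(&dev->ctrl);         // drop the reference taken in nvme_probe
}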
// 2. nvme_reset_work
static void nvme_reset_work(struct work_struct *work)
{
    struct nvme_dev *dev =
        container_of(work, struct nvme_dev, ctrl.reset_work);
    bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
    int result = -ENODEV;
    enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
    if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
        goto out;
    /*
     * If we're called to reset a live controller first shut it down before
     * moving on.
     */
    if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
        nvme_dev_disable(dev, false);
    mutex_lock(&dev->shutdown_lock);
    // Mainly programs the PCI config space registers and the NVMe controller registers; detailed below
    result = nvme_pci_enable(dev);
    if (result)
        goto out_unlock;
    // Set up the admin queue (both SQ and CQ); detailed below
    result = nvme_pci_configure_admin_queue(dev);
    if (result)
        goto out_unlock;
    result = nvme_alloc_admin_tags(dev);
    if (result)
        goto out_unlock;
    /*
     * Limit the max command size to prevent iod->sg allocations going
     * over a single page.
     */
    dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1; // Maximum number of sectors per transfer
    dev->ctrl.max_segments = NVME_MAX_SEGS; // Maximum number of scatter/gather segments
    mutex_unlock(&dev->shutdown_lock);
    /*
     * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
     * initializing procedure here.
     */
    // Transition dev->ctrl.state to CONNECTING
    if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
        dev_warn(dev->ctrl.device,
            "failed to mark controller CONNECTING\n");
        goto out;
    }
    result = nvme_init_identify(&dev->ctrl);
    if (result)
        goto out;
    if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
        if (!dev->ctrl.opal_dev)
            dev->ctrl.opal_dev =
                init_opal_dev(&dev->ctrl, &nvme_sec_submit);
        else if (was_suspend)
            opal_unlock_from_suspend(dev->ctrl.opal_dev);
    } else {
        free_opal_dev(dev->ctrl.opal_dev);
        dev->ctrl.opal_dev = NULL;
    }
    // If the controller supports the Doorbell Buffer Config command
    if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
        // Allocate memory for dev->dbbuf_dbs and dev->dbbuf_eis;
        // the DMA addresses are stored in dev->dbbuf_dbs_dma_addr and dev->dbbuf_eis_dma_addr
        result = nvme_dbbuf_dma_alloc(dev);
    }
    // If the Host Memory Buffer Preferred Size (HMPRE) is nonzero
    if (dev->ctrl.hmpre) {
        // Build a Set Features command (opcode=0x09, fid=0x0d) to configure the Host Memory Buffer
        // For the dword11-15 parameters see Figure 330: Host Memory Buffer – Command Dword 11 through Figure 334
        result = nvme_setup_host_mem(dev);
    }
    result = nvme_setup_io_queues(dev);
    if (result)
        goto out;
    /*
     * Keep the controller around but remove all namespaces if we don't have
     * any working I/O queue.
     */
    if (dev->online_queues < 2) {
        dev_warn(dev->ctrl.device, "IO queues not created\n");
        nvme_kill_queues(&dev->ctrl);
        nvme_remove_namespaces(&dev->ctrl);
        new_state = NVME_CTRL_ADMIN_ONLY;
    } else {
        nvme_start_queues(&dev->ctrl);
        nvme_wait_freeze(&dev->ctrl);
        /* hit this only when allocate tagset fails */
        if (nvme_dev_add(dev))
            new_state = NVME_CTRL_ADMIN_ONLY;
        nvme_unfreeze(&dev->ctrl);
    }
    /*
     * If only admin queue live, keep it to do further investigation or
     * recovery.
     */
    if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
        dev_warn(dev->ctrl.device,
            "failed to mark controller state %d\n", new_state);
        goto out;
    }
    nvme_start_ctrl(&dev->ctrl);
    return;
 out_unlock:
    mutex_unlock(&dev->shutdown_lock);
 out:
    nvme_remove_dead_ctrl(dev, result);
}
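For reference, the Host Memory Buffer step above ends up as a Set Features command with FID 0x0D. The sketch below shows how the relevant command dwords are laid out according to the spec; the struct and function names are illustrative only (they are not the driver's), and the size, descriptor list address and entry count are placeholders for values the driver computes in nvme_setup_host_mem.

// Illustrative sketch: Set Features (Host Memory Buffer, FID 0x0D) command dwords per the NVMe spec.
#include <stdint.h>

struct hmb_set_features_dwords {
    uint32_t cdw10; // FID = 0x0D (Host Memory Buffer)
    uint32_t cdw11; // bit 0: EHM (Enable Host Memory), bit 1: MR (Memory Return)
    uint32_t cdw12; // HSIZE: host memory buffer size, in memory page size units
    uint32_t cdw13; // HMDLLA: descriptor list address, lower 32 bits (16-byte aligned)
    uint32_t cdw14; // HMDLUA: descriptor list address, upper 32 bits
    uint32_t cdw15; // HMDLEC: number of entries in the descriptor list
};

static struct hmb_set_features_dwords
hmb_enable_dwords(uint32_t hmb_size_pages, uint64_t desc_list_dma, uint32_t desc_count)
{
    struct hmb_set_features_dwords d = {
        .cdw10 = 0x0d,
        .cdw11 = 1u,                                      // EHM = 1: enable the host memory buffer
        .cdw12 = hmb_size_pages,
        .cdw13 = (uint32_t)(desc_list_dma & 0xffffffffu),
        .cdw14 = (uint32_t)(desc_list_dma >> 32),
        .cdw15 = desc_count,
    };
    return d;
}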
// 2.1 nvme_reset_work --> nvme_pci_enable
static int nvme_pci_enable(struct nvme_dev *dev)
{
    int result = -ENOMEM;
    struct pci_dev *pdev = to_pci_dev(dev->dev);
    // Enable the device's memory space: sets bit 1 (Memory Space Enable) of the PCI COMMAND register.
    // The device's memory or I/O address space can only be accessed after the I/O and Memory Space bits are enabled.
    if (pci_enable_device_mem(pdev))
        return result;
    // Set bit 2 (Bus Master Enable) of the PCI COMMAND register so the device can master the bus (issue DMA)
    pci_set_master(pdev);
    if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
        dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
        goto disable;
    // Read the NVMe CSTS (controller status) register; all-ones means the device is not responding
    if (readl(dev->bar + NVME_REG_CSTS) == -1) {
        result = -ENODEV;
        goto disable;
    }
    /*
     * Some devices and/or platforms don't advertise or work with INTx
     * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
     * adjust this later.
     */
     // Set up the IRQ
    result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
    if (result < 0)
        return result;
    // Read the controller's CAP register
    dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
    // CAP.MQES is the maximum individual queue size the controller supports (zero-based),
    // so the effective queue depth is the smaller of MQES+1 and the io_queue_depth module parameter
    dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
                io_queue_depth);
    // CAP.DSTRD (Doorbell Stride) gives the spacing between doorbell registers,
    // which is 2^(2+DSTRD) bytes
    dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
    // Base address of the doorbell registers, i.e. the address of SQ0TDBL
    dev->dbs = dev->bar + 4096;
 
    nvme_map_cmb(dev);
    pci_enable_pcie_error_reporting(pdev);
    pci_save_state(pdev);
    return 0;
 disable:
    pci_disable_device(pdev);
    return result;
}
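The db_stride and dbs values set above map directly onto the doorbell layout defined by the spec: doorbells start at BAR offset 0x1000, and queue y's submission-tail and completion-head doorbells sit at strides 2y and 2y+1 from that base, where one stride is 2^(2+DSTRD) bytes. A small stand-alone sketch of the arithmetic (names are illustrative, not the driver's):

// Sketch: doorbell register offsets per the NVMe spec (CAP.DSTRD).
// SQyTDBL is at 0x1000 + (2*y)     * (4 << DSTRD)
// CQyHDBL is at 0x1000 + (2*y + 1) * (4 << DSTRD)
#include <stdint.h>
#include <stdio.h>

static uint64_t sq_tail_doorbell_offset(uint32_t qid, uint32_t dstrd)
{
    return 0x1000 + (2ull * qid) * (4u << dstrd);
}

static uint64_t cq_head_doorbell_offset(uint32_t qid, uint32_t dstrd)
{
    return 0x1000 + (2ull * qid + 1) * (4u << dstrd);
}

int main(void)
{
    // With DSTRD = 0 the stride is 4 bytes: SQ0TDBL at 0x1000, CQ0HDBL at 0x1004, SQ1TDBL at 0x1008, ...
    printf("SQ1TDBL offset: 0x%llx\n", (unsigned long long)sq_tail_doorbell_offset(1, 0));
    printf("CQ1HDBL offset: 0x%llx\n", (unsigned long long)cq_head_doorbell_offset(1, 0));
    return 0;
}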
// 2.2 nvme_reset_work --> nvme_pci_configure_admin_queue
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
    int result;
    u32 aqa;
    struct nvme_queue *nvmeq;
    // ioremap the PCI BAR into the kernel virtual address space;
    // initializes dev->bar, dev->bar_mapped_size, and dev->dbs (doorbell base)
    result = nvme_remap_bar(dev, db_bar_size(dev, 0));
    if (result < 0)
        return result;
    // Read the controller's VS (version) register; for versions >= 1.1.0, take the NSSRS
    // (NVM Subsystem Reset Supported) bit from CAP into dev->subsystem
    dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
                NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
    if (dev->subsystem &&
        (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
        // Write 1 to CSTS.NSSRO (NVM Subsystem Reset Occurred) to clear it; the bit is write-1-to-clear
        writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
    // Clear the CC register's EN bit (when 0 the controller stops processing commands) and SHN field, then wait for CSTS.RDY to clear
    result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
    if (result < 0)
        return result;
    // 1. Allocate the completion queue with dma_alloc_coherent; the virtual address goes to nvmeq->cqes and the DMA address to nvmeq->cq_dma_addr
    // 2. Allocate the submission queue via nvme_alloc_sq_cmds --> dma_alloc_coherent (qid 0 cannot use pci_alloc_p2pmem);
    //    the virtual address goes to nvmeq->sq_cmds and the DMA address to nvmeq->sq_dma_addr
    // 3. Initialize the dev->queues[0] structure
    result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
    if (result)
        return result;
    nvmeq = &dev->queues[0];
    aqa = nvmeq->q_depth - 1;
    aqa |= aqa << 16;
    // Program the controller's AQA register: write q_depth into its ACQS field (admin completion queue size)
    // and its ASQS field (admin submission queue size)
    writel(aqa, dev->bar + NVME_REG_AQA);
    // Write the DMA address of the admin SQ allocated above into the ASQ register (admin submission queue base address)
    lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
    // Write the DMA address of the admin CQ allocated above into the ACQ register (admin completion queue base address)
    lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
    // Program the controller's CC register and wait for CSTS.RDY to become 1
    result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
    if (result)
        return result;
    nvmeq->cq_vector = 0;
    // 1. Initialize the queue-related fields of dev->queues[0]
    // 2. Zero the completion queue entries with memset
    // 3. Call nvme_dbbuf_init to initialize the device's doorbell buffers
    nvme_init_queue(nvmeq, 0);
    // Request an interrupt for the queue; nvmeq->cq_vector holds the interrupt vector,
    // nvme_irq is the handler and nvmeq is the argument passed to it
    result = queue_request_irq(nvmeq);
    if (result) {
        nvmeq->cq_vector = -1;
        return result;
    }
    // Set the NVMEQ_ENABLED bit (bit 0) of nvmeq->flags
    set_bit(NVMEQ_ENABLED, &nvmeq->flags);
    return result;
}
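The aqa value written above packs both admin queue sizes into the AQA register: ASQS (admin submission queue size) occupies bits 11:0 and ACQS (admin completion queue size) bits 27:16, and both fields are zero-based, which is why the driver writes q_depth - 1 into each half. A minimal sketch (the helper name is illustrative):

// Sketch: packing the AQA (Admin Queue Attributes) register.
// ASQS (bits 11:0) and ACQS (bits 27:16) are zero-based queue sizes.
#include <stdint.h>

static uint32_t pack_aqa(uint32_t sq_depth, uint32_t cq_depth)
{
    uint32_t asqs = (sq_depth - 1) & 0xfff;   // zero-based admin SQ size
    uint32_t acqs = (cq_depth - 1) & 0xfff;   // zero-based admin CQ size
    return asqs | (acqs << 16);
}
// With NVME_AQ_DEPTH == 32 this yields 0x001f001f, the same value as aqa above.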
// 2.3 nvme_reset_work --> nvme_alloc_admin_tags
static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
    if (!dev->ctrl.admin_q) {
        // struct blk_mq_tag_set describes the block device's hardware configuration
        // Queue operations
        dev->admin_tagset.ops = &nvme_mq_admin_ops;
        // Number of hardware queues
        dev->admin_tagset.nr_hw_queues = 1;
        // Queue depth
        dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
        // Admin command timeout
        dev->admin_tagset.timeout = ADMIN_TIMEOUT;
        dev->admin_tagset.numa_node = dev_to_node(dev->dev);
        // Per-command driver data size
        dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
        dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
        dev->admin_tagset.driver_data = dev;
        // 1. If the number of hardware queues exceeds the number of CPUs, cap it at the CPU count
        // 2. Allocate nr_cpu blk_mq_tags pointers for set->tags[]
        // 3. Allocate nr_cpu entries for dev->admin_tagset->map[i].mq_map, used to map software queues to
        //    hardware queues: the index is the CPU number and the value is the hardware queue number
        // 4. Call blk_mq_map_queues to map CPUs to queues sequentially
        // 5. Call blk_mq_alloc_rq_maps to allocate the tag bitmap and requests for each hardware queue
        //    according to its depth; the request pointers end up in tags->static_rqs[i]
        if (blk_mq_alloc_tag_set(&dev->admin_tagset))
            return -ENOMEM;
        dev->ctrl.admin_tagset = &dev->admin_tagset;
        // 1. Allocate and initialize the request queue (struct request_queue, dev->ctrl.admin_q)
        // 2. Allocate the software and hardware queues, initialize them, and link them together
        dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
        if (IS_ERR(dev->ctrl.admin_q)) {
            blk_mq_free_tag_set(&dev->admin_tagset);
            return -ENOMEM;
        }
        if (!blk_get_queue(dev->ctrl.admin_q)) {
            nvme_dev_remove_admin(dev);
            dev->ctrl.admin_q = NULL;
            return -ENODEV;
        }
    } else
        blk_mq_unquiesce_queue(dev->ctrl.admin_q);
    return 0;
}
// 2.4 nvme_reset_work --> nvme_init_identify
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
    struct nvme_id_ctrl *id;
    u64 cap;
    int ret, page_shift;
    u32 max_hw_sectors;
    bool prev_apst_enabled;
    // Read the VS (version) register
    ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
    if (ret) {
        dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
        return ret;
    }
    // Read the CAP register
    ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
    if (ret) {
        dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
        return ret;
    }
    // CAP.MPSMIN gives the minimum memory page size, which is 2^(12 + MPSMIN) bytes
    page_shift = NVME_CAP_MPSMIN(cap) + 12;
    // NVMe 1.1.0 and later report NVM subsystem reset support
    if (ctrl->vs >= NVME_VS(1, 1, 0))
        ctrl->subsystem = NVME_CAP_NSSRC(cap);
    // Build and submit an Identify command (opcode=0x06, cns=1); the controller returns the 4KB Identify Controller Data Structure
    // The result is stored in id
    // The structure is described in NVM-Express-base-specification-2.0c-2022.10.04, Figure 275: Identify – Identify Controller Data Structure, I/O Command Set Independent, p. 258
    ret = nvme_identify_ctrl(ctrl, &id);
    if (ret) {
        dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
        return -EIO;
    }
    // If bit 1 of LPA is set, the controller supports the Commands Supported and Effects log page
    if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
        // Build a Get Log Page command (opcode=0x02); the result is stored in ctrl->effects
        // Note: the returned data is described in NVM-Express-base-specification-2.0c-2022.10.04, Figure 210: Commands Supported and Effects Log Page, p. 200
        ret = nvme_get_effects_log(ctrl);
        if (ret < 0)
            goto out_free;
    }
    // identified marks whether the controller has already been identified; if not, run the one-time initialization
    if (!ctrl->identified) {
        int i;
        // Initialize the NVMe subsystem from the identify data
        ret = nvme_init_subsystem(ctrl, id);
        if (ret)
            goto out_free;
        for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
            if (quirk_matches(id, &core_quirks[i]))
                ctrl->quirks |= core_quirks[i].quirks;
        }
    }
    if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
        dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
        ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
    }
    // Initialize the command retry delay times from the identify data; see Figure 92: Completion Queue Entry: Status Field
    // When the CQE CRD field is 1 the retry delay is crdt1, when it is 2 the delay is crdt2, and so on
    ctrl->crdt[0] = le16_to_cpu(id->crdt1);
    ctrl->crdt[1] = le16_to_cpu(id->crdt2);
    ctrl->crdt[2] = le16_to_cpu(id->crdt3);
    // Optional admin command support
    ctrl->oacs = le16_to_cpu(id->oacs);
    // Optional NVM command support
    ctrl->oncs = le16_to_cpup(&id->oncs);
    // Optional asynchronous events supported
    ctrl->oaes = le32_to_cpu(id->oaes);
    // Maximum number of concurrently outstanding Abort commands
    atomic_set(&ctrl->abort_limit, id->acl + 1);
    // Volatile write cache present
    ctrl->vwc = id->vwc;
    // Maximum data transfer size (MDTS)
    if (id->mdts)
        max_hw_sectors = 1 << (id->mdts + page_shift - 9);
    else
        max_hw_sectors = UINT_MAX;
    // Maximum number of sectors per transfer
    ctrl->max_hw_sectors =
        min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
    // Apply these limits to the admin request queue
    nvme_set_queue_limits(ctrl, ctrl->admin_q);
    // SGL support
    ctrl->sgls = le32_to_cpu(id->sgls);
    ctrl->kas = le16_to_cpu(id->kas);
    // Maximum number of allowed namespaces
    ctrl->max_namespaces = le32_to_cpu(id->mnan);
    // Controller attributes
    ctrl->ctratt = le32_to_cpu(id->ctratt);
    // RTD3 (D3) entry latency
    if (id->rtd3e) {
        /* us -> s */
        u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000; // convert microseconds to seconds
        ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
                         shutdown_timeout, 60);
        if (ctrl->shutdown_timeout != shutdown_timeout)
            dev_info(ctrl->device,
                 "Shutdown timeout set to %u seconds\n",
                 ctrl->shutdown_timeout);
    } else
        ctrl->shutdown_timeout = shutdown_timeout;
    ctrl->npss = id->npss; // Number of power states supported by the controller
    ctrl->apsta = id->apsta; // Whether the controller supports autonomous power state transitions
    prev_apst_enabled = ctrl->apst_enabled; // Whether APST (Autonomous Power State Transition) was previously enabled
    if (ctrl->quirks & NVME_QUIRK_NO_APST) {
        if (force_apst && id->apsta) {
            dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
            ctrl->apst_enabled = true;
        } else {
            ctrl->apst_enabled = false;
        }
    } else {
        ctrl->apst_enabled = id->apsta;
    }
    memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd)); // Power state descriptors
    if (ctrl->ops->flags & NVME_F_FABRICS) { // Fabrics controller
        ctrl->icdoff = le16_to_cpu(id->icdoff);
        ctrl->ioccsz = le32_to_cpu(id->ioccsz);
        ctrl->iorcsz = le32_to_cpu(id->iorcsz);
        ctrl->maxcmd = le16_to_cpu(id->maxcmd);
        /*
         * In fabrics we need to verify the cntlid matches the
         * admin connect
         */
        if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
            ret = -EINVAL;
            goto out_free;
        }
        if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
            dev_err(ctrl->device,
                "keep-alive support is mandatory for fabrics\n");
            ret = -EINVAL;
            goto out_free;
        }
    } else {
        ctrl->cntlid = le16_to_cpu(id->cntlid); // Controller ID
        ctrl->hmpre = le32_to_cpu(id->hmpre); // Host Memory Buffer preferred size, in 4 KiB units
        ctrl->hmmin = le32_to_cpu(id->hmmin); // Host Memory Buffer minimum size, in 4 KiB units
        ctrl->hmminds = le32_to_cpu(id->hmminds); // Host Memory Buffer minimum descriptor entry size, in 4 KiB units
        ctrl->hmmaxd = le16_to_cpu(id->hmmaxd); // Host Memory Buffer maximum number of descriptor entries
    }
    // Initialize the multipath-related control structures
    ret = nvme_mpath_init(ctrl, id);
    kfree(id);
    if (ret < 0)
        return ret;
    if (ctrl->apst_enabled && !prev_apst_enabled)
        dev_pm_qos_expose_latency_tolerance(ctrl->device);
    else if (!ctrl->apst_enabled && prev_apst_enabled)
        dev_pm_qos_hide_latency_tolerance(ctrl->device);
    // Build a Set Features command (opcode=0x09, fid=0x0c) to configure Autonomous Power State Transition
    // See Figure 327: Autonomous Power State Transition – Command Dword 11
    ret = nvme_configure_apst(ctrl);
    if (ret < 0)
        return ret;
    // Similar to the above, but with the Timestamp feature ID
    ret = nvme_configure_timestamp(ctrl);
    if (ret < 0)
        return ret;
    ret = nvme_configure_directives(ctrl);
    if (ret < 0)
        return ret;
    ret = nvme_configure_acre(ctrl);
    if (ret < 0)
        return ret;
    ctrl->identified = true;
    return 0;
out_free:
    kfree(id);
    return ret;
}
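The max_hw_sectors computation above is just unit conversion: MDTS is a power of two in units of the minimum memory page size (2^(12+MPSMIN) bytes), while the block layer counts 512-byte sectors, hence the shift by (mdts + page_shift - 9). A worked sketch (names are illustrative):

// Sketch: converting Identify MDTS into the sector count used for ctrl->max_hw_sectors.
#include <stdint.h>
#include <stdio.h>

static uint32_t mdts_to_max_sectors(uint8_t mdts, uint8_t mpsmin)
{
    uint32_t page_shift = 12 + mpsmin;      // minimum memory page size = 2^(12 + MPSMIN) bytes
    return 1u << (mdts + page_shift - 9);   // 9 = log2(512-byte sector)
}

int main(void)
{
    // Example: MDTS = 5, MPSMIN = 0  ->  2^5 * 4 KiB = 128 KiB per transfer = 256 sectors
    printf("max sectors: %u\n", mdts_to_max_sectors(5, 0));
    return 0;
}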
// 2.5 nvme_reset_work --> nvme_setup_io_queues
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
    struct nvme_queue *adminq = &dev->queues[0];
    struct pci_dev *pdev = to_pci_dev(dev->dev);
    int result, nr_io_queues;
    unsigned long size;
    nr_io_queues = max_io_queues();
    // Send a Set Features command to set the number of I/O queues
    result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
    if (result < 0)
        return result;
    if (nr_io_queues == 0)
        return 0;
    
    clear_bit(NVMEQ_ENABLED, &adminq->flags);
    // If the Controller Memory Buffer is used for submission queues
    if (dev->cmb_use_sqes) {
        result = nvme_cmb_qdepth(dev, nr_io_queues,
                sizeof(struct nvme_command));
        if (result > 0)
            dev->q_depth = result;
        else
            dev->cmb_use_sqes = false;
    }
    do {
        // Compute the required BAR size
        size = db_bar_size(dev, nr_io_queues);
        // Remap the BAR to cover it
        result = nvme_remap_bar(dev, size);
        if (!result)
            break;
        if (!--nr_io_queues)
            return -ENOMEM;
    } while (1);
    adminq->q_db = dev->dbs;
 retry:
    /* Deregister the admin queue's interrupt */
    pci_free_irq(pdev, 0, adminq);
    /*
     * If we enable msix early due to not intx, disable it again before
     * setting up the full range we need.
     */
    pci_free_irq_vectors(pdev);
    result = nvme_setup_irqs(dev, nr_io_queues);
    if (result <= 0)
        return -EIO;
    dev->num_vecs = result;
    result = max(result - 1, 1);
    dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
    /*
     * Should investigate if there's a performance win from allocating
     * more queues than interrupt vectors; it might allow the submission
     * path to scale better, even if the receive path is limited by the
     * number of interrupts.
     */
     // Request the admin queue's IRQ again; nvme_irq is the interrupt handler
    result = queue_request_irq(adminq);
    if (result) {
        adminq->cq_vector = -1;
        return result;
    }
    set_bit(NVMEQ_ENABLED, &adminq->flags);
    // See 2.5.1 below
    result = nvme_create_io_queues(dev);
    if (result || dev->online_queues < 2)
        return result;
    if (dev->online_queues - 1 < dev->max_qid) {
        nr_io_queues = dev->online_queues - 1;
        nvme_disable_io_queues(dev);
        nvme_suspend_io_queues(dev);
        goto retry;
    }
    dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
                    dev->io_queues[HCTX_TYPE_DEFAULT],
                    dev->io_queues[HCTX_TYPE_READ],
                    dev->io_queues[HCTX_TYPE_POLL]);
    return 0;
}
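nvme_set_queue_count above uses Set Features with FID 0x07 (Number of Queues). Command Dword 11 carries the requested number of I/O submission queues (NSQR, bits 15:0) and completion queues (NCQR, bits 31:16), both zero-based; the number actually allocated comes back in Dword 0 of the completion entry. A minimal sketch of the encoding (the helper name is illustrative):

// Sketch: encoding Set Features "Number of Queues" (FID 0x07) Command Dword 11.
// Both fields are zero-based: a value of N-1 requests N queues.
#include <stdint.h>

static uint32_t number_of_queues_cdw11(uint16_t nr_io_queues)
{
    uint32_t n = nr_io_queues - 1;   // zero-based count
    return n | (n << 16);            // NSQR in bits 15:0, NCQR in bits 31:16
}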
// 2.5.1 nvme_reset_work --> nvme_setup_io_queues --> nvme_create_io_queues
static int nvme_create_io_queues(struct nvme_dev *dev)
{
    unsigned i, max, rw_queues;
    int ret = 0;
    for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
        // Allocate the nvme_queue structure, record it in dev->queues[], and allocate the memory for the
        // I/O submission and completion queues: dev->queues[qid]->cqes holds the virtual address and
        // dev->queues[qid]->cq_dma_addr the DMA address; then initialize the dev->queues[qid] structure
        if (nvme_alloc_queue(dev, i, dev->q_depth)) {
            ret = -ENOMEM;
            break;
        }
    }
    max = min(dev->max_qid, dev->ctrl.queue_count - 1);
    if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
        rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
                dev->io_queues[HCTX_TYPE_READ];
    } else {
        rw_queues = max;
    }
    for (i = dev->online_queues; i <= max; i++) {
        bool polled = i > rw_queues;
        // See 2.5.1.1 below
        ret = nvme_create_queue(&dev->queues[i], i, polled);
        if (ret)
            break;
    }
    return ret >= 0 ? 0 : ret;
}
// 2.5.1.1 nvme_reset_work --> nvme_setup_io_queues --> nvme_create_io_queues --> nvme_create_queue
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
    struct nvme_dev *dev = nvmeq->dev;
    int result;
    s16 vector;
    clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
    /*
     * A queue's vector matches the queue identifier unless the controller
     * has only one vector available.
     */
    if (!polled)
        vector = dev->num_vecs == 1 ? 0 : qid;
    else
        vector = -1;
    // Send a Create I/O Completion Queue command to create the CQ
    // See NVM-Express-Base-Specification 5.4 Create I/O Completion Queue command
    result = adapter_alloc_cq(dev, qid, nvmeq, vector);
    if (result)
        return result;
    // Send a Create I/O Submission Queue command to create the SQ
    // See NVM-Express-Base-Specification 5.5 Create I/O Submission Queue command
    result = adapter_alloc_sq(dev, qid, nvmeq);
    if (result < 0)
        return result;
    else if (result)
        goto release_cq;
    nvmeq->cq_vector = vector;
    // 1. Initialize the queue-related fields of dev->queues[qid]
    // 2. Zero the completion queue entries with memset
    // 3. Call nvme_dbbuf_init to initialize the device's doorbell buffers
    nvme_init_queue(nvmeq, qid);
    if (vector != -1) {
        result = queue_request_irq(nvmeq);
        if (result < 0)
            goto release_sq;
    }
    set_bit(NVMEQ_ENABLED, &nvmeq->flags);
    return result;
release_sq:
    nvmeq->cq_vector = -1;
    dev->online_queues--;
    adapter_delete_sq(dev, qid);
release_cq:
    adapter_delete_cq(dev, qid);
    return result;
}
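adapter_alloc_cq and adapter_alloc_sq above issue the Create I/O Completion Queue (admin opcode 0x05) and Create I/O Submission Queue (opcode 0x01) commands. The sketch below shows the key fields of the completion-queue variant per the spec; the struct and function names are illustrative, not the driver's.

// Illustrative sketch: key fields of a Create I/O Completion Queue command (admin opcode 0x05).
#include <stdint.h>

struct create_cq_fields {
    uint8_t  opcode;   // 0x05
    uint64_t prp1;     // DMA address of the CQ memory (the driver uses nvmeq->cq_dma_addr)
    uint32_t cdw10;    // QSIZE (bits 31:16, zero-based) | QID (bits 15:0)
    uint32_t cdw11;    // IV (bits 31:16) | IEN (bit 1, interrupts enabled) | PC (bit 0, physically contiguous)
};

static struct create_cq_fields create_cq_cmd(uint16_t qid, uint16_t depth,
                                             uint64_t cq_dma_addr, uint16_t vector)
{
    struct create_cq_fields c = {
        .opcode = 0x05,
        .prp1   = cq_dma_addr,
        .cdw10  = ((uint32_t)(depth - 1) << 16) | qid,
        .cdw11  = ((uint32_t)vector << 16) | (1u << 1) | 1u,   // IEN = 1, PC = 1
    };
    return c;
}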