This article walks through, function by function, the NVMe driver in Linux 5.0; nearly every function carries detailed annotations. The whole piece is presented as code plus comments, which keeps it clear and easy to follow. If you find it useful, please follow; the next article will cover the data structures used by the NVMe driver in detail.
// 1. nvme_probe
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
struct nvme_dev *dev;
unsigned long quirks = id->driver_data;
size_t alloc_size;
node = dev_to_node(&pdev->dev);
if (node == NUMA_NO_NODE)
set_dev_node(&pdev->dev, first_memory_node);
// Allocate the nvme_dev structure
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return -ENOMEM;
// Allocate space for the I/O queues plus the admin queue
dev->queues = kcalloc_node(max_queue_count(), sizeof(struct nvme_queue),
GFP_KERNEL, node);
if (!dev->queues)
goto free;
dev->dev = get_device(&pdev->dev);// take a reference on pdev->dev
pci_set_drvdata(pdev, dev);
//Map the BAR: set up dev->bar (the NVMe register block), dev->bar_mapped_size (8192), and dev->dbs (doorbell register base)
result = nvme_dev_map(dev);
if (result)
goto put_pci;
//Initialize the reset and remove work items
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
mutex_init(&dev->shutdown_lock);
//Create DMA pools of 256 B and 4 KB chunks, initializing dev->prp_page_pool and dev->prp_small_pool (sketched after this function)
result = nvme_setup_prp_pools(dev);
if (result)
goto unmap;
quirks |= check_vendor_combination_bug(pdev);
/*
* Double check that our mempool alloc size will cover the biggest
* command we support.
*/
alloc_size = nvme_pci_iod_alloc_size(dev, NVME_MAX_KB_SZ,
NVME_MAX_SEGS, true);
WARN_ON_ONCE(alloc_size > PAGE_SIZE);
//Create dev->iod_mempool as a fallback allocation pool
dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
mempool_kfree,
(void *) alloc_size,
GFP_KERNEL, node);
if (!dev->iod_mempool) {
result = -ENOMEM;
goto release_pools;
}
//Initialize the nvme controller structure
result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
quirks);
if (result)
goto release_mempool;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
nvme_get_ctrl(&dev->ctrl);
//Continue asynchronously: nvme_async_probe resets the controller, which runs reset_work, i.e. nvme_reset_work
async_schedule(nvme_async_probe, dev);
return 0;
release_mempool:
mempool_destroy(dev->iod_mempool);
release_pools:
nvme_release_prp_pools(dev);
unmap:
nvme_dev_unmap(dev);
put_pci:
put_device(dev->dev);
free:
kfree(dev->queues);
kfree(dev);
return result;
}
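// The PRP pools created in nvme_probe are small enough to show in full. The sketch below
// reconstructs nvme_setup_prp_pools from the same kernel version; treat it as an illustration
// (names and layout follow Linux 5.0's drivers/nvme/host/pci.c).
static int nvme_setup_prp_pools(struct nvme_dev *dev)
{
	// 4 KB chunks used for full PRP list pages
	dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
						PAGE_SIZE, PAGE_SIZE, 0);
	if (!dev->prp_page_pool)
		return -ENOMEM;

	// 256 B chunks: optimisation for I/Os between 4k and 128k
	dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
						256, 256, 0);
	if (!dev->prp_small_pool) {
		dma_pool_destroy(dev->prp_page_pool);
		return -ENOMEM;
	}
	return 0;
}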
// 2. nvme_reset_work
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev =
container_of(work, struct nvme_dev, ctrl.reset_work);
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result = -ENODEV;
enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
goto out;
/*
* If we're called to reset a live controller first shut it down before
* moving on.
*/
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
mutex_lock(&dev->shutdown_lock);
//Mainly programs the PCI config space and the NVMe controller registers; detailed below in 2.1
result = nvme_pci_enable(dev);
if (result)
goto out_unlock;
//Set up the admin queue, both SQ and CQ; detailed below in 2.2
result = nvme_pci_configure_admin_queue(dev);
if (result)
goto out_unlock;
result = nvme_alloc_admin_tags(dev);
if (result)
goto out_unlock;
/*
* Limit the max command size to prevent iod->sg allocations going
* over a single page.
*/
dev->ctrl.max_hw_sectors = NVME_MAX_KB_SZ << 1;//max sectors per transfer (KiB * 2 = 512-byte sectors)
dev->ctrl.max_segments = NVME_MAX_SEGS;//maximum number of segments
mutex_unlock(&dev->shutdown_lock);
/*
* Introduce CONNECTING state from nvme-fc/rdma transports to mark the
* initializing procedure here.
*/
//Move dev->ctrl.state to NVME_CTRL_CONNECTING
if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(dev->ctrl.device,
"failed to mark controller CONNECTING\n");
goto out;
}
result = nvme_init_identify(&dev->ctrl);
if (result)
goto out;
if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
if (!dev->ctrl.opal_dev)
dev->ctrl.opal_dev =
init_opal_dev(&dev->ctrl, &nvme_sec_submit);
else if (was_suspend)
opal_unlock_from_suspend(dev->ctrl.opal_dev);
} else {
free_opal_dev(dev->ctrl.opal_dev);
dev->ctrl.opal_dev = NULL;
}
//If the controller supports the Doorbell Buffer Config command
if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
//allocate memory for dev->dbbuf_dbs and dev->dbbuf_eis;
//the DMA addresses are kept in dev->dbbuf_dbs_dma_addr and dev->dbbuf_eis_dma_addr
result = nvme_dbbuf_dma_alloc(dev);
if (result)
dev_warn(dev->dev, "unable to allocate dma for dbbuf\n");
}
//The Host Memory Buffer preferred size (HMPRE) is non-zero
if (dev->ctrl.hmpre) {
//Build a Set Features command (opcode=0x09, fid=0x0d) to configure the Host Memory Buffer; see the
//sketch after this function. For dword11-15 see Figure 330: Host Memory Buffer – Command Dword 11 through Figure 334
result = nvme_setup_host_mem(dev);
if (result < 0)
goto out;
}
result = nvme_setup_io_queues(dev);
if (result)
goto out;
/*
* Keep the controller around but remove all namespaces if we don't have
* any working I/O queue.
*/
if (dev->online_queues < 2) {
dev_warn(dev->ctrl.device, "IO queues not created\n");
nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
new_state = NVME_CTRL_ADMIN_ONLY;
} else {
nvme_start_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
/* hit this only when allocate tagset fails */
if (nvme_dev_add(dev))
new_state = NVME_CTRL_ADMIN_ONLY;
nvme_unfreeze(&dev->ctrl);
}
/*
* If only admin queue live, keep it to do further investigation or
* recovery.
*/
if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
dev_warn(dev->ctrl.device,
"failed to mark controller state %d\n", new_state);
goto out;
}
nvme_start_ctrl(&dev->ctrl);
return;
out_unlock:
mutex_unlock(&dev->shutdown_lock);
out:
nvme_remove_dead_ctrl(dev, result);
}
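// As promised above, a sketch of how the Host Memory Buffer Set Features command is built.
// This follows the Linux 5.0 nvme_set_host_mem(); dword12-15 carry the buffer size and the
// address/count of the host memory descriptor list.
static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
{
	u64 dma_addr = dev->host_mem_descs_dma;
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.features.opcode  = nvme_admin_set_features;             // 0x09
	c.features.fid     = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF); // 0x0d
	c.features.dword11 = cpu_to_le32(bits);                   // EHM (enable host memory)
	c.features.dword12 = cpu_to_le32(dev->host_mem_size >>
					 ilog2(dev->ctrl.page_size)); // HSIZE, in memory pages
	c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr));   // HMDLLA
	c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr));   // HMDLUA
	c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);    // HMDLEC

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}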
// 2.1 nvme_reset_work --> nvme_pci_enable
static int nvme_pci_enable(struct nvme_dev *dev)
{
int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
//Enable the device's memory space: sets the Memory Space Enable bit (bit 1) of the PCI COMMAND register.
//The device's memory or I/O address space can only be accessed once the I/O and Memory Space bits of COMMAND are set.
if (pci_enable_device_mem(pdev))
return result;
//Set bit 2 (Bus Master) of the PCI COMMAND register so the device can act as a bus master and issue DMA
pci_set_master(pdev);
if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
goto disable;
//Read the NVMe CSTS (controller status) register; all-ones means the device is unreachable
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
result = -ENODEV;
goto disable;
}
/*
* Some devices and/or platforms don't advertise or work with INTx
* interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
* adjust this later.
*/
//Pre-enable a single interrupt vector for setup
result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
if (result < 0)
return result;
//Read the controller's CAP register
dev->ctrl.cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
//CAP.MQES gives the maximum individual queue size the controller supports (0's based);
//q_depth is the smaller of MQES+1 and the io_queue_depth module parameter
dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1,
io_queue_depth);
//CAP.DSTRD (doorbell stride): doorbell registers are spaced 2^(2+DSTRD) bytes apart
//(see the doorbell sketch after this function)
dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap);
//Base of the doorbell registers, i.e. the address of SQ0TDBL
dev->dbs = dev->bar + 4096;
nvme_map_cmb(dev);
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
return 0;
disable:
pci_disable_device(pdev);
return result;
}
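// To make db_stride and dev->dbs concrete: each queue owns a pair of doorbells. The q_db
// assignment below is what nvme_alloc_queue does in 5.0, and the writel() calls mirror the
// submission and completion paths. The function name is hypothetical, purely for illustration.
static void nvme_ring_doorbells_sketch(struct nvme_queue *nvmeq)
{
	struct nvme_dev *dev = nvmeq->dev;

	// dev->dbs is a u32 __iomem * at BAR + 4096 (SQ0TDBL); pointer arithmetic in 4-byte units:
	//   SQ tail doorbell = dbs[(2 * qid) * db_stride]
	//   CQ head doorbell = dbs[(2 * qid + 1) * db_stride]
	nvmeq->q_db = &dev->dbs[nvmeq->qid * 2 * dev->db_stride];

	writel(nvmeq->sq_tail, nvmeq->q_db);                  // new SQEs are available
	writel(nvmeq->cq_head, nvmeq->q_db + dev->db_stride); // these CQEs are consumed
}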
// 2.2 nvme_reset_work --> nvme_pci_configure_admin_queue
static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
{
int result;
u32 aqa;
struct nvme_queue *nvmeq;
//ioremap the PCI BAR into the kernel's virtual address space,
//initializing dev->bar, dev->bar_mapped_size (8192), and dev->dbs (doorbell base)
result = nvme_remap_bar(dev, db_bar_size(dev, 0));
if (result < 0)
return result;
//Read the VS register for the controller version; on >= 1.1.0, dev->subsystem is taken from CAP.NSSRS (NVM Subsystem Reset Supported)
dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1, 0) ?
NVME_CAP_NSSRC(dev->ctrl.cap) : 0;
if (dev->subsystem &&
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
//CSTS.NSSRO (NVM Subsystem Reset Occurred) is write-1-to-clear: clear it here
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
//Clear CC.EN (while 0 the controller does not process commands) and CC.SHN, then wait for CSTS.RDY to clear (see the EN/RDY handshake sketch after this function)
result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
if (result < 0)
return result;
//1. Allocate the completion queue with dma_alloc_coherent; the virtual address goes to nvmeq->cqes, the DMA address to nvmeq->cq_dma_addr
//2. Call nvme_alloc_sq_cmds-->dma_alloc_coherent for the submission queue (qid 0 cannot use pci_alloc_p2pmem);
//   virtual address in nvmeq->sq_cmds, DMA address in nvmeq->sq_dma_addr
//3. Initialize the dev->queues[0] structure
result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH);
if (result)
return result;
nvmeq = &dev->queues[0];
aqa = nvmeq->q_depth - 1;
aqa |= aqa << 16;
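// e.g. NVME_AQ_DEPTH = 32: aqa = 31 = 0x1f, then aqa = 0x001f001f,
// i.e. ACQS (bits 27:16) = ASQS (bits 11:0) = 31 (both fields are 0's based)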
//Program the AQA register: write the depth - 1 (0's based) into ACQS (admin completion queue size)
//and ASQS (admin submission queue size)
writel(aqa, dev->bar + NVME_REG_AQA);
//Write the SQ DMA address allocated above into the ASQ register (admin submission queue base address)
lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ);
//Write the CQ DMA address allocated above into the ACQ register (admin completion queue base address)
lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ);
//Set the CC register (including CC.EN) and wait for CSTS.RDY to become 1
result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap);
if (result)
return result;
nvmeq->cq_vector = 0;
//1. Initialize the queue-related fields of dev->queues[0]
//2. memset the cqes ring to zero
//3. Call nvme_dbbuf_init to initialize the device's doorbell buffer
nvme_init_queue(nvmeq, 0);
//Request the interrupt for this queue: nvmeq->cq_vector holds the interrupt vector number,
//nvme_irq is the handler and nvmeq is the argument passed to it
result = queue_request_irq(nvmeq);
if (result) {
nvmeq->cq_vector = -1;
return result;
}
//Set the NVMEQ_ENABLED bit (bit 0) in nvmeq->flags
set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
}
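// Both nvme_disable_ctrl and nvme_enable_ctrl used above end up polling CSTS.RDY. The sketch
// below condenses the Linux 5.0 core nvme_wait_ready() (signal handling elided): after CC.EN
// changes, wait for RDY to follow, bounded by CAP.TO (in units of 500 ms).
static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled)
{
	unsigned long timeout =
		((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies;
	u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
	int ret;

	while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
		if (csts == ~0)
			return -ENODEV;        // device is gone
		if ((csts & NVME_CSTS_RDY) == bit)
			break;                 // RDY now matches EN
		msleep(100);
		if (time_after(jiffies, timeout))
			return -ENODEV;        // controller never became (un)ready
	}
	return ret;
}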
// 2.3 nvme_reset_work --> nvme_alloc_admin_tags
static int nvme_alloc_admin_tags(struct nvme_dev *dev)
{
if (!dev->ctrl.admin_q) {
//struct blk_mq_tag_set mainly carries the hardware configuration of the block device
//operations table
dev->admin_tagset.ops = &nvme_mq_admin_ops;
//number of hardware queues
dev->admin_tagset.nr_hw_queues = 1;
//queue depth
dev->admin_tagset.queue_depth = NVME_AQ_MQ_TAG_DEPTH;
//admin command timeout
dev->admin_tagset.timeout = ADMIN_TIMEOUT;
dev->admin_tagset.numa_node = dev_to_node(dev->dev);
//per-command extra payload size (see the pdu sketch after this function)
dev->admin_tagset.cmd_size = nvme_pci_cmd_size(dev, false);
dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED;
dev->admin_tagset.driver_data = dev;
//1. If the number of hardware queues exceeds the CPU count, clamp it to the CPU count
//2. Allocate nr_cpu_ids blk_mq_tags pointers for set->tags[]
//3. Allocate nr_cpu_ids entries for dev->admin_tagset->map[i].mq_map, used for the software/hardware
//   queue mapping: the index is the CPU number, the value is the hardware queue number
//4. Call blk_mq_map_queues to map CPUs to queues sequentially
//5. Call blk_mq_alloc_rq_maps to allocate, for each hardware queue, the tag bitmap and requests
//   according to the queue depth; the request pointers end up in tags->static_rqs[i]
if (blk_mq_alloc_tag_set(&dev->admin_tagset))
return -ENOMEM;
dev->ctrl.admin_tagset = &dev->admin_tagset;
//1. Allocate and initialize the request queue, struct request_queue (dev->ctrl.admin_q)
//2. Allocate the software and hardware queues, initialize them, and wire the two together
dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset);
if (IS_ERR(dev->ctrl.admin_q)) {
blk_mq_free_tag_set(&dev->admin_tagset);
return -ENOMEM;
}
if (!blk_get_queue(dev->ctrl.admin_q)) {
nvme_dev_remove_admin(dev);
dev->ctrl.admin_q = NULL;
return -ENODEV;
}
} else
blk_mq_unquiesce_queue(dev->ctrl.admin_q);
return 0;
}
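// cmd_size above tells blk-mq how many extra bytes to allocate behind every struct request;
// the driver recovers that area with blk_mq_rq_to_pdu() and keeps its per-command context
// (struct nvme_iod) there. The function below is illustrative only, not the real nvme_queue_rq.
static blk_status_t nvme_queue_rq_sketch(struct blk_mq_hw_ctx *hctx,
					 const struct blk_mq_queue_data *bd)
{
	struct request *req = bd->rq;
	// points just past the request, into the cmd_size area sized by nvme_pci_cmd_size()
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);

	(void)iod; // the real driver builds the NVMe command and maps data here
	return BLK_STS_OK;
}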
// 2.4 nvme_reset_work --> nvme_init_identify
int nvme_init_identify(struct nvme_ctrl *ctrl)
{
struct nvme_id_ctrl *id;
u64 cap;
int ret, page_shift;
u32 max_hw_sectors;
bool prev_apst_enabled;
//Read the version register (VS)
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
if (ret) {
dev_err(ctrl->device, "Reading VS failed (%d)\n", ret);
return ret;
}
//Read the CAP register
ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap);
if (ret) {
dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret);
return ret;
}
//CAP.MPSMIN is the minimum memory page size: 2^(12 + MPSMIN) bytes
page_shift = NVME_CAP_MPSMIN(cap) + 12;
//NVMe 1.1.0 and later support subsystems
if (ctrl->vs >= NVME_VS(1, 1, 0))
ctrl->subsystem = NVME_CAP_NSSRC(cap);
//Build an Identify command (opcode=0x06, cns=1) and submit it; the controller returns the 4 KB
//Identify Controller Data Structure in id (see the sketch after this function). The layout is in
//NVM-Express-base-specification-2.0c-2022.10.04, Figure 275: Identify – Identify Controller Data Structure, I/O Command Set Independent, p. 258
ret = nvme_identify_ctrl(ctrl, &id);
if (ret) {
dev_err(ctrl->device, "Identify Controller failed (%d)\n", ret);
return -EIO;
}
//If bit 1 of lpa is set, the controller supports the Commands Supported and Effects log page
if (id->lpa & NVME_CTRL_LPA_CMD_EFFECTS_LOG) {
//Build a Get Log Page command (opcode=0x02); the result is stored in ctrl->effects
//(format: NVM-Express-base-specification-2.0c-2022.10.04, Figure 210: Commands Supported and Effects Log Page, p. 200)
ret = nvme_get_effects_log(ctrl);
if (ret < 0)
goto out_free;
}
//ctrl->identified marks whether the controller has already been identified; if not, do the one-time initialization
if (!ctrl->identified) {
int i;
//Initialize the NVMe subsystem from the identify data
ret = nvme_init_subsystem(ctrl, id);
if (ret)
goto out_free;
for (i = 0; i < ARRAY_SIZE(core_quirks); i++) {
if (quirk_matches(id, &core_quirks[i]))
ctrl->quirks |= core_quirks[i].quirks;
}
}
if (force_apst && (ctrl->quirks & NVME_QUIRK_NO_DEEPEST_PS)) {
dev_warn(ctrl->device, "forcibly allowing all power states due to nvme_core.force_apst -- use at your own risk\n");
ctrl->quirks &= ~NVME_QUIRK_NO_DEEPEST_PS;
}
//Initialize the controller's command retry delay times from the identify data; see Figure 92: Completion Queue Entry: Status Field.
//When the CQE CRD field is 1 the retry delay is crdt1, when it is 2, crdt2, and so on
ctrl->crdt[0] = le16_to_cpu(id->crdt1);
ctrl->crdt[1] = le16_to_cpu(id->crdt2);
ctrl->crdt[2] = le16_to_cpu(id->crdt3);
//OACS: optional admin command support
ctrl->oacs = le16_to_cpu(id->oacs);
//ONCS: optional NVM command support
ctrl->oncs = le16_to_cpup(&id->oncs);
//OAES: optional asynchronous events supported
ctrl->oaes = le32_to_cpu(id->oaes);
//maximum number of concurrent Abort commands (ACL is 0's based, hence +1)
atomic_set(&ctrl->abort_limit, id->acl + 1);
//VWC: volatile write cache present
ctrl->vwc = id->vwc;
//MDTS: maximum data transfer size (see the worked example below)
if (id->mdts)
max_hw_sectors = 1 << (id->mdts + page_shift - 9);
else
max_hw_sectors = UINT_MAX;
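// Worked example: with CAP.MPSMIN = 0, page_shift = 12 (4 KiB pages); MDTS = 5 then
// means 2^5 * 4 KiB = 128 KiB per transfer, i.e. max_hw_sectors = 1 << (5 + 12 - 9) = 256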
//maximum number of sectors per single transfer
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
//Apply the limits to the request queue
nvme_set_queue_limits(ctrl, ctrl->admin_q);
//SGL support
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
//maximum number of namespaces (MNAN)
ctrl->max_namespaces = le32_to_cpu(id->mnan);
//controller attributes (CTRATT)
ctrl->ctratt = le32_to_cpu(id->ctratt);
//RTD3E: latency to enter D3 (shutdown), in microseconds
if (id->rtd3e) {
/* us -> s */
u32 transition_time = le32_to_cpu(id->rtd3e) / 1000000;
ctrl->shutdown_timeout = clamp_t(unsigned int, transition_time,
shutdown_timeout, 60);
if (ctrl->shutdown_timeout != shutdown_timeout)
dev_info(ctrl->device,
"Shutdown timeout set to %u seconds\n",
ctrl->shutdown_timeout);
} else
ctrl->shutdown_timeout = shutdown_timeout;
ctrl->npss = id->npss;//NPSS: number of power states the controller supports
ctrl->apsta = id->apsta;//APSTA: whether the controller supports autonomous power state transitions
prev_apst_enabled = ctrl->apst_enabled;//whether APST (autonomous power state transition) was previously enabled
if (ctrl->quirks & NVME_QUIRK_NO_APST) {
if (force_apst && id->apsta) {
dev_warn(ctrl->device, "forcibly allowing APST due to nvme_core.force_apst -- use at your own risk\n");
ctrl->apst_enabled = true;
} else {
ctrl->apst_enabled = false;
}
} else {
ctrl->apst_enabled = id->apsta;
}
memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));//power state descriptors
if (ctrl->ops->flags & NVME_F_FABRICS) {//fabrics controller
ctrl->icdoff = le16_to_cpu(id->icdoff);
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
ctrl->iorcsz = le32_to_cpu(id->iorcsz);
ctrl->maxcmd = le16_to_cpu(id->maxcmd);
/*
* In fabrics we need to verify the cntlid matches the
* admin connect
*/
if (ctrl->cntlid != le16_to_cpu(id->cntlid)) {
ret = -EINVAL;
goto out_free;
}
if (!ctrl->opts->discovery_nqn && !ctrl->kas) {
dev_err(ctrl->device,
"keep-alive support is mandatory for fabrics\n");
ret = -EINVAL;
goto out_free;
}
} else {
ctrl->cntlid = le16_to_cpu(id->cntlid);//controller ID
ctrl->hmpre = le32_to_cpu(id->hmpre);//HMPRE: host memory buffer preferred size, in 4 KiB units
ctrl->hmmin = le32_to_cpu(id->hmmin);//HMMIN: host memory buffer minimum size, in 4 KiB units
ctrl->hmminds = le32_to_cpu(id->hmminds);//HMMINDS: minimum usable size of a host memory buffer descriptor entry, in 4 KiB units
ctrl->hmmaxd = le16_to_cpu(id->hmmaxd);//HMMAXD: maximum number of host memory buffer descriptor entries
}
//Initialize the multipath-related control structures
ret = nvme_mpath_init(ctrl, id);
kfree(id);
if (ret < 0)
return ret;
if (ctrl->apst_enabled && !prev_apst_enabled)
dev_pm_qos_expose_latency_tolerance(ctrl->device);
else if (!ctrl->apst_enabled && prev_apst_enabled)
dev_pm_qos_hide_latency_tolerance(ctrl->device);
//Build a Set Features command (opcode=0x09, fid=0x0c) to configure Autonomous Power State Transition;
//see Figure 327: Autonomous Power State Transition – Command Dword 11
ret = nvme_configure_apst(ctrl);
if (ret < 0)
return ret;
//Same pattern as above with a different fid (the Timestamp feature)
ret = nvme_configure_timestamp(ctrl);
if (ret < 0)
return ret;
ret = nvme_configure_directives(ctrl);
if (ret < 0)
return ret;
ret = nvme_configure_acre(ctrl);
if (ret < 0)
return ret;
ctrl->identified = true;
return 0;
out_free:
kfree(id);
return ret;
}
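// A sketch of the nvme_identify_ctrl() used above, mirroring the Linux 5.0 core implementation:
// a one-off Identify command whose 4 KB result buffer is handed back to the caller.
static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
	struct nvme_command c = { };
	int error;

	c.identify.opcode = nvme_admin_identify; // 0x06
	c.identify.cns = NVME_ID_CNS_CTRL;       // cns=1: Identify Controller Data Structure

	*id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); // 4 KB result buffer
	if (!*id)
		return -ENOMEM;

	error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
			sizeof(struct nvme_id_ctrl));
	if (error)
		kfree(*id);
	return error;
}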
// 2.5 nvme_reset_work --> nvme_setup_io_queues
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
int result, nr_io_queues;
unsigned long size;
nr_io_queues = max_io_queues();
//Send a Set Features command to negotiate the number of I/O queues (see the sketch after this function)
result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
if (result < 0)
return result;
if (nr_io_queues == 0)
return 0;
clear_bit(NVMEQ_ENABLED, &adminq->flags);
//If submission queue entries are to be placed in the controller memory buffer (CMB)
if (dev->cmb_use_sqes) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
sizeof(struct nvme_command));
if (result > 0)
dev->q_depth = result;
else
dev->cmb_use_sqes = false;
}
do {
// Compute the doorbell BAR size needed for this many queues
size = db_bar_size(dev, nr_io_queues);
//Remap the BAR at that size
result = nvme_remap_bar(dev, size);
if (!result)
break;
if (!--nr_io_queues)
return -ENOMEM;
} while (1);
adminq->q_db = dev->dbs;
retry:
/* Deregister the admin queue's interrupt */
pci_free_irq(pdev, 0, adminq);
/*
* If we enable msix early due to not intx, disable it again before
* setting up the full range we need.
*/
pci_free_irq_vectors(pdev);
result = nvme_setup_irqs(dev, nr_io_queues);
if (result <= 0)
return -EIO;
dev->num_vecs = result;
result = max(result - 1, 1);
dev->max_qid = result + dev->io_queues[HCTX_TYPE_POLL];
/*
* Should investigate if there's a performance win from allocating
* more queues than interrupt vectors; it might allow the submission
* path to scale better, even if the receive path is limited by the
* number of interrupts.
*/
//Re-request the admin queue IRQ; the interrupt handler is nvme_irq
result = queue_request_irq(adminq);
if (result) {
adminq->cq_vector = -1;
return result;
}
set_bit(NVMEQ_ENABLED, &adminq->flags);
//see 2.5.1 below
result = nvme_create_io_queues(dev);
if (result || dev->online_queues < 2)
return result;
if (dev->online_queues - 1 < dev->max_qid) {
nr_io_queues = dev->online_queues - 1;
nvme_disable_io_queues(dev);
nvme_suspend_io_queues(dev);
goto retry;
}
dev_info(dev->ctrl.device, "%d/%d/%d default/read/poll queues\n",
dev->io_queues[HCTX_TYPE_DEFAULT],
dev->io_queues[HCTX_TYPE_READ],
dev->io_queues[HCTX_TYPE_POLL]);
return 0;
}
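// A sketch of nvme_set_queue_count() as called above, following the Linux 5.0 core
// implementation: Set Features with fid NVME_FEAT_NUM_QUEUES, NSQR/NCQR packed 0's based
// into dword11; the controller answers with the counts it actually allocated.
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
	u32 q_count = (*count - 1) | ((*count - 1) << 16); // NCQR << 16 | NSQR
	u32 result;
	int status, nr_io_queues;

	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
			&result);
	if (status < 0)
		return status;

	// A degraded controller may fail this; keep it alive with zero I/O queues so
	// the admin queue can still be used to investigate or repair it.
	if (status > 0) {
		dev_err(ctrl->device, "Could not set queue count (%d)\n", status);
		*count = 0;
	} else {
		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
		*count = min(*count, nr_io_queues);
	}
	return 0;
}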
// 2.5.1 nvme_reset_work --> nvme_setup_io_queues --> nvme_create_io_queues
static int nvme_create_io_queues(struct nvme_dev *dev)
{
unsigned i, max, rw_queues;
int ret = 0;
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
//Allocate the nvmeq structure, record it in dev->queues[], and allocate the memory for the I/O
//submission and completion queues: dev->queues[qid]->cqes holds the virtual address and
//dev->queues[qid]->cq_dma_addr the DMA address; then initialize dev->queues[qid] (see the sketch after this function)
if (nvme_alloc_queue(dev, i, dev->q_depth)) {
ret = -ENOMEM;
break;
}
}
max = min(dev->max_qid, dev->ctrl.queue_count - 1);
if (max != 1 && dev->io_queues[HCTX_TYPE_POLL]) {
rw_queues = dev->io_queues[HCTX_TYPE_DEFAULT] +
dev->io_queues[HCTX_TYPE_READ];
} else {
rw_queues = max;
}
for (i = dev->online_queues; i <= max; i++) {
bool polled = i > rw_queues;
//see 2.5.1.1 below
ret = nvme_create_queue(&dev->queues[i], i, polled);
if (ret)
break;
}
return ret >= 0 ? 0 : ret;
}
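// An abridged sketch of nvme_alloc_queue() (see the comment in the loop above), following
// the Linux 5.0 implementation; lock and phase-bit initialization is elided for brevity.
static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth)
{
	struct nvme_queue *nvmeq = &dev->queues[qid];

	if (dev->ctrl.queue_count > qid) // already allocated
		return 0;

	// CQ entries: DMA-coherent and zeroed; CPU address in cqes, bus address in cq_dma_addr
	nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth),
					 &nvmeq->cq_dma_addr, GFP_KERNEL);
	if (!nvmeq->cqes)
		return -ENOMEM;

	// SQ entries: pci_alloc_p2pmem() from the CMB when allowed, else coherent DMA
	if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth))
		goto free_cqdma;

	nvmeq->dev = dev;
	nvmeq->q_depth = depth;
	nvmeq->qid = qid;
	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; // this queue's doorbell pair
	/* ... remaining field initialization elided ... */
	dev->ctrl.queue_count++;
	return 0;

 free_cqdma:
	dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
			  nvmeq->cq_dma_addr);
	return -ENOMEM;
}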
// 2.5.1.1 nvme_reset_work --> nvme_setup_io_queues --> nvme_create_io_queues --> nvme_create_queue
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled)
{
struct nvme_dev *dev = nvmeq->dev;
int result;
s16 vector;
clear_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags);
/*
* A queue's vector matches the queue identifier unless the controller
* has only one vector available.
*/
if (!polled)
vector = dev->num_vecs == 1 ? 0 : qid;
else
vector = -1;
//Send a Create I/O Completion Queue command (see the sketch after this function;
//also NVM-Express-Base-Specification 5.4 Create I/O Completion Queue command)
result = adapter_alloc_cq(dev, qid, nvmeq, vector);
if (result)
return result;
//Send a Create I/O Submission Queue command
//(NVM-Express-Base-Specification 5.5 Create I/O Submission Queue command)
result = adapter_alloc_sq(dev, qid, nvmeq);
if (result < 0)
return result;
else if (result)
goto release_cq;
nvmeq->cq_vector = vector;
//1. Initialize the queue-related fields of dev->queues[qid]
//2. memset the cqes ring to zero
//3. Call nvme_dbbuf_init to initialize the device's doorbell buffer
nvme_init_queue(nvmeq, qid);
if (vector != -1) {
result = queue_request_irq(nvmeq);
if (result < 0)
goto release_sq;
}
set_bit(NVMEQ_ENABLED, &nvmeq->flags);
return result;
release_sq:
nvmeq->cq_vector = -1;
dev->online_queues--;
adapter_delete_sq(dev, qid);
release_cq:
adapter_delete_cq(dev, qid);
return result;
}
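// A sketch of adapter_alloc_cq() (the Create I/O Completion Queue command sent above),
// mirroring the Linux 5.0 implementation; a polled queue (vector == -1) is created without
// NVME_CQ_IRQ_ENABLED. adapter_alloc_sq builds its command the same way via c.create_sq.
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq, s16 vector)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG;

	if (vector != -1)
		flags |= NVME_CQ_IRQ_ENABLED;

	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;           // 0x05
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);  // CQ base address
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1); // 0's based
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(vector == -1 ? 0 : vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}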