This article picks up where the previous one left off. The last step of nvme_probe invokes nvme_reset_work to carry out the reset, and the work it does can be summarized in the following steps:
- On entry, nvme_reset_work first checks the NVME_CTRL_RESETTING flag to make sure it is not entered twice.
- Call nvme_pci_enable.
- Call nvme_configure_admin_queue.
- Call nvme_init_queue.
- Call nvme_alloc_admin_tags.
- Call nvme_init_identify.
- Call nvme_setup_io_queues.
- Call nvme_start_queues/nvme_dev_add, followed by nvme_queue_scan.
The previous article walked through nvme_init_identify; in this one we continue with the remaining functions called from nvme_reset_work.
Let's start with the code of nvme_setup_io_queues:
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
	struct nvme_queue *adminq = dev->queues[0];
	struct pci_dev *pdev = to_pci_dev(dev->dev);
	int result, nr_io_queues, size;

	// Use the number of online CPU cores as the desired number of I/O queues
	nr_io_queues = num_online_cpus();

	// Send a Set Features command to negotiate the number of I/O queues
	result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
	if (result < 0)
		return result;

	if (nr_io_queues == 0)
		return 0;

	// If the controller has a CMB with SQ support, size the queue depth for it
	if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
		result = nvme_cmb_qdepth(dev, nr_io_queues,
				sizeof(struct nvme_command));
		if (result > 0)
			dev->q_depth = result;
		else
			nvme_release_cmb(dev);
	}

	// Work out the BAR size needed for all doorbell registers
	size = db_bar_size(dev, nr_io_queues);
	if (size > 8192) {
		iounmap(dev->bar);
		do {
			dev->bar = ioremap(pci_resource_start(pdev, 0), size);
			if (dev->bar)
				break;
			if (!--nr_io_queues)
				return -ENOMEM;
			size = db_bar_size(dev, nr_io_queues);
		} while (1);
		dev->dbs = dev->bar + 4096;
		adminq->q_db = dev->dbs;
	}

	// Release the admin queue's IRQ before re-allocating interrupt vectors
	free_irq(pci_irq_vector(pdev, 0), adminq);
	pci_free_irq_vectors(pdev);

	// Allocate interrupt vectors, ideally one per I/O queue
	nr_io_queues = pci_alloc_irq_vectors(pdev, 1, nr_io_queues,
			PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY);
	if (nr_io_queues <= 0)
		return -EIO;
	dev->max_qid = nr_io_queues;

	result = queue_request_irq(adminq);
	if (result) {
		adminq->cq_vector = -1;
		return result;
	}

	// Create the I/O queues
	return nvme_create_io_queues(dev);
}
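The remapping above relies on db_bar_size to work out how much of BAR0 is needed to cover all the doorbell registers. In the kernel version this series is based on, it is a one-liner along the following lines (shown as a reference sketch; check your own source tree):
static int db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
	/* 4 KB of controller registers, then one 8-byte SQ-tail/CQ-head
	 * doorbell pair per queue (admin + I/O), spaced out by the
	 * doorbell stride derived from CAP.DSTRD */
	return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride);
}
With the default stride (db_stride = 1), the 8192-byte threshold in the code above is only exceeded once more than 511 I/O queues are requested, so the remap path is rarely taken in practice.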
The code above looks long, but it boils down to two main steps inside nvme_setup_io_queues:
- Call nvme_set_queue_count, which sends a Set Features command to negotiate the number of I/O queues;
- Once the number of I/O queues is settled, call nvme_create_io_queues to do the real work of creating them.
Let's look at nvme_set_queue_count first:
int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count)
{
	/* Dword 11: NSQR in bits 15:0, NCQR in bits 31:16, both 0-based */
	u32 q_count = (*count - 1) | ((*count - 1) << 16);
	u32 result;
	int status, nr_io_queues;

	status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0,
			&result);
	if (status < 0)
		return status;

	/* A positive status is an NVMe completion status code from the controller */
	if (status > 0) {
		dev_err(ctrl->dev, "Could not set queue count (%d)\n", status);
		*count = 0;
	} else {
		/* Completion Dword 0 reports NSQA (bits 15:0) and NCQA (bits 31:16), 0-based */
		nr_io_queues = min(result & 0xffff, result >> 16) + 1;
		*count = min(*count, nr_io_queues);
	}

	return 0;
}
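To make the encoding concrete, here is a worked example with hypothetical numbers. Suppose the machine has 8 online CPUs, so *count is 8 on entry:
q_count = (8 - 1) | ((8 - 1) << 16) = 0x00070007, i.e. NSQR = NCQR = 7, which in the spec's 0-based encoding requests 8 submission queues and 8 completion queues.
If the controller only grants 4 of each, Completion Dword 0 comes back as result = 0x00030003 (NSQA = NCQA = 3), so nr_io_queues = min(0x0003, 0x0003) + 1 = 4 and *count is reduced to min(8, 4) = 4.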
Now let's open up nvme_set_features:
int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
		      void *buffer, size_t buflen, u32 *result)
{
	struct nvme_command c;
	union nvme_result res;
	int ret;

	memset(&c, 0, sizeof(c));
	c.features.opcode = nvme_admin_set_features;
	c.features.fid = cpu_to_le32(fid);
	c.features.dword11 = cpu_to_le32(dword11);

	ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &res,
			buffer, buflen, 0, NVME_QID_ANY, 0, 0);
	if (ret >= 0 && result)
		*result = le32_to_cpu(res.u32);
	return ret;
}
The opcode of the Set Features command is 0x09. The Number of Queues feature is selected with Feature Identifier (FID) 0x07, and the requested numbers of I/O queues go into Command Dword 11: NSQR (number of submission queues requested) in bits 15:0 and NCQR (number of completion queues requested) in bits 31:16, both 0-based.
With the three key parameters of the Set Features command (opcode, fid, dword11) filled in, __nvme_submit_sync_cmd is called to issue it, which completes the negotiation of the number of I/O queues. The __nvme_submit_sync_cmd path was covered in the previous article, so it is not expanded on here.
With the number of I/O queues settled, we can move on to the key step of nvme_setup_io_queues: calling nvme_create_io_queues to actually create the I/O queues.
static int nvme_create_io_queues(struct nvme_dev *dev)
{
	unsigned i, max;
	int ret = 0;

	// Allocate an nvme_queue structure for each qid, record it in
	// dev->queues[], and allocate the memory needed for its submission
	// and completion queues
	for (i = dev->queue_count; i <= dev->max_qid; i++) {
		if (!nvme_alloc_queue(dev, i, dev->q_depth)) {
			ret = -ENOMEM;
			break;
		}
	}

	max = min(dev->max_qid, dev->queue_count - 1);
	for (i = dev->online_queues; i <= max; i++) {
		ret = nvme_create_queue(dev->queues[i], i);
		if (ret)
			break;
	}

	/*
	 * Ignore failing Create SQ/CQ commands, we can continue with less
	 * than the desired amount of queues, and even a controller without
	 * I/O queues can still be used to issue admin commands. This might
	 * be useful to upgrade a buggy firmware for example.
	 */
	return ret >= 0 ? 0 : ret;
}
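To see how the two loops interact, take a hypothetical example: suppose nvme_setup_io_queues ended up with dev->max_qid = 8 and, so far, only the admin queue exists (dev->queue_count = 1, dev->online_queues = 1). The first loop then allocates nvme_queue structures and SQ/CQ memory for qid 1 through 8, raising queue_count to 9; max becomes min(8, 9 - 1) = 8, and the second loop calls nvme_create_queue for qid 1 through 8, bringing each queue online on the controller.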
The code above shows that nvme_create_io_queues does its work in two steps:
- Call nvme_alloc_queue to allocate the memory needed for each SQ/CQ pair. This was already covered when we walked through nvme_configure_admin_queue, so it is skipped here; for details see: Linux NVMe Driver学习笔记之5:Admin SQ/CQ的创建
- The second step is the key one: call nvme_create_queue to actually create the SQ/CQ on the controller.
Next, let's look at the code of nvme_create_queue:
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
{
	struct nvme_dev *dev = nvmeq->dev;
	int result;

	nvmeq->cq_vector = qid - 1;
	result = adapter_alloc_cq(dev, qid, nvmeq);
	if (result < 0)
		return result;

	result = adapter_alloc_sq(dev, qid, nvmeq);
	if (result < 0)
		goto release_cq;

	result = queue_request_irq(nvmeq);
	if (result < 0)
		goto release_sq;

	nvme_init_queue(nvmeq, qid);
	return result;

release_sq:
	adapter_delete_sq(dev, qid);
release_cq:
	adapter_delete_cq(dev, qid);
	return result;
}
As the code shows, nvme_create_queue first calls adapter_alloc_cq and adapter_alloc_sq to create the CQ and SQ, then calls queue_request_irq to request the interrupt, and finally calls nvme_init_queue to initialize the newly created CQ/SQ. If requesting the interrupt fails, the error path deletes the half-created queues again via adapter_delete_sq and adapter_delete_cq.
Since queue_request_irq and nvme_init_queue were already covered in an earlier article, they are skipped here; for details see: Linux NVMe Driver学习笔记之5:Admin SQ/CQ的创建
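Those delete helpers simply issue Delete I/O SQ/CQ admin commands. In the same driver source they are built roughly like this (a simplified sketch; check your kernel version):
static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
{
	struct nvme_command c;

	memset(&c, 0, sizeof(c));
	c.delete_queue.opcode = opcode;	// nvme_admin_delete_sq or nvme_admin_delete_cq
	c.delete_queue.qid = cpu_to_le16(id);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}

static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_cq, cqid);
}

static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid)
{
	return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid);
}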
Back to the creation path, the two functions we focus on here are adapter_alloc_cq and adapter_alloc_sq:
static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_cq.opcode = nvme_admin_create_cq;
	c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr);
	c.create_cq.cqid = cpu_to_le16(qid);
	c.create_cq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_cq.cq_flags = cpu_to_le16(flags);
	c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
The opcode of the Create I/O Completion Queue command is 0x05.
PRP Entry 1 is interpreted together with the PC (Physically Contiguous) flag:
- If PC = 1, the CQ occupies a physically contiguous buffer and PRP1 points directly to it;
- If PC = 0, the CQ buffer is not physically contiguous and PRP1 points to a PRP list.
Since the driver allocates the CQ as a single DMA-coherent buffer, it always sets PC here (NVME_QUEUE_PHYS_CONTIG). A detailed walkthrough of the PRP structure, with examples, was given in an earlier article in this series.
cqid and qsize carry the CQ identifier and the queue depth; qsize is a 0-based value, which is why the code writes q_depth - 1.
cq_flags is the OR of the PC flag and the IEN (Interrupts Enabled) flag, and irq_vector is the interrupt vector assigned to this CQ:
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED;
The differences between the various interrupt modes were covered in an earlier article in this series.
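For reference, the two flag bits OR'ed together here are simple bit definitions from include/linux/nvme.h; they look roughly like this (an excerpt for orientation, exact header contents depend on the kernel version):
	NVME_QUEUE_PHYS_CONTIG	= (1 << 0),	/* PC: queue buffer is physically contiguous */
	NVME_CQ_IRQ_ENABLED	= (1 << 1),	/* IEN: interrupts enabled for this CQ */
In Dword 11 of the Create I/O CQ command, bit 0 is PC and bit 1 is IEN, so OR'ing the two constants yields exactly the cq_flags value written into the command.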
Once the six key parameters of the Create I/O Completion Queue command (opcode, PRP1, cqid, qsize, cq_flags, irq_vector) are filled in, the command is handed to nvme_submit_sync_cmd, which goes through __nvme_submit_sync_cmd to issue it and thereby creates the I/O CQ. That submission path was covered in the previous article, so it is not repeated here.
static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
		struct nvme_queue *nvmeq)
{
	struct nvme_command c;
	int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;

	/*
	 * Note: we (ab)use the fact that the prp fields survive if no data
	 * is attached to the request.
	 */
	memset(&c, 0, sizeof(c));
	c.create_sq.opcode = nvme_admin_create_sq;
	c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr);
	c.create_sq.sqid = cpu_to_le16(qid);
	c.create_sq.qsize = cpu_to_le16(nvmeq->q_depth - 1);
	c.create_sq.sq_flags = cpu_to_le16(flags);
	c.create_sq.cqid = cpu_to_le16(qid);

	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
}
The opcode, prp1, sqid and qsize fields of the Create I/O Submission Queue command are analogous to those of the Create I/O CQ command, so here we only highlight sq_flags and cqid.
sq_flags here is the OR of the PC flag and the QPRIO (Queue Priority) field:
int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM;
QPRIO (Queue Priority) indicates the arbitration priority of the SQ. The NVMe spec does not mandate an execution order for commands placed in an SQ: the controller may fetch several commands at once and complete them in any order, and there is no fixed ordering across different SQs either. This is where the arbitration mechanisms defined by the NVMe spec come in; a more detailed introduction was given in an earlier article in this series.
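The QPRIO encodings the driver can use come from include/linux/nvme.h as well (again an excerpt; check your kernel version):
	NVME_SQ_PRIO_URGENT	= (0 << 1),
	NVME_SQ_PRIO_HIGH	= (1 << 1),
	NVME_SQ_PRIO_MEDIUM	= (2 << 1),
	NVME_SQ_PRIO_LOW	= (3 << 1),
They are shifted left by one because QPRIO occupies bits 02:01 of Dword 11, right above the PC bit. This driver always uses NVME_SQ_PRIO_MEDIUM; per the spec the priority only takes effect when weighted round robin arbitration is selected.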
Compared with the Create I/O CQ command, the Create I/O SQ command carries one extra field, cqid, because every SQ must be bound to a CQ: I/O SQs and CQs can be paired one-to-one, or several SQs can share one CQ. In this driver the pairing is one-to-one, which is why c.create_sq.cqid is simply set to the same qid. For more on the relationship between SQs and CQs, see the earlier introductory article in this series.
Once the six key parameters of the Create I/O Submission Queue command (opcode, PRP1, sqid, qsize, sq_flags, cqid) are filled in, the command is likewise handed to nvme_submit_sync_cmd and issued through __nvme_submit_sync_cmd, which completes the creation of the I/O SQ; again, that path was covered in the previous article and is not repeated here.
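Putting it all together: for a hypothetical controller that grants 8 I/O queues, nvme_create_io_queues loops over qid 1 to 8 and issues 8 Create I/O CQ plus 8 Create I/O SQ admin commands, one pair per qid, after which eight I/O queue pairs are online alongside the admin queue.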