在nvme_probe函数的最后会调用 queue_work(nvme_workq, &dev->reset_work);来执行reset操作
而这里的dev->reset_work 同样是在nvme_probe赋值INIT_WORK(&dev->reset_work, nvme_reset_work);
nvme_reset_work 这个函数很长,下面分段慢慢看:
/* Recover the nvme_dev that embeds this work item as its reset_work member. */
struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
/* Default result code; remains -ENODEV unless a later step overwrites it. */
int result = -ENODEV;
/* Finding the controller already in RESETTING here is unexpected: warn and bail out. */
if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
goto out;
首先根据work_struct在nvme_dev中的位置,通过container_of得到nvme_dev,然后判断dev->ctrl.state 是否等于NVME_CTRL_RESETTING,如果等于的话,就退出.
/*
 * If the cached controller configuration has the enable bit set, take the
 * device down first; shutdown == false, so this is a disable, not a shutdown.
 */
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
再根据dev->ctrl.ctrl_config是否包含NVME_CC_ENABLE来判断:如果已经enable了的话,就调用nvme_dev_disable 暂时disable nvme,但是不用shutdown(shutdown参数为false)
/*
 * nvme_dev_disable - quiesce the controller and tear down its queues.
 * @dev:      NVMe PCI device to disable
 * @shutdown: forwarded to nvme_disable_admin_queue()
 *
 * Stops the watchdog timer, then, under dev->shutdown_lock, stops and
 * suspends the queues, disables the PCI side via nvme_pci_disable() and
 * cancels every request still busy in the I/O and admin tag sets.
 */
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	int i, queues;
	/* All-ones default: used as-is when the PCI device is not enabled. */
	u32 csts = -1;

	/* First make sure the watchdog timer is no longer running. */
	del_timer_sync(&dev->watchdog_timer);

	mutex_lock(&dev->shutdown_lock);
	/*
	 * Only touch the hardware if the PCI device is enabled: stop the
	 * queues and read the controller status register (CSTS) from the BAR.
	 */
	if (pci_is_enabled(to_pci_dev(dev->dev))) {
		nvme_stop_queues(&dev->ctrl);
		csts = readl(dev->bar + NVME_REG_CSTS);
	}

	/*
	 * Count passed to nvme_disable_io_queues() below: online queues minus
	 * one (presumably excluding the admin queue - TODO confirm).
	 */
	queues = dev->online_queues - 1;

	/* Suspend every queue except queue 0, from the highest index down. */
	for (i = dev->queue_count - 1; i > 0; i--)
		nvme_suspend_queue(dev->queues[i]);

	/*
	 * If CSTS reports CFS, OR does not report RDY, only suspend queue 0
	 * instead of issuing the disable commands of the else branch.
	 */
	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
		/* A device might become IO incapable very soon during
		 * probe, before the admin queue is configured. Thus,
		 * queue_count can be 0 here.
		 */
		if (dev->queue_count)
			nvme_suspend_queue(dev->queues[0]);
	} else {
		/* Two kinds of queues exist: the I/O queues and the admin queue. */
		nvme_disable_io_queues(dev, queues);
		nvme_disable_admin_queue(dev, shutdown);
	}
	nvme_pci_disable(dev);

	/* Have the block layer cancel busy requests in both tag sets. */
	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
	mutex_unlock(&dev->shutdown_lock);
}
回到nvme_reset_work 中,通过nvme_change_ctrl_state 将新状态NVME_CTRL_RESETTING保存到ctrl->state中
/*
 * nvme_change_ctrl_state - attempt a controller state transition.
 * @ctrl:      controller whose state is to be changed
 * @new_state: requested state
 *
 * Under ctrl->lock, checks whether moving from the current state to
 * @new_state is one of the transitions allowed below and, if so, stores
 * @new_state in ctrl->state.
 *
 * Allowed transitions (old -> new):
 *   NEW, RESETTING, RECONNECTING  -> LIVE
 *   NEW, LIVE, RECONNECTING       -> RESETTING
 *   LIVE                          -> RECONNECTING
 *   LIVE, RESETTING, RECONNECTING -> DELETING
 *   DELETING                      -> DEAD
 *
 * Returns true if the state was changed, false otherwise.
 */
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		enum nvme_ctrl_state new_state)
{
	enum nvme_ctrl_state old_state;
	bool changed = false;

	spin_lock_irq(&ctrl->lock);
	old_state = ctrl->state;
	switch (new_state) {
	case NVME_CTRL_LIVE:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	/*
	 * The reset path requests NVME_CTRL_RESETTING, so this is the arm it
	 * takes: the change is accepted when the old state is NEW, LIVE or
	 * RECONNECTING.
	 * NOTE(review): the third label was a repeated NVME_CTRL_LIVE (a
	 * duplicate case label does not compile); restored to RECONNECTING
	 * to match the upstream transition table - confirm against the tree.
	 */
	case NVME_CTRL_RESETTING:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_RECONNECTING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_DELETING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_DEAD:
		switch (old_state) {
		case NVME_CTRL_DELETING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	default:
		break;
	}

	/* Commit the new state only when the transition was accepted above. */
	if (changed)
		ctrl->state = new_state;
	spin_unlock_irq(&ctrl->lock);
	return changed;
}
回到nvme_reset_work中继续调用nvme_pci_enable来enable nvme这个pcie设备
/*
 * nvme_pci_enable - enable the PCI function and read basic controller limits.
 * @dev: NVMe PCI device being brought up
 *
 * Enables the device's memory resources and bus mastering, sets the DMA
 * mask, allocates one interrupt vector, derives queue depth and doorbell
 * layout from the CAP register, optionally maps the CMB, and saves the PCI
 * state.
 *
 * Returns 0 on success or a negative errno. Every failure after
 * pci_enable_device_mem() succeeded goes through the "disable" label so the
 * device is not left enabled.
 */
static int nvme_pci_enable(struct nvme_dev *dev)
{
	u64 cap;
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	/* Enable the device's memory (IORESOURCE_MEM) resources; give up if that fails. */
	if (pci_enable_device_mem(pdev))
		return result;

	/*
	 * Turn on bus mastering (PCI_COMMAND_MASTER in PCI_COMMAND). The
	 * PCI_LATENCY_TIMER adjustment done by pci_set_master() is skipped
	 * for PCIe devices.
	 */
	pci_set_master(pdev);

	/* Prefer a 64-bit DMA mask, fall back to 32-bit; fail if both are rejected. */
	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	/* An all-ones CSTS read is treated as "no device". */
	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	/* Exactly one vector (min == max == 1) of any type: MSI-X, MSI or legacy. */
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		goto disable;	/* was a bare return, which left the device enabled */

	/* Derive queue depth, doorbell stride and doorbell base from CAP. */
	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
	dev->dbs = dev->bar + 4096;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->dev, "detected Apple NVMe controller, set "
			"queue depth=%u to work around controller resets\n",
			dev->q_depth);
	}

	/*
	 * CMBs can currently only exist on >=1.2 PCIe devices. We only
	 * populate sysfs if a CMB is implemented. Note that we add the
	 * CMB attribute to the nvme_ctrl kobj which removes the need to remove
	 * it on exit. Since nvme_dev_attrs_group has no name we can pass
	 * NULL as final argument to sysfs_add_file_to_group.
	 */
	if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
		dev->cmb = nvme_map_cmb(dev);

		if (dev->cmbsz) {
			if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
						    &dev_attr_cmb.attr, NULL))
				dev_warn(dev->dev,
					 "failed to add sysfs attribute for CMB\n");
		}
	}

	/*
	 * Enable PCIe error reporting (per the original note, an empty stub
	 * when CONFIG_PCIEAER is not set - TODO confirm).
	 */
	pci_enable_pcie_error_reporting(pdev);
	/*
	 * Save the PCI state (per the original note: PCIe, PCI-X and VC
	 * state - TODO confirm).
	 */
	pci_save_state(pdev);
	return 0;

disable:
	pci_disable_device(pdev);
	return result;
}
可见在pci_enable_device_mem 中是通过pci_enable_device_flags 来判断是否包含IORESOURCE_MEM
/*
 * pci_enable_device_mem - enable a device with the IORESOURCE_MEM flag.
 * @dev: PCI device to enable
 *
 * Thin wrapper: returns whatever pci_enable_device_flags() returns when
 * called with IORESOURCE_MEM.
 */
int pci_enable_device_mem(struct pci_dev *dev)
{
	int rc = pci_enable_device_flags(dev, IORESOURCE_MEM);

	return rc;
}
pci_set_master 主要是设置PCI_COMMAND_MASTER和PCI_LATENCY_TIMER
/*
 * pci_set_master - enable bus mastering for a device.
 * @dev: PCI device to configure
 *
 * Two steps, in this order: set the bus-master bit through
 * __pci_set_master(dev, true), then call pcibios_set_master(), which
 * handles PCI_LATENCY_TIMER.
 */
void pci_set_master(struct pci_dev *dev)
{
	/* Step 1: command-register bit. */
	__pci_set_master(dev, true);

	/* Step 2: latency timer hook. */
	pcibios_set_master(dev);
}
在__pci_set_master 中加上或去掉PCI_COMMAND_MASTER
/*
 * __pci_set_master - set or clear PCI_COMMAND_MASTER in the command register.
 * @dev:    PCI device to configure
 * @enable: true to set the bit, false to clear it
 *
 * The command register is written back only when the value actually
 * changes; dev->is_busmaster is updated to @enable in every case.
 */
static void __pci_set_master(struct pci_dev *dev, bool enable)
{
	u16 current_cmd, wanted_cmd;

	pci_read_config_word(dev, PCI_COMMAND, &current_cmd);

	wanted_cmd = enable ? (current_cmd | PCI_COMMAND_MASTER)
			    : (current_cmd & ~PCI_COMMAND_MASTER);

	/* Skip the config write when the bit already has the requested value. */
	if (wanted_cmd != current_cmd) {
		dev_dbg(&dev->dev, "%s bus mastering\n",
			enable ? "enabling" : "disabling");
		pci_write_config_word(dev, PCI_COMMAND, wanted_cmd);
	}

	dev->is_busmaster = enable;
}
在pcibios_set_master 中给PCI_LATENCY_TIMER赋值.
/*
 * pcibios_set_master - weak default that adjusts PCI_LATENCY_TIMER.
 * @dev: PCI device to configure
 *
 * Does nothing for PCIe devices. Otherwise a timer below 16 is raised to
 * the smaller of 64 and pcibios_max_latency, a timer above
 * pcibios_max_latency is lowered to it, and a value already inside that
 * range is left untouched (no config write).
 */
void __weak pcibios_set_master(struct pci_dev *dev)
{
	u8 timer;

	/* The latency timer doesn't apply to PCIe (either Type 0 or Type 1) */
	if (pci_is_pcie(dev))
		return;

	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &timer);

	/* Already within [16, pcibios_max_latency]: nothing to write. */
	if (timer >= 16 && timer <= pcibios_max_latency)
		return;

	if (timer < 16)
		timer = (pcibios_max_latency < 64) ? pcibios_max_latency : 64;
	else
		timer = pcibios_max_latency;

	pci_write_config_byte(dev, PCI_LATENCY_TIMER, timer);
}
而这里的dev->reset_work 同样是在nvme_probe赋值INIT_WORK(&dev->reset_work, nvme_reset_work);
nvme_reset_work 这个函数很长,下面分段慢慢看:
/* Recover the nvme_dev that embeds this work item as its reset_work member. */
struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
/* Default result code; remains -ENODEV unless a later step overwrites it. */
int result = -ENODEV;
/* Finding the controller already in RESETTING here is unexpected: warn and bail out. */
if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING))
goto out;
首先根据work_struct在nvme_dev中的位置,通过container_of得到nvme_dev,然后判断dev->ctrl.state 是否等于NVME_CTRL_RESETTING,如果等于的话,就退出.
/*
 * If the cached controller configuration has the enable bit set, take the
 * device down first; shutdown == false, so this is a disable, not a shutdown.
 */
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
再根据dev->ctrl.ctrl_config是否包含NVME_CC_ENABLE来判断:如果已经enable了的话,就调用nvme_dev_disable 暂时disable nvme,但是不用shutdown(shutdown参数为false)
/*
 * nvme_dev_disable - quiesce the controller and tear down its queues.
 * @dev:      NVMe PCI device to disable
 * @shutdown: forwarded to nvme_disable_admin_queue()
 *
 * Stops the watchdog timer, then, under dev->shutdown_lock, stops and
 * suspends the queues, disables the PCI side via nvme_pci_disable() and
 * cancels every request still busy in the I/O and admin tag sets.
 */
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
	int i, queues;
	/* All-ones default: used as-is when the PCI device is not enabled. */
	u32 csts = -1;

	/* First make sure the watchdog timer is no longer running. */
	del_timer_sync(&dev->watchdog_timer);

	mutex_lock(&dev->shutdown_lock);
	/*
	 * Only touch the hardware if the PCI device is enabled: stop the
	 * queues and read the controller status register (CSTS) from the BAR.
	 */
	if (pci_is_enabled(to_pci_dev(dev->dev))) {
		nvme_stop_queues(&dev->ctrl);
		csts = readl(dev->bar + NVME_REG_CSTS);
	}

	/*
	 * Count passed to nvme_disable_io_queues() below: online queues minus
	 * one (presumably excluding the admin queue - TODO confirm).
	 */
	queues = dev->online_queues - 1;

	/* Suspend every queue except queue 0, from the highest index down. */
	for (i = dev->queue_count - 1; i > 0; i--)
		nvme_suspend_queue(dev->queues[i]);

	/*
	 * If CSTS reports CFS, OR does not report RDY, only suspend queue 0
	 * instead of issuing the disable commands of the else branch.
	 */
	if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) {
		/* A device might become IO incapable very soon during
		 * probe, before the admin queue is configured. Thus,
		 * queue_count can be 0 here.
		 */
		if (dev->queue_count)
			nvme_suspend_queue(dev->queues[0]);
	} else {
		/* Two kinds of queues exist: the I/O queues and the admin queue. */
		nvme_disable_io_queues(dev, queues);
		nvme_disable_admin_queue(dev, shutdown);
	}
	nvme_pci_disable(dev);

	/* Have the block layer cancel busy requests in both tag sets. */
	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
	mutex_unlock(&dev->shutdown_lock);
}
回到nvme_reset_work 中,通过nvme_change_ctrl_state 将新状态NVME_CTRL_RESETTING保存到ctrl->state中
/*
 * nvme_change_ctrl_state - attempt a controller state transition.
 * @ctrl:      controller whose state is to be changed
 * @new_state: requested state
 *
 * Under ctrl->lock, checks whether moving from the current state to
 * @new_state is one of the transitions allowed below and, if so, stores
 * @new_state in ctrl->state.
 *
 * Allowed transitions (old -> new):
 *   NEW, RESETTING, RECONNECTING  -> LIVE
 *   NEW, LIVE, RECONNECTING       -> RESETTING
 *   LIVE                          -> RECONNECTING
 *   LIVE, RESETTING, RECONNECTING -> DELETING
 *   DELETING                      -> DEAD
 *
 * Returns true if the state was changed, false otherwise.
 */
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
		enum nvme_ctrl_state new_state)
{
	enum nvme_ctrl_state old_state;
	bool changed = false;

	spin_lock_irq(&ctrl->lock);
	old_state = ctrl->state;
	switch (new_state) {
	case NVME_CTRL_LIVE:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	/*
	 * The reset path requests NVME_CTRL_RESETTING, so this is the arm it
	 * takes: the change is accepted when the old state is NEW, LIVE or
	 * RECONNECTING.
	 * NOTE(review): the third label was a repeated NVME_CTRL_LIVE (a
	 * duplicate case label does not compile); restored to RECONNECTING
	 * to match the upstream transition table - confirm against the tree.
	 */
	case NVME_CTRL_RESETTING:
		switch (old_state) {
		case NVME_CTRL_NEW:
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_RECONNECTING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_DELETING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_RECONNECTING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	case NVME_CTRL_DEAD:
		switch (old_state) {
		case NVME_CTRL_DELETING:
			changed = true;
			/* FALLTHRU */
		default:
			break;
		}
		break;
	default:
		break;
	}

	/* Commit the new state only when the transition was accepted above. */
	if (changed)
		ctrl->state = new_state;
	spin_unlock_irq(&ctrl->lock);
	return changed;
}
回到nvme_reset_work中继续调用nvme_pci_enable来enable nvme这个pcie设备
/*
 * nvme_pci_enable - enable the PCI function and read basic controller limits.
 * @dev: NVMe PCI device being brought up
 *
 * Enables the device's memory resources and bus mastering, sets the DMA
 * mask, allocates one interrupt vector, derives queue depth and doorbell
 * layout from the CAP register, optionally maps the CMB, and saves the PCI
 * state.
 *
 * Returns 0 on success or a negative errno. Every failure after
 * pci_enable_device_mem() succeeded goes through the "disable" label so the
 * device is not left enabled.
 */
static int nvme_pci_enable(struct nvme_dev *dev)
{
	u64 cap;
	int result = -ENOMEM;
	struct pci_dev *pdev = to_pci_dev(dev->dev);

	/* Enable the device's memory (IORESOURCE_MEM) resources; give up if that fails. */
	if (pci_enable_device_mem(pdev))
		return result;

	/*
	 * Turn on bus mastering (PCI_COMMAND_MASTER in PCI_COMMAND). The
	 * PCI_LATENCY_TIMER adjustment done by pci_set_master() is skipped
	 * for PCIe devices.
	 */
	pci_set_master(pdev);

	/* Prefer a 64-bit DMA mask, fall back to 32-bit; fail if both are rejected. */
	if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
	    dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
		goto disable;

	/* An all-ones CSTS read is treated as "no device". */
	if (readl(dev->bar + NVME_REG_CSTS) == -1) {
		result = -ENODEV;
		goto disable;
	}

	/*
	 * Some devices and/or platforms don't advertise or work with INTx
	 * interrupts. Pre-enable a single MSIX or MSI vec for setup. We'll
	 * adjust this later.
	 */
	/* Exactly one vector (min == max == 1) of any type: MSI-X, MSI or legacy. */
	result = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES);
	if (result < 0)
		goto disable;	/* was a bare return, which left the device enabled */

	/* Derive queue depth, doorbell stride and doorbell base from CAP. */
	cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
	dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH);
	dev->db_stride = 1 << NVME_CAP_STRIDE(cap);
	dev->dbs = dev->bar + 4096;

	/*
	 * Temporary fix for the Apple controller found in the MacBook8,1 and
	 * some MacBook7,1 to avoid controller resets and data loss.
	 */
	if (pdev->vendor == PCI_VENDOR_ID_APPLE && pdev->device == 0x2001) {
		dev->q_depth = 2;
		dev_warn(dev->dev, "detected Apple NVMe controller, set "
			"queue depth=%u to work around controller resets\n",
			dev->q_depth);
	}

	/*
	 * CMBs can currently only exist on >=1.2 PCIe devices. We only
	 * populate sysfs if a CMB is implemented. Note that we add the
	 * CMB attribute to the nvme_ctrl kobj which removes the need to remove
	 * it on exit. Since nvme_dev_attrs_group has no name we can pass
	 * NULL as final argument to sysfs_add_file_to_group.
	 */
	if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
		dev->cmb = nvme_map_cmb(dev);

		if (dev->cmbsz) {
			if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
						    &dev_attr_cmb.attr, NULL))
				dev_warn(dev->dev,
					 "failed to add sysfs attribute for CMB\n");
		}
	}

	/*
	 * Enable PCIe error reporting (per the original note, an empty stub
	 * when CONFIG_PCIEAER is not set - TODO confirm).
	 */
	pci_enable_pcie_error_reporting(pdev);
	/*
	 * Save the PCI state (per the original note: PCIe, PCI-X and VC
	 * state - TODO confirm).
	 */
	pci_save_state(pdev);
	return 0;

disable:
	pci_disable_device(pdev);
	return result;
}
可见在pci_enable_device_mem 中是通过pci_enable_device_flags 来判断是否包含IORESOURCE_MEM
/*
 * pci_enable_device_mem - enable a device with the IORESOURCE_MEM flag.
 * @dev: PCI device to enable
 *
 * Thin wrapper: returns whatever pci_enable_device_flags() returns when
 * called with IORESOURCE_MEM.
 */
int pci_enable_device_mem(struct pci_dev *dev)
{
	int rc = pci_enable_device_flags(dev, IORESOURCE_MEM);

	return rc;
}
pci_set_master 主要是设置PCI_COMMAND_MASTER和PCI_LATENCY_TIMER
/*
 * pci_set_master - enable bus mastering for a device.
 * @dev: PCI device to configure
 *
 * Two steps, in this order: set the bus-master bit through
 * __pci_set_master(dev, true), then call pcibios_set_master(), which
 * handles PCI_LATENCY_TIMER.
 */
void pci_set_master(struct pci_dev *dev)
{
	/* Step 1: command-register bit. */
	__pci_set_master(dev, true);

	/* Step 2: latency timer hook. */
	pcibios_set_master(dev);
}
在__pci_set_master 中加上或去掉PCI_COMMAND_MASTER
/*
 * __pci_set_master - set or clear PCI_COMMAND_MASTER in the command register.
 * @dev:    PCI device to configure
 * @enable: true to set the bit, false to clear it
 *
 * The command register is written back only when the value actually
 * changes; dev->is_busmaster is updated to @enable in every case.
 */
static void __pci_set_master(struct pci_dev *dev, bool enable)
{
	u16 current_cmd, wanted_cmd;

	pci_read_config_word(dev, PCI_COMMAND, &current_cmd);

	wanted_cmd = enable ? (current_cmd | PCI_COMMAND_MASTER)
			    : (current_cmd & ~PCI_COMMAND_MASTER);

	/* Skip the config write when the bit already has the requested value. */
	if (wanted_cmd != current_cmd) {
		dev_dbg(&dev->dev, "%s bus mastering\n",
			enable ? "enabling" : "disabling");
		pci_write_config_word(dev, PCI_COMMAND, wanted_cmd);
	}

	dev->is_busmaster = enable;
}
在pcibios_set_master 中给PCI_LATENCY_TIMER赋值.
/*
 * pcibios_set_master - weak default that adjusts PCI_LATENCY_TIMER.
 * @dev: PCI device to configure
 *
 * Does nothing for PCIe devices. Otherwise a timer below 16 is raised to
 * the smaller of 64 and pcibios_max_latency, a timer above
 * pcibios_max_latency is lowered to it, and a value already inside that
 * range is left untouched (no config write).
 */
void __weak pcibios_set_master(struct pci_dev *dev)
{
	u8 timer;

	/* The latency timer doesn't apply to PCIe (either Type 0 or Type 1) */
	if (pci_is_pcie(dev))
		return;

	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &timer);

	/* Already within [16, pcibios_max_latency]: nothing to write. */
	if (timer >= 16 && timer <= pcibios_max_latency)
		return;

	if (timer < 16)
		timer = (pcibios_max_latency < 64) ? pcibios_max_latency : 64;
	else
		timer = pcibios_max_latency;

	pci_write_config_byte(dev, PCI_LATENCY_TIMER, timer);
}