How PCIe MSI-X interrupts are requested and registered on ARM64

Following up on the previous article: the arm64 interrupt handling flow.
Let's look at how PCIe MSI-X interrupts get registered: are they described in the device tree, or allocated dynamically? The conclusion up front: PCIe MSI-X interrupts are allocated dynamically by the kernel.

Driver call flow

Let's start with a code snippet showing how a driver requests and uses PCIe MSI-X interrupts:

#include <linux/pci.h>
#include <linux/interrupt.h>

struct pci_dev *pdev;
int irq_vectors;
int irq;

// Assume pdev already points to a valid PCI device

// Try to allocate between 1 and 4 interrupt vectors, allowing MSI-X
irq_vectors = pci_alloc_irq_vectors(pdev, 1, 4, PCI_IRQ_MSIX);

if (irq_vectors < 0) {
    // Handle the error
    printk(KERN_ERR "Failed to allocate MSI-X vectors: %d\n", irq_vectors);
} else {
    printk(KERN_INFO "Allocated %d MSI-X vectors\n", irq_vectors);

    // Get the Linux IRQ number of the first vector
    irq = pci_irq_vector(pdev, 0);
    if (irq < 0) {
        printk(KERN_ERR "Failed to get IRQ number for vector 0: %d\n", irq);
    } else {
        printk(KERN_INFO "IRQ number for vector 0: %d\n", irq);

        // Register the interrupt handler
        if (request_irq(irq, my_interrupt_handler, 0, "my_device", pdev)) {
            printk(KERN_ERR "Failed to request IRQ %d\n", irq);
        } else {
            printk(KERN_INFO "IRQ %d requested successfully\n", irq);
        }
    }
}
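
The snippet assumes an interrupt handler called my_interrupt_handler, which is not shown; a minimal placeholder (illustrative only, not part of the original snippet) could look like this:

static irqreturn_t my_interrupt_handler(int irq, void *dev_id)
{
    struct pci_dev *pdev = dev_id;  /* dev_id is the pdev passed to request_irq() */

    /* Acknowledge / handle the device-specific interrupt source here */
    dev_info(&pdev->dev, "MSI-X interrupt %d fired\n", irq);

    return IRQ_HANDLED;
}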

The three most important steps are:

  • pci_alloc_irq_vectors
  • pci_irq_vector
  • request_irq

We will skip request_irq here; if it is unfamiliar, see the earlier article on interrupt registration.
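
For completeness, the teardown mirrors the allocation: free each requested IRQ, then release the vectors. A minimal sketch, reusing the variables from the snippet above:

// Teardown: undo request_irq() and pci_alloc_irq_vectors()
free_irq(irq, pdev);            /* must match the dev_id passed to request_irq() */
pci_free_irq_vectors(pdev);     /* releases the MSI-X vectors */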

Overview of the main modules

PCIe MSI-X involves quite a few modules, which fall into three main parts:

  1. the PCIe framework
  2. the IRQ framework
  3. the GIC v3 ITS

Let's focus on pci_alloc_irq_vectors. This function is just a header-file wrapper that forwards straight to pci_alloc_irq_vectors_affinity.
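
For reference, the wrapper looks roughly like this (from include/linux/pci.h; minor details may vary across kernel versions):

static inline int
pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
		      unsigned int max_vecs, unsigned int flags)
{
	return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags,
					      NULL);
}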

drivers/pci/msi.c

int pci_alloc_irq_vectors_affinity(struct pci_dev *dev, unsigned int min_vecs,
				   unsigned int max_vecs, unsigned int flags,
				   struct irq_affinity *affd)
{
	struct irq_affinity msi_default_affd = {0};
	int nvecs = -ENOSPC;
	
	if (flags & PCI_IRQ_MSIX) {
		nvecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs,
						affd, flags);
		if (nvecs > 0)
			return nvecs;
	}

	if (flags & PCI_IRQ_MSI) {
		nvecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, affd);
		if (nvecs > 0)
			return nvecs;
	}
	return nvecs;
}

__pci_enable_msix_range

static int __pci_enable_msix_range(struct pci_dev *dev,
				   struct msix_entry *entries, int minvec,
				   int maxvec, struct irq_affinity *affd,
				   int flags)
{
	int rc, nvec = maxvec;

	for (;;) {
		if (affd) {
			nvec = irq_calc_affinity_vectors(minvec, nvec, affd);
			if (nvec < minvec)
				return -ENOSPC;
		}

		rc = __pci_enable_msix(dev, entries, nvec, affd, flags);
		if (rc == 0)
			return nvec;

		if (rc < 0)
			return rc;
		if (rc < minvec)
			return -ENOSPC;

		nvec = rc;	/* retry with the number of vectors the device supports */
	}
}

__pci_enable_msix

static int __pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries,
			     int nvec, struct irq_affinity *affd, int flags)
{
	int nr_entries;
	int i, j;

	nr_entries = pci_msix_vec_count(dev);
	if (nr_entries < 0)
		return nr_entries;
	if (nvec > nr_entries && !(flags & PCI_IRQ_VIRTUAL))
		return nr_entries;

	if (entries) {
		/* Check for any invalid entries */
		for (i = 0; i < nvec; i++) {
			if (entries[i].entry >= nr_entries)
				return -EINVAL;		/* invalid entry */
			for (j = i + 1; j < nvec; j++) {
				if (entries[i].entry == entries[j].entry)
					return -EINVAL;	/* duplicate entry */
			}
		}
	}

	return msix_capability_init(dev, entries, nvec, affd);
}

msix_capability_init

static int msix_capability_init(struct pci_dev *dev, struct msix_entry *entries,
				int nvec, struct irq_affinity *affd)
{
	ret = msix_setup_entries(dev, base, entries, nvec, affd);
	ret = pci_msi_setup_msi_irqs(dev, nvec, PCI_CAP_ID_MSIX);
	msix_update_entries(dev, entries);
	return 0;
}

pci_msi_setup_msi_irqs

static int pci_msi_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
	struct irq_domain *domain;

	domain = dev_get_msi_domain(&dev->dev);
	if (domain && irq_domain_is_hierarchy(domain))
		return msi_domain_alloc_irqs(domain, &dev->dev, nvec);

	return arch_setup_msi_irqs(dev, nvec, type);
}

This is where it gets interesting: msi_domain_alloc_irqs is where the real work starts.
kernel/irq/msi.c

int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
			  int nvec)
{
	struct msi_domain_info *info = domain->host_data;
	struct msi_domain_ops *ops = info->ops;

	return ops->domain_alloc_irqs(domain, dev, nvec);
}
int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
			    int nvec)
{
	ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
	if (ret)
		return ret;

	for_each_msi_entry(desc, dev) {
		ops->set_desc(&arg, desc);

		virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
					       dev_to_node(dev), &arg, false,
					       desc->affinity);
		if (virq < 0) {
			ret = -ENOSPC;
			if (ops->handle_error)
				ret = ops->handle_error(domain, desc, ret);
			if (ops->msi_finish)
				ops->msi_finish(&arg, ret);
			return ret;
		}

		for (i = 0; i < desc->nvec_used; i++) {
			irq_set_msi_desc_off(virq, i, desc);
			irq_debugfs_copy_devname(virq + i, dev);
		}
	}

	if (ops->msi_finish)
		ops->msi_finish(&arg, 0);

	for_each_msi_vector(desc, i, dev) {
		if (desc->irq == i) {
			virq = desc->irq;
			dev_dbg(dev, "irq [%d-%d] for MSI\n",
				virq, virq + desc->nvec_used - 1);
		}

		irq_data = irq_domain_get_irq_data(domain, i);
		if (!can_reserve) {
			irqd_clr_can_reserve(irq_data);
			if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
				irqd_set_msi_nomask_quirk(irq_data);
			if ((info->flags & MSI_FLAG_ACTIVATE_EARLY) &&
				irqd_affinity_is_managed(irq_data) &&
				!cpumask_intersects(irq_data_get_affinity_mask(irq_data),
						    cpu_online_mask)) {
				irqd_set_managed_shutdown(irq_data);
				continue;
			}
		}
		ret = irq_domain_activate_irq(irq_data, can_reserve);
	}
}

The main steps here are:

  • msi_domain_prepare_irqs drops into the ITS code to do the PCIe-device-related mapping; this is where the PCIe device is added to the ITS device lookup table
  • __irq_domain_alloc_irqs allocates the hardware irq number (hwirq); this is where the hwirq comes from, entering the ITS via irq_domain_alloc_irqs_hierarchy to allocate the hardware interrupt
  • irq_domain_activate_irq enters the ITS to configure the interrupt, programming the PCIe device's irq into the ITT

Let's go through them one by one.
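
Before diving in, it helps to keep the irq_domain hierarchy in mind. Roughly (exact domain names depend on the platform and kernel version):

	PCI/MSI domain (kernel/irq/msi.c + irq-gic-v3-its-pci-msi.c)
	    └── ITS domain (irq-gic-v3-its.c, its_domain_ops)
	        └── GICv3 domain (irq-gic-v3.c)

Allocation and activation walk this hierarchy, which is why most of the real work ends up in the ITS driver.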

int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
			    int nvec, msi_alloc_info_t *arg)
{
	ret = ops->msi_check(domain, info, dev);
	if (ret == 0)
		ret = ops->msi_prepare(domain, dev, nvec, arg);

	return ret;
}

msi_prepare is registered by irq-gic-v3-its-pci-msi.c:
drivers/irqchip/irq-gic-v3-its-pci-msi.c

static int its_pci_msi_prepare(struct irq_domain *domain, struct device *dev,
			       int nvec, msi_alloc_info_t *info)
{
	msi_info = msi_get_domain_info(domain->parent);

	pdev = to_pci_dev(dev);

	pci_for_each_dma_alias(pdev, its_get_pci_alias, &alias_dev);
	if (alias_dev != pdev && alias_dev->subordinate)
		pci_walk_bus(alias_dev->subordinate, its_pci_msi_vec_count,
			     &alias_count);

	/* ITS specific DeviceID, as the core ITS ignores dev. */
	info->scratchpad[0].ul = pci_msi_domain_get_msi_rid(domain, pdev);
	return msi_info->ops->msi_prepare(domain->parent, dev, nvec, info);
}

static struct msi_domain_ops its_pci_msi_ops = {
	.msi_prepare	= its_pci_msi_prepare,
};

The code above mainly generates the device_id and stores it in info->scratchpad[0].ul; msi_prepare then chains into irq-gic-v3-its.c:
drivers/irqchip/irq-gic-v3-its.c

static int its_msi_prepare(struct irq_domain *domain, struct device *dev,
			   int nvec, msi_alloc_info_t *info)
{
	u32 dev_id = info->scratchpad[0].ul;	/* DeviceID set by its_pci_msi_prepare() */

	its_dev = its_create_device(its, dev_id, nvec, true);

	info->scratchpad[0].ptr = its_dev;	/* consumed later by its_irq_domain_alloc() */
}

static struct msi_domain_ops its_msi_domain_ops = {
	.msi_prepare	= its_msi_prepare,
};
static struct its_device *its_create_device(struct its_node *its, u32 dev_id,
					    int nvecs, bool alloc_lpis)
{
	struct its_device *dev;
	unsigned long *lpi_map = NULL;
	unsigned long flags;
	u16 *col_map = NULL;
	void *itt;
	int lpi_base;
	int nr_lpis;
	int nr_ites;
	int sz;
	gfp_t gfp_flags;

	if (!its_alloc_device_table(its, dev_id))
		return NULL;

	if (WARN_ON(!is_power_of_2(nvecs)))
		nvecs = roundup_pow_of_two(nvecs);

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	/*
	 * Even if the device wants a single LPI, the ITT must be
	 * sized as a power of two (and you need at least one bit...).
	 */
	nr_ites = max(2, nvecs);
	sz = nr_ites * (FIELD_GET(GITS_TYPER_ITT_ENTRY_SIZE, its->typer) + 1);
	sz = max(sz, ITS_ITT_ALIGN) + ITS_ITT_ALIGN - 1;
	gfp_flags = GFP_KERNEL;
	if (of_machine_is_compatible("rockchip,rk3568") || of_machine_is_compatible("rockchip,rk3566")) {
		gfp_flags |= GFP_DMA32;
		itt = (void *)__get_free_pages(gfp_flags, get_order(sz));
	} else {
		itt = kzalloc_node(sz, gfp_flags, its->numa_node);
	}

	if (alloc_lpis) {
		lpi_map = its_lpi_alloc(nvecs, &lpi_base, &nr_lpis);
		if (lpi_map)
			col_map = kcalloc(nr_lpis, sizeof(*col_map),
					  GFP_KERNEL);
	} else {
		col_map = kcalloc(nr_ites, sizeof(*col_map), GFP_KERNEL);
		nr_lpis = 0;
		lpi_base = 0;
	}

	if (!dev || !itt ||  !col_map || (!lpi_map && alloc_lpis)) {
		kfree(dev);

		if (of_machine_is_compatible("rockchip,rk3568") ||
		    of_machine_is_compatible("rockchip,rk3566"))
			free_pages((unsigned long)itt, get_order(sz));
		else
			kfree(itt);

		kfree(lpi_map);
		kfree(col_map);
		return NULL;
	}

	gic_flush_dcache_to_poc(itt, sz);

	dev->its = its;
	dev->itt = itt;
	dev->itt_sz = sz;
	dev->nr_ites = nr_ites;
	dev->event_map.lpi_map = lpi_map;
	dev->event_map.col_map = col_map;
	dev->event_map.lpi_base = lpi_base;
	dev->event_map.nr_lpis = nr_lpis;
	raw_spin_lock_init(&dev->event_map.vlpi_lock);
	dev->device_id = dev_id;
	INIT_LIST_HEAD(&dev->entry);

	raw_spin_lock_irqsave(&its->lock, flags);
	list_add(&dev->entry, &its->its_device_list);
	raw_spin_unlock_irqrestore(&its->lock, flags);

	/* Map device to its ITT */
	its_send_mapd(dev, 1);

	return dev;
}

This mostly builds ITS commands, which are finally sent to the hardware via its_send_mapd; for the details, see the GIC v3 deep dive.
One key point to note is its_lpi_alloc(nvecs, &lpi_base, &nr_lpis): lpi_base is allocated here, and it determines the device's event_id values and the hwirq numbers generated later. Let's look at the allocation algorithm:

struct lpi_range {
	struct list_head	entry;
	u32			base_id;
	u32			span;
};

static int alloc_lpi_range(u32 nr_lpis, u32 *base)
{
	struct lpi_range *range, *tmp;
	int err = -ENOSPC;

	mutex_lock(&lpi_range_lock);

	list_for_each_entry_safe(range, tmp, &lpi_range_list, entry) {
		if (range->span >= nr_lpis) {
			*base = range->base_id;
			range->base_id += nr_lpis;
			range->span -= nr_lpis;

			if (range->span == 0) {
				list_del(&range->entry);
				kfree(range);
			}

			err = 0;
			break;
		}
	}

	mutex_unlock(&lpi_range_lock);

	pr_debug("ITS: alloc %u:%u\n", *base, nr_lpis);
	return err;
}

static unsigned long *its_lpi_alloc(int nr_irqs, u32 *base, int *nr_ids)
{
	unsigned long *bitmap = NULL;
	int err = 0;

	do {
		err = alloc_lpi_range(nr_irqs, base);
		if (!err)
			break;

		nr_irqs /= 2;
	} while (nr_irqs > 0);

	if (!nr_irqs)
		err = -ENOSPC;

	if (err)
		goto out;

	bitmap = kcalloc(BITS_TO_LONGS(nr_irqs), sizeof (long), GFP_ATOMIC);
	if (!bitmap)
		goto out;

	*nr_ids = nr_irqs;

out:
	if (!bitmap)
		*base = *nr_ids = 0;

	return bitmap;
}

lpi_base is allocated by alloc_lpi_range, which takes the first large-enough range from the lpi_range list. The lpi_range list manages allocation, freeing and merging of lpi_base values. At first glance it looks like the lpi_range list is never initialized; in fact it is initialized by a "free" operation that inserts an initial range into the list:

static int free_lpi_range(u32 base, u32 nr_lpis)
{
	struct lpi_range *new, *old;

	new = mk_lpi_range(base, nr_lpis);
	if (!new)
		return -ENOMEM;

	mutex_lock(&lpi_range_lock);

	list_for_each_entry_reverse(old, &lpi_range_list, entry) {
		if (old->base_id < base)
			break;
	}
	/*
	 * old is the last element with ->base_id smaller than base,
	 * so new goes right after it. If there are no elements with
	 * ->base_id smaller than base, &old->entry ends up pointing
	 * at the head of the list, and inserting new it the start of
	 * the list is the right thing to do in that case as well.
	 */
	list_add(&new->entry, &old->entry);
	/*
	 * Now check if we can merge with the preceding and/or
	 * following ranges.
	 */
	merge_lpi_ranges(old, new);
	merge_lpi_ranges(new, list_next_entry(new, entry));

	mutex_unlock(&lpi_range_lock);
	return 0;
}
static int __init its_lpi_init(u32 id_bits)
{
	u32 lpis = (1UL << id_bits) - 8192;
	u32 numlpis;
	int err;

	numlpis = 1UL << GICD_TYPER_NUM_LPIS(gic_rdists->gicd_typer);

	if (numlpis > 2 && !WARN_ON(numlpis > lpis)) {
		lpis = numlpis;
		pr_info("ITS: Using hypervisor restricted LPI range [%u]\n",
			lpis);
	}

	/*
	 * Initializing the allocator is just the same as freeing the
	 * full range of LPIs.
	 */
	err = free_lpi_range(8192, lpis);
	pr_debug("ITS: Allocator initialized for %u LPIs\n", lpis);
	return err;
}
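
free_lpi_range relies on two small helpers that are not quoted above, mk_lpi_range and merge_lpi_ranges. For reference, they look roughly like this in the same file (reconstructed from mainline; details may differ by kernel version):

static struct lpi_range *mk_lpi_range(u32 base, u32 span)
{
	struct lpi_range *range;

	range = kmalloc(sizeof(*range), GFP_KERNEL);
	if (range) {
		range->base_id = base;
		range->span = span;
	}

	return range;
}

/* Merge range a into its successor b when they are adjacent, then drop a */
static void merge_lpi_ranges(struct lpi_range *a, struct lpi_range *b)
{
	if (&a->entry == &lpi_range_list || &b->entry == &lpi_range_list)
		return;
	if (a->base_id + a->span != b->base_id)
		return;
	b->base_id = a->base_id;
	b->span += a->span;
	list_del(&a->entry);
	kfree(a);
}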

As you can see, during LPI initialization free_lpi_range is called to insert the first range into the list: it starts at 8192, and its size is computed from the GIC's capabilities. All later allocations come out of this range. lpi_base is the starting hwirq value for the current device, and subsequent vectors simply increment from there.
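
A concrete (hypothetical) example: if the first PCIe device requests 4 MSI-X vectors, alloc_lpi_range returns base 8192, so that device gets lpi_base = 8192 and its hwirqs will be 8192-8195; a second device requesting 2 vectors then gets lpi_base = 8196 and hwirqs 8196-8197 (nvecs has already been rounded up to a power of two in its_create_device).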

Next, let's look at __irq_domain_alloc_irqs:

int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
			    unsigned int nr_irqs, int node, void *arg,
			    bool realloc, const struct irq_affinity_desc *affinity)
{
	int i, ret, virq;

	if (domain == NULL) {
		domain = irq_default_domain;
		if (WARN(!domain, "domain is NULL; cannot allocate IRQ\n"))
			return -EINVAL;
	}

	if (realloc && irq_base >= 0) {
		virq = irq_base;
	} else {
		virq = irq_domain_alloc_descs(irq_base, nr_irqs, 0, node,
					      affinity);
		if (virq < 0) {
			pr_debug("cannot allocate IRQ(base %d, count %d)\n",
				 irq_base, nr_irqs);
			return virq;
		}
	}

	if (irq_domain_alloc_irq_data(domain, virq, nr_irqs)) {
		pr_debug("cannot allocate memory for IRQ%d\n", virq);
		ret = -ENOMEM;
		goto out_free_desc;
	}

	mutex_lock(&irq_domain_mutex);
	ret = irq_domain_alloc_irqs_hierarchy(domain, virq, nr_irqs, arg);
	if (ret < 0) {
		mutex_unlock(&irq_domain_mutex);
		goto out_free_irq_data;
	}

	for (i = 0; i < nr_irqs; i++) {
		ret = irq_domain_trim_hierarchy(virq + i);
		if (ret) {
			mutex_unlock(&irq_domain_mutex);
			goto out_free_irq_data;
		}
	}
	
	for (i = 0; i < nr_irqs; i++)
		irq_domain_insert_irq(virq + i);
	mutex_unlock(&irq_domain_mutex);

	return virq;

out_free_irq_data:
	irq_domain_free_irq_data(virq, nr_irqs);
out_free_desc:
	irq_free_descs(virq, nr_irqs);
	return ret;
}

We mainly follow irq_domain_alloc_irqs_hierarchy:

int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain,
				    unsigned int irq_base,
				    unsigned int nr_irqs, void *arg)
{
	if (!domain->ops->alloc) {
		pr_debug("domain->ops->alloc() is NULL\n");
		return -ENOSYS;
	}

	return domain->ops->alloc(domain, irq_base, nr_irqs, arg);
}

This takes us into the ITS driver:
drivers/irqchip/irq-gic-v3-its.c

static int its_irq_domain_alloc(struct irq_domain *domain, unsigned int virq,
				unsigned int nr_irqs, void *args)
{
	msi_alloc_info_t *info = args;
	struct its_device *its_dev = info->scratchpad[0].ptr;
	struct its_node *its = its_dev->its;
	struct irq_data *irqd;
	irq_hw_number_t hwirq;
	int err;
	int i;

	err = its_alloc_device_irq(its_dev, nr_irqs, &hwirq);
	if (err)
		return err;

	err = iommu_dma_prepare_msi(info->desc, its->get_msi_base(its_dev));
	if (err)
		return err;

	for (i = 0; i < nr_irqs; i++) {
		err = its_irq_gic_domain_alloc(domain, virq + i, hwirq + i);
		if (err)
			return err;

		irq_domain_set_hwirq_and_chip(domain, virq + i,
					      hwirq + i, &its_irq_chip, its_dev);
		irqd = irq_get_irq_data(virq + i);
		irqd_set_single_target(irqd);
		irqd_set_affinity_on_activate(irqd);
		pr_debug("ID:%d pID:%d vID:%d\n",
			 (int)(hwirq + i - its_dev->event_map.lpi_base),
			 (int)(hwirq + i), virq + i);
	}

	return 0;
}
static const struct irq_domain_ops its_domain_ops = {
	.alloc			= its_irq_domain_alloc,
	.free			= its_irq_domain_free,
	.activate		= its_irq_domain_activate,
	.deactivate		= its_irq_domain_deactivate,
};

The first call in the code above, its_alloc_device_irq, is where the hwirq is actually allocated:

static int its_alloc_device_irq(struct its_device *dev, int nvecs, irq_hw_number_t *hwirq)
{
	int idx;

	/* Find a free LPI region in lpi_map and allocate them. */
	idx = bitmap_find_free_region(dev->event_map.lpi_map,
				      dev->event_map.nr_lpis,
				      get_count_order(nvecs));
	if (idx < 0)
		return -ENOSPC;

	*hwirq = dev->event_map.lpi_base + idx;

	return 0;
}

As you can see, hwirq is simply lpi_base plus an index, which confirms the earlier point that lpi_base is the device's starting hwirq.

Now let's go back to irq_domain_activate_irq:

static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve)
{
	int ret = 0;

	if (irqd && irqd->domain) {
		struct irq_domain *domain = irqd->domain;

		if (irqd->parent_data)
			ret = __irq_domain_activate_irq(irqd->parent_data,
							reserve);
		if (!ret && domain->ops->activate) {
			ret = domain->ops->activate(domain, irqd, reserve);
			/* Rollback in case of error */
			if (ret && irqd->parent_data)
				__irq_domain_deactivate_irq(irqd->parent_data);
		}
	}
	return ret;
}

int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve)
{
	int ret = 0;

	if (!irqd_is_activated(irq_data))
		ret = __irq_domain_activate_irq(irq_data, reserve);
	if (!ret)
		irqd_set_activated(irq_data);
	return ret;
}

The key line above is domain->ops->activate, which is where the ITS gets configured. Note that __irq_domain_activate_irq recurses into the parent domain's activate first: in the domain hierarchy the PCI MSI irq domain sits on top of the ITS irq domain (via its msi-parent in the device tree), so the ITS domain is activated first and the PCI domain afterwards. Following that order, let's look at the ITS irq domain first:

static inline u32 its_get_event_id(struct irq_data *d)
{
	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
	return d->hwirq - its_dev->event_map.lpi_base;
}

static int its_irq_domain_activate(struct irq_domain *domain,
				   struct irq_data *d, bool reserve)
{
	struct its_device *its_dev = irq_data_get_irq_chip_data(d);
	u32 event = its_get_event_id(d);
	int cpu;

	cpu = its_select_cpu(d, cpu_online_mask);
	if (cpu < 0 || cpu >= nr_cpu_ids)
		return -EINVAL;

	its_inc_lpi_count(d, cpu);
	its_dev->event_map.col_map[event] = cpu;
	irq_data_update_effective_affinity(d, cpumask_of(cpu));

	/* Map the GIC IRQ and event to the device */
	its_send_mapti(its_dev, d->hwirq, event);
	return 0;
}
static const struct irq_domain_ops its_domain_ops = {
	.alloc			= its_irq_domain_alloc,
	.free			= its_irq_domain_free,
	.activate		= its_irq_domain_activate,
	.deactivate		= its_irq_domain_deactivate,
};

Again this builds an ITS command and sends it to the hardware via its_send_mapti. Notice that the event ID is simply the index of the hwirq within the current device, i.e. the per-device interrupt number.
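
Continuing the hypothetical numbers from earlier: for the device with lpi_base = 8192, the vector with hwirq 8194 has event ID 8194 - 8192 = 2, and the MAPTI command tells the ITS that event 2 from this DeviceID should raise LPI 8194.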

Now let's look at the PCI domain side. The ITS domain above programmed the interrupt information into the ITS; next, the same interrupt information (the MSI message) is written into the device's MSI-X table, which lives in one of its BARs:
kernel/irq/msi.c

static inline void irq_chip_write_msi_msg(struct irq_data *data,
					  struct msi_msg *msg)
{
	data->chip->irq_write_msi_msg(data, msg);
}

static int msi_domain_activate(struct irq_domain *domain,
			       struct irq_data *irq_data, bool early)
{
	struct msi_msg msg[2] = { [1] = { }, };

	BUG_ON(irq_chip_compose_msi_msg(irq_data, msg));
	msi_check_level(irq_data->domain, msg);
	irq_chip_write_msi_msg(irq_data, msg);
	return 0;
}

static const struct irq_domain_ops msi_domain_ops = {
	.alloc		= msi_domain_alloc,
	.free		= msi_domain_free,
	.activate	= msi_domain_activate,
	.deactivate	= msi_domain_deactivate,
};

drivers/pci/msi.c

static void pci_msi_domain_update_chip_ops(struct msi_domain_info *info)
{
	struct irq_chip *chip = info->chip;

	BUG_ON(!chip);
	if (!chip->irq_write_msi_msg)
		chip->irq_write_msi_msg = pci_msi_domain_write_msg;
	if (!chip->irq_mask)
		chip->irq_mask = pci_msi_mask_irq;
	if (!chip->irq_unmask)
		chip->irq_unmask = pci_msi_unmask_irq;
}
void pci_msi_domain_write_msg(struct irq_data *irq_data, struct msi_msg *msg)
{
	struct msi_desc *desc = irq_data_get_msi_desc(irq_data);

	/*
	 * For MSI-X desc->irq is always equal to irq_data->irq. For
	 * MSI only the first interrupt of MULTI MSI passes the test.
	 */
	if (desc->irq == irq_data->irq)
		__pci_write_msi_msg(desc, msg);
}
void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
{
	struct pci_dev *dev = msi_desc_to_pci_dev(entry);

	if (dev->current_state != PCI_D0 || pci_dev_is_disconnected(dev)) {
		/* Don't touch the hardware now */
	} else if (entry->msi_attrib.is_msix) {
		void __iomem *base = pci_msix_desc_addr(entry);
		bool unmasked = !(entry->masked & PCI_MSIX_ENTRY_CTRL_MASKBIT);

		if (!base)
			goto skip;

		/*
		 * The specification mandates that the entry is masked
		 * when the message is modified:
		 *
		 * "If software changes the Address or Data value of an
		 * entry while the entry is unmasked, the result is
		 * undefined."
		 */
		if (unmasked)
			__pci_msix_desc_mask_irq(entry, PCI_MSIX_ENTRY_CTRL_MASKBIT);

		writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
		writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
		writel(msg->data, base + PCI_MSIX_ENTRY_DATA);

		if (unmasked)
			__pci_msix_desc_mask_irq(entry, 0);

		/* Ensure that the writes are visible in the device */
		readl(base + PCI_MSIX_ENTRY_DATA);
	} else {
		int pos = dev->msi_cap;
		u16 msgctl;

		pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
		msgctl &= ~PCI_MSI_FLAGS_QSIZE;
		msgctl |= entry->msi_attrib.multiple << 4;
		pci_write_config_word(dev, pos + PCI_MSI_FLAGS, msgctl);

		pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_LO,
				       msg->address_lo);
		if (entry->msi_attrib.is_64) {
			pci_write_config_dword(dev, pos + PCI_MSI_ADDRESS_HI,
					       msg->address_hi);
			pci_write_config_word(dev, pos + PCI_MSI_DATA_64,
					      msg->data);
		} else {
			pci_write_config_word(dev, pos + PCI_MSI_DATA_32,
					      msg->data);
		}
		/* Ensure that the writes are visible in the device */
		pci_read_config_word(dev, pos + PCI_MSI_FLAGS, &msgctl);
	}

skip:
	entry->msg = *msg;

	if (entry->write_msi_msg)
		entry->write_msi_msg(entry, entry->write_msi_msg_data);

}

At this point the MSI message composed earlier is written into the device's MSI-X table in its BAR, and the whole flow is complete.
