1,PCI总线介绍
在PC时代的早期,外部设备通过ISA总线接入计算机。ISA总线只有24根地址线,因此其上的外部设备只能访问内存的低16M地址空间。这种硬件上的限制对于如今需要大量IO memory的外设来说是不可接受的。PCI总线正是为了解决ISA的这些先天缺陷而提出的。
PCI设计使用32位地址线,使得PCI设备可以映射到32位内存空间的任意位置。PCI最大的优势在于,为各设备设计了独立的配置空间:使得各个设备在系统加电过程中就分配好自身所需的IO内存,中断irq等资源(当然,也可以在系统运行过程中动态指定资源,但通常不必这么做)。这样做的好处是,各PCI设备所需的资源可以通过跟PCI控制器协商动态确定,驱动不必关心资源的分配和管理。PCI配置空间长度为256字节,前64字节是协议固定的,后面的192个字节预留给厂商定义。下图(图片来源于网络),展示了PCI配置空间的前64字节定义:
其中,DeviceID, VendorID, class, SubsystemID, SubsystemVendorID表示设备ID,厂商ID,设备类,子设备ID,子厂商ID等,用于标识一个PCI设备。Base Address Register [0-5] 表示设备的基址。InterruptPin和InterruptLine表示设备中断引脚和中断irq。
2,PCI设备的寻址
那么,我们如何表示一个特定的PCI设备呢?PCI使用32位数字标识一个设备,通过 域:总线:设备:功能 的划分来表示。域,一般占用16位;总线,一般占用8位;设备一般占用5位;功能,一般占用3位。因此,一个PCI总线通常可以支持32个设备,一个设备通常可以支持8个功能。
3,Linux下PCI设备驱动框架
PCI总线设备驱动框架遵循linux的总线/设备/驱动框架模型。使用pci_bus表示一条PCI总线,pci_dev表示一个pci设备,pci_driver表示一个pci驱动。通常,在系统启动过程中,会扫描系统内所有的pci设备,并加入pci_bus。pci_driver在注册到pci_bus的过程中,会扫描pci_bus下挂的pci_dev,并匹配关联相关的pci设备。
PCI总线的本质是提供了设备资源(IO memory/IRQ等)的动态分配。驱动可以通过PCI控制器获取设备的IO memory,irq等设备资源,但仍然需要定义字符设备/块设备,或者sysfs接口(用于向上提供用户访问接口)以及提供设备特定的操作方法(用来实现设备特定的功能)。一个标准的PCI设备驱动,需要同时包含一个字符设备(块设备,sysfs等),和一个pci_driver。pci_driver在注册的过程中,匹配到相应的pci_dev后,通常调用pci_driver的probe()方法来初始化设备特定的资源(IO memory/ irq等)。用户通过字符设备(块设备,sysfs等)获取到该pci设备后,通过初始化好的设备资源,就可以跟设备进行通信,从而实现设备的特定功能。
数据结构:
(1)struct pci_bus:
/*
 * struct pci_bus - one PCI bus in the bus tree.
 *
 * Buses are linked to their parent and children, and each bus keeps the
 * list of devices that sit on it.  The embedded struct device ties the
 * bus into the driver model.
 */
struct pci_bus {
struct list_head node; /* node in list of buses */
struct pci_bus *parent; /* parent bus this bridge is on */
struct list_head children; /* list of child buses */
struct list_head devices; /* list of devices on this bus */
struct pci_dev *self; /* bridge device as seen by parent */
struct list_head slots; /* list of slots on this bus;
protected by pci_slot_mutex */
struct resource *resource[PCI_BRIDGE_RESOURCE_NUM]; /* resources held by the bridge */
struct list_head resources; /* address space routed to this bus */
struct resource busn_res; /* bus numbers routed to this bus */
struct pci_ops *ops; /* configuration access functions */
struct msi_controller *msi; /* MSI controller */
void *sysdata; /* hook for sys-specific extension */
struct proc_dir_entry *procdir; /* directory entry in /proc/bus/pci */
unsigned char number; /* bus number */
unsigned char primary; /* number of primary bridge */
unsigned char max_bus_speed; /* enum pci_bus_speed */
unsigned char cur_bus_speed; /* enum pci_bus_speed */
#ifdef CONFIG_PCI_DOMAINS_GENERIC
int domain_nr;
#endif
char name[48];
unsigned short bridge_ctl; /* manage NO_ISA/FBB/et al behaviors */
pci_bus_flags_t bus_flags; /* inherited by child buses */
struct device *bridge;
struct device dev; /* embedded driver-model device: the bus is itself a device */
struct bin_attribute *legacy_io; /* legacy I/O for this bus */
struct bin_attribute *legacy_mem; /* legacy mem */
unsigned int is_added:1;
};
(2)struct pci_dev
/*
 * struct pci_dev - one PCI device (function) as seen by the kernel.
 *
 * Holds the identification values (vendor/device/subsystem/class), the
 * bus the device sits on, the driver bound to it, and the irq and
 * resource[] values drivers should use instead of reading the interrupt
 * line and base address registers directly.
 */
struct pci_dev {
struct list_head bus_list; /* node in per-bus list */
struct pci_bus *bus; /* bus this device is on */
struct pci_bus *subordinate; /* bus this device bridges to */
void *sysdata; /* hook for sys-specific extension */
struct proc_dir_entry *procent; /* device entry in /proc/bus/pci */
struct pci_slot *slot; /* Physical slot this device is in */
unsigned int devfn; /* encoded device & function index */
unsigned short vendor; /* vendor ID */
unsigned short device; /* device ID */
unsigned short subsystem_vendor; /* subsystem vendor ID */
unsigned short subsystem_device; /* subsystem device ID */
unsigned int class; /* 3 bytes: (base,sub,prog-if) */
u8 revision; /* PCI revision, low byte of class word */
u8 hdr_type; /* PCI header type (`multi' flag masked out) */
u8 pcie_cap; /* PCIe capability offset */
u8 msi_cap; /* MSI capability offset */
u8 msix_cap; /* MSI-X capability offset */
u8 pcie_mpss:3; /* PCIe Max Payload Size Supported */
u8 rom_base_reg; /* which config register controls the ROM */
u8 pin; /* which interrupt pin this device uses */
u16 pcie_flags_reg; /* cached PCIe Capabilities Register */
u8 dma_alias_devfn;/* devfn of DMA alias, if any */
struct pci_driver *driver; /* which driver has allocated this device */
u64 dma_mask; /* Mask of the bits of bus address this
device implements. Normally this is
0xffffffff. You only need to change
this if your device has broken DMA
or supports 64-bit transfers. */
struct device_dma_parameters dma_parms;
pci_power_t current_state; /* Current operating state. In ACPI-speak,
this is D0-D3, D0 being fully functional,
and D3 being off. */
u8 pm_cap; /* PM capability offset */
unsigned int pme_support:5; /* Bitmask of states from which PME#
can be generated */
unsigned int pme_interrupt:1;
unsigned int pme_poll:1; /* Poll device's PME status bit */
unsigned int d1_support:1; /* Low power state D1 is supported */
unsigned int d2_support:1; /* Low power state D2 is supported */
unsigned int no_d1d2:1; /* D1 and D2 are forbidden */
unsigned int no_d3cold:1; /* D3cold is forbidden */
unsigned int d3cold_allowed:1; /* D3cold is allowed by user */
unsigned int mmio_always_on:1; /* disallow turning off io/mem
decoding during bar sizing */
unsigned int wakeup_prepared:1;
unsigned int runtime_d3cold:1; /* whether go through runtime
D3cold, not set for devices
powered on/off by the
corresponding bridge */
unsigned int ignore_hotplug:1; /* Ignore hotplug events */
unsigned int d3_delay; /* D3->D0 transition time in ms */
unsigned int d3cold_delay; /* D3cold->D0 transition time in ms */
#ifdef CONFIG_PCIEASPM
struct pcie_link_state *link_state; /* ASPM link state */
#endif
pci_channel_state_t error_state; /* current connectivity state */
struct device dev; /* Generic device interface (driver-model device) */
int cfg_size; /* Size of configuration space */
/*
* Instead of touching interrupt line and base address registers
* directly, use the values stored here. They might be different!
*/
unsigned int irq; /* IRQ number the driver should use for this device */
struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
bool match_driver; /* Skip attaching driver */
/* These fields are used by common fixups */
unsigned int transparent:1; /* Subtractive decode PCI bridge */
unsigned int multifunction:1;/* Part of multi-function device */
/* keep track of device state */
unsigned int is_added:1;
unsigned int is_busmaster:1; /* device is busmaster */
unsigned int no_msi:1; /* device may not use msi */
unsigned int no_64bit_msi:1; /* device may only use 32-bit MSIs */
unsigned int block_cfg_access:1; /* config space access is blocked */
unsigned int broken_parity_status:1; /* Device generates false positive parity */
unsigned int irq_reroute_variant:2; /* device needs IRQ rerouting variant */
unsigned int msi_enabled:1;
unsigned int msix_enabled:1;
unsigned int ari_enabled:1; /* ARI forwarding */
unsigned int ats_enabled:1; /* Address Translation Service */
unsigned int is_managed:1;
unsigned int needs_freset:1; /* Dev requires fundamental reset */
unsigned int state_saved:1;
unsigned int is_physfn:1;
unsigned int is_virtfn:1;
unsigned int reset_fn:1;
unsigned int is_hotplug_bridge:1;
unsigned int __aer_firmware_first_valid:1;
unsigned int __aer_firmware_first:1;
unsigned int broken_intx_masking:1;
unsigned int io_window_1k:1; /* Intel P2P bridge 1K I/O windows */
unsigned int irq_managed:1;
unsigned int has_secondary_link:1;
pci_dev_flags_t dev_flags;
atomic_t enable_cnt; /* pci_enable_device has been called */
u32 saved_config_space[16]; /* config space saved at suspend time */
struct hlist_head saved_cap_space;
struct bin_attribute *rom_attr; /* attribute descriptor for sysfs ROM entry */
int rom_attr_enabled; /* has display of the rom attribute been enabled? */
struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE]; /* sysfs file for resources */
struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE]; /* sysfs file for WC mapping of resources */
#ifdef CONFIG_PCI_MSI
const struct attribute_group **msi_irq_groups;
#endif
struct pci_vpd *vpd;
#ifdef CONFIG_PCI_ATS
union {
struct pci_sriov *sriov; /* SR-IOV capability related */
struct pci_dev *physfn; /* the PF this VF is associated with */
};
u16 ats_cap; /* ATS Capability offset */
u8 ats_stu; /* ATS Smallest Translation Unit */
atomic_t ats_ref_cnt; /* number of VFs with ATS enabled */
#endif
phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */
size_t romlen; /* Length of ROM if it's not from the BAR */
char *driver_override; /* Driver name to force a match */
};
(3)struct pci_driver
/*
 * struct pci_driver - a PCI device driver.
 *
 * Wraps a generic struct device_driver; registering the pci_driver
 * registers that embedded driver.  id_table lists the devices the
 * driver supports, and probe()/remove() are the per-device callbacks.
 */
struct pci_driver {
struct list_head node; /* list linkage for this driver */
const char *name; /* driver name */
const struct pci_device_id *id_table; /* must be non-NULL for probe to be called; PCI device IDs this driver supports */
int (*probe) (struct pci_dev *dev, const struct pci_device_id *id); /* New device inserted. Typically allocates the char/block device that provides the user-facing interface, fetches the device resources from pci_dev and initialises the device-specific structures. */
void (*remove) (struct pci_dev *dev); /* Device removed (NULL if not a hot-plug capable driver); the inverse of probe */
int (*suspend) (struct pci_dev *dev, pm_message_t state); /* Device suspended */
int (*suspend_late) (struct pci_dev *dev, pm_message_t state);
int (*resume_early) (struct pci_dev *dev);
int (*resume) (struct pci_dev *dev); /* Device woken up */
void (*shutdown) (struct pci_dev *dev);
int (*sriov_configure) (struct pci_dev *dev, int num_vfs); /* PF pdev */
const struct pci_error_handlers *err_handler;
struct device_driver driver; /* embedded driver-model driver, the part that gets registered on the bus */
struct pci_dynids dynids;
};
框架架构:
(1)PCI设备的遍历
Linux系统在启动过程中,会创建pci_bus根节点,并扫描由bios等创建好的所有pci_dev,加入pci_bus,构建pci_bus树(pci设备类型可能是bridge等)。以x86 legacy方式为例,pci控制器有两个32位端口:地址(控制)端口0xCF8,数据端口0xCFC。
/*
 * Legacy PCI initialisation: when raw config-space accessors are
 * available, scan root bus 0 (root bus, bridges and the devices behind
 * them).  Always returns 0, whether or not PCI is present.
 */
int __init pci_legacy_init(void)
{
	if (raw_pci_ops) {
		printk("PCI: Probing PCI hardware\n");
		/* Build the device tree below root bus 0. */
		pcibios_scan_root(0);
	} else {
		printk("PCI: System does not support PCI\n");
	}

	return 0;
}
/*
 * Scan the root bus numbered @busnum and add the devices found on it.
 *
 * Allocates the per-bus sysdata, collects the root bus resources, scans
 * the bus, then adds the discovered devices.  If the scan fails, the
 * resource list and the sysdata are released again.
 */
void pcibios_scan_root(int busnum)
{
	LIST_HEAD(resources);
	struct pci_sysdata *sysdata;
	struct pci_bus *root;

	sysdata = kzalloc(sizeof(*sysdata), GFP_KERNEL);
	if (sysdata == NULL) {
		printk(KERN_ERR "PCI: OOM, skipping PCI bus %02x\n", busnum);
		return;
	}

	sysdata->node = x86_pci_root_bus_node(busnum);
	x86_pci_root_bus_resources(busnum, &resources);

	printk(KERN_DEBUG "PCI: Probing PCI hardware (bus %02x)\n", busnum);

	/* Scan the root bus; devices on it may be bridges (child buses) or endpoints. */
	root = pci_scan_root_bus(NULL, busnum, &pci_root_ops, sysdata, &resources);
	if (root != NULL) {
		/* Scan succeeded: add the discovered devices. */
		pci_bus_add_devices(root);
		return;
	}

	/* Scan failed: undo the allocations made above. */
	pci_free_resource_list(&resources);
	kfree(sysdata);
}
/*
 * Scan a root bus without an MSI controller: forwards every argument to
 * pci_scan_root_bus_msi() and passes NULL for the controller.
 */
struct pci_bus *pci_scan_root_bus(struct device *parent, int bus,
		struct pci_ops *ops, void *sysdata, struct list_head *resources)
{
	struct msi_controller *no_msi = NULL;

	return pci_scan_root_bus_msi(parent, bus, ops, sysdata, resources,
				     no_msi);
}
struct pci_bus *pci_scan_root_bus_msi(struct device *parent, int bus,
struct pci_ops *ops, void *sysdata,
struct list_head *resources, struct msi_controller *msi)
{
struct resource_entry *window;
bool found = false;
struct pci_bus *b;
int max;
resource_list_for_each_entry(window, resources)
if (window->res->flags & IORESOURCE_BUS) {
found = true;
break;
}
b = pci_create_root_bus(parent, bus, ops, sysdata, resources); // 创建pci根总线节点
if (!b)
return NULL;
b->msi = msi;
if (!found) {
dev_info(&b->dev,
"No busn resource found for root bus, will use [bus %02x-ff]\n",
bus);
pci_bus_insert_busn_res(b, bus, 255);
}
max = pci_scan_child_bus(b); // 扫描根总线节点,创建总线设备。当设备类型是bridge时,创建子总线,并递归扫描创建子总线设备
if (!found)
pci_bus_update_busn_res_end(b, max);
return b;
}
/*
 * Scan @bus: probe every slot, apply the one-time bus fixups, then look
 * behind every bridge on the bus in two passes.
 *
 * Returns the highest bus number reached while scanning.
 */
unsigned int pci_scan_child_bus(struct pci_bus *bus)
{
	unsigned int slot_fn, round, highest = bus->busn_res.start;
	struct pci_dev *pdev;

	dev_dbg(&bus->dev, "scanning bus\n");

	/* devfn values 0x00..0xf8 in steps of 8: one call per slot. */
	for (slot_fn = 0; slot_fn < 0x100; slot_fn += 8) {
		pci_scan_slot(bus, slot_fn);
	}

	/* Reserve buses for SR-IOV capability. */
	highest += pci_iov_bus_range(bus);

	/* Arch-dependent fixups run once per bus, before bridges are scanned. */
	if (!bus->is_added) {
		dev_dbg(&bus->dev, "fixups for bus\n");
		pcibios_fixup_bus(bus);
		bus->is_added = 1;
	}

	/* Two passes over the devices; each bridge is handed to pci_scan_bridge(). */
	for (round = 0; round < 2; round++) {
		list_for_each_entry(pdev, &bus->devices, bus_list) {
			if (!pci_is_bridge(pdev))
				continue;
			highest = pci_scan_bridge(bus, pdev, highest, round);
		}
	}

	/*
	 * Everything on this bus and behind its bridges has been scanned;
	 * report how far the sub-bus numbering got.
	 */
	dev_dbg(&bus->dev, "bus scan returning with max=%02x\n", highest);
	return highest;
}
/*
 * Scan the bus behind bridge @dev, which sits on @bus.
 *
 * Reads the bridge's primary/secondary/subordinate bus numbers from the
 * PCI_PRIMARY_BUS dword.  Bridges that already carry a usable
 * configuration are handled when @pass is 0; bridges that still need a
 * bus number are handled when @pass is non-zero, using @max + 1 as the
 * new secondary bus number.  Master-abort reporting in the bridge
 * control register is switched off while probing and the original value
 * is written back before returning.
 *
 * Returns the updated highest bus number (@max if nothing changed).
 */
int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass)
{
struct pci_bus *child;
int is_cardbus = (dev->hdr_type == PCI_HEADER_TYPE_CARDBUS);
u32 buses, i, j = 0;
u16 bctl;
u8 primary, secondary, subordinate;
int broken = 0;
pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses);
primary = buses & 0xFF;
secondary = (buses >> 8) & 0xFF;
subordinate = (buses >> 16) & 0xFF;
dev_dbg(&dev->dev, "scanning [bus %02x-%02x] behind bridge, pass %d\n",
secondary, subordinate, pass);
if (!primary && (primary != bus->number) && secondary && subordinate) {
dev_warn(&dev->dev, "Primary bus is hard wired to 0\n");
primary = bus->number;
}
/* Check if setup is sensible at all */
if (!pass &&
(primary != bus->number || secondary <= bus->number ||
secondary > subordinate)) {
dev_info(&dev->dev, "bridge configuration invalid ([bus %02x-%02x]), reconfiguring\n",
secondary, subordinate);
broken = 1;
}
/* Disable MasterAbortMode during probing to avoid reporting
of bus errors (in some architectures) */
pci_read_config_word(dev, PCI_BRIDGE_CONTROL, &bctl);
pci_write_config_word(dev, PCI_BRIDGE_CONTROL,
bctl & ~PCI_BRIDGE_CTL_MASTER_ABORT);
pci_enable_crs(dev);
if ((secondary || subordinate) && !pcibios_assign_all_busses() &&
!is_cardbus && !broken) {
unsigned int cmax;
/*
* Bus already configured by firmware, process it in the first
* pass and just note the configuration.
*/
if (pass)
goto out;
/*
* The bus might already exist for two reasons: Either we are
* rescanning the bus or the bus is reachable through more than
* one bridge. The second case can happen with the i450NX
* chipset.
*/
child = pci_find_bus(pci_domain_nr(bus), secondary);
if (!child) {
child = pci_add_new_bus(bus, dev, secondary); /* create the child bus */
if (!child)
goto out;
child->primary = primary;
pci_bus_insert_busn_res(child, secondary, subordinate); /* NOTE(review): appears to record [secondary, subordinate] as the child's bus-number range - confirm */
child->bridge_ctl = bctl;
}
cmax = pci_scan_child_bus(child); /* recurse: scan the devices on the child bus */
if (cmax > subordinate)
dev_warn(&dev->dev, "bridge has subordinate %02x but max busn %02x\n",
subordinate, cmax);
/* subordinate should equal child->busn_res.end */
if (subordinate > max)
max = subordinate;
} else {
/*
* We need to assign a number to this bus which we always
* do in the second pass.
*/
if (!pass) {
if (pcibios_assign_all_busses() || broken || is_cardbus)
/* Temporarily disable forwarding of the
configuration cycles on all bridges in
this bus segment to avoid possible
conflicts in the second pass between two
bridges programmed with overlapping
bus ranges. */
pci_write_config_dword(dev, PCI_PRIMARY_BUS,
buses & ~0xffffff);
goto out;
}
/* Clear errors */
pci_write_config_word(dev, PCI_STATUS, 0xffff);
/* Prevent assigning a bus number that already exists.
* This can happen when a bridge is hot-plugged, so in
* this case we only re-scan this bus. */
child = pci_find_bus(pci_domain_nr(bus), max+1);
if (!child) {
child = pci_add_new_bus(bus, dev, max+1);
if (!child)
goto out;
pci_bus_insert_busn_res(child, max+1, 0xff);
}
max++;
buses = (buses & 0xff000000)
| ((unsigned int)(child->primary) << 0)
| ((unsigned int)(child->busn_res.start) << 8)
| ((unsigned int)(child->busn_res.end) << 16);
/*
* yenta.c forces a secondary latency timer of 176.
* Copy that behaviour here.
*/
if (is_cardbus) {
buses &= ~0xff000000;
buses |= CARDBUS_LATENCY_TIMER << 24;
}
/*
* We need to blast all three values with a single write.
*/
pci_write_config_dword(dev, PCI_PRIMARY_BUS, buses);
if (!is_cardbus) {
child->bridge_ctl = bctl;
max = pci_scan_child_bus(child);
} else {
/*
* For CardBus bridges, we leave 4 bus numbers
* as cards with a PCI-to-PCI bridge can be
* inserted later.
*/
for (i = 0; i < CARDBUS_RESERVE_BUSNR; i++) {
struct pci_bus *parent = bus;
if (pci_find_bus(pci_domain_nr(bus),
max+i+1))
break;
while (parent->parent) {
if ((!pcibios_assign_all_busses()) &&
(parent->busn_res.end > max) &&
(parent->busn_res.end <= max+i)) {
j = 1;
}
parent = parent->parent;
}
if (j) {
/*
* Often, there are two cardbus bridges
* -- try to leave one valid bus number
* for each one.
*/
i /= 2;
break;
}
}
max += i;
}
/*
* Set the subordinate bus number to its real value.
*/
pci_bus_update_busn_res_end(child, max);
pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, max);
}
sprintf(child->name,
(is_cardbus ? "PCI CardBus %04x:%02x" : "PCI Bus %04x:%02x"),
pci_domain_nr(bus), child->number);
/* Has only triggered on CardBus, fixup is in yenta_socket */
while (bus->parent) {
if ((child->busn_res.end > bus->busn_res.end) ||
(child->number > bus->busn_res.end) ||
(child->number < bus->number) ||
(child->busn_res.end < bus->number)) {
dev_info(&child->dev, "%pR %s hidden behind%s bridge %s %pR\n",
&child->busn_res,
(bus->number > child->busn_res.end &&
bus->busn_res.end < child->number) ?
"wholly" : "partially",
bus->self->transparent ? " transparent" : "",
dev_name(&bus->dev),
&bus->busn_res);
}
bus = bus->parent;
}
out:
pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
return max;
}
至此,PCI根总线和设备树已经建立好。剩下的事情就是注册每个总线下挂的设备。
/*
 * Add every not-yet-added device on @bus, then recurse into the bus
 * behind each device that bridges to one.
 */
void pci_bus_add_devices(const struct pci_bus *bus)
{
	struct pci_dev *pdev;
	struct pci_bus *below;

	/* First sweep: add the devices that are not marked as added yet. */
	list_for_each_entry(pdev, &bus->devices, bus_list) {
		if (!pdev->is_added)
			pci_bus_add_device(pdev);
	}

	/* Second sweep: every device must be added by now; descend through bridges. */
	list_for_each_entry(pdev, &bus->devices, bus_list) {
		BUG_ON(!pdev->is_added);
		below = pdev->subordinate;
		if (below != NULL)
			pci_bus_add_devices(below);
	}
}
/*
 * Finish adding a single PCI device: run the final fixups, create its
 * sysfs and /proc entries, allow driver matching, and call
 * device_attach() on the embedded struct device.  A negative result
 * from device_attach() only triggers a warning; the device is marked as
 * added in every case.
 */
void pci_bus_add_device(struct pci_dev *dev)
{
	int err;

	/*
	 * The final fixups cannot live in pci_device_add: some devices
	 * do not have their resources assigned yet at that point.
	 */
	pci_fixup_device(pci_fixup_final, dev);

	pci_create_sysfs_dev_files(dev);
	pci_proc_attach_device(dev);

	dev->match_driver = true;

	/* NOTE(review): device_attach() presumably tries to bind a matching driver - confirm */
	err = device_attach(&dev->dev);
	WARN_ON(err < 0);

	dev->is_added = 1;
}
(2)PCI驱动的注册 pci_register_driver
/*
 * Convenience wrapper around __pci_register_driver() that supplies
 * THIS_MODULE as the owner and KBUILD_MODNAME as the module name.
 */
#define pci_register_driver(driver) \
__pci_register_driver(driver, THIS_MODULE, KBUILD_MODNAME)
int __pci_register_driver(struct pci_driver *drv, struct module *owner,
const char *mod_name)
{
/* initialize common driver fields */
drv->driver.name = drv->name;
drv->driver.bus = &pci_bus_type;
drv->driver.owner = owner;
drv->driver.mod_name = mod_name; // 初始化 struct device_driver结构,pci_driver基于设备模型架构
spin_lock_init(&drv->dynids.lock);
INIT_LIST_HEAD(&drv->dynids.list);
/* register with core */
return driver_register(&drv->driver); // 注册 struct pci_driver内嵌的 device_driver到设备模型架构中
}
由此,可以看到,pci_driver内嵌一个device_driver,基于设备模型架构。注册一个pci_driver,其实就是将内嵌的device_driver注册进设备模型中。这部分遵循设备模型的总线/设备/驱动架构,搜索这个pci_bus中下挂的pci_dev,调用pci_bus的match()方法匹配设备和驱动的device_id,如果匹配,再调用pci_driver的probe()方法(该方法由用户设计,通常用来从pci_dev中获取设备io,irq等资源,初始化设备特定的结构)。最后将该驱动和设备关联,并将驱动加入pci_bus总线中。
总结:系统启动时,bios会构建好PCI总线和设备信息。PCI模块在初始化的过程中,首先根据bios构建好的信息,创建好根总线及下挂的子总线,并初始化,注册总线下的所有设备。至此,PCI总线和设备信息已经构建完毕。当注册pci设备驱动时,遵循linux设备模型架构,搜索该总线下所有的设备,匹配,探测并关联相应的设备。最终,基于PCI的特定设备通过pci驱动获取到 io memory,irq等资源。这就是Linux PCI总线的核心内容。