目录
1.2.1 Resource allocation in UEFI
1.3 PCI bus scan flow in Linux
1.4.1 struct acpi_pci_root_info {}
1.4.3 struct pci_host_bridge {}
1.4.6 Resource Array[] - DEVICE_COUNT_RESOURCE
2. PCI resource sizing and assignment
2.1 pci_acpi_root_prepare_resources()
2.1.1 __acpi_dev_get_resources()
2.2.1 pci_register_host_bridge()
2.3 PCI device scan and setup - pci_setup_device()
2.3.1.2 pcibios_bus_to_resource()
2.5 PCI bridge sizing - __pci_bus_size_bridges()
2.6 PCI resource assignment - __pci_bus_assign_resources()
2.6.2 __pci_assign_resource() examples
2.6.2.1 __pci_assign_resource() for RootBus
2.6.2.2 __pci_assign_resource() for normal device
2.6.2.3 __pci_assign_resource() for bridges
4.1 Hotplug size 默认预留size 和不预留
4.2 Hotplug size 默认预留size 和不预留, 然后热插入
1. PCI Bus Scan Overview
现在大部分 PCI bus 上挂载的都是 PCIe 设备, 只有极少数 server 还有 PCIe-to-PCI bridge 下挂 PCI 设备(e.g. VGA), 所以文章里面 PCI 也指 PCIe, 除非特殊情况, 不做特别区分。
PCI bus scan的flow以及设备注册, 已经详细介绍过:Linux Topics (4) - PCI Bus Scan from RC-CSDN博客
本章主要 deep dive, Linux 下 PCI bus enumerate 过程中的资源(mmio etc)收集和分配,
以及对比UEFI 下PCI bus enumerate 的差异, 带着以下的问题整理下Linux PCI 资源分配:
1. Linux 的 mmio 资源需不需要 重新收集和分配?
2. 如果重新分配, Linux 如何获取某个 RC的资源区间(开始,结束,大小)?
3. Linux 的mmio 资源分配和 UEFI的资源分配的关系?
4. hotplug 的支持与否对资源分配的影响?
1.1 PCI config space
Type0 和Type1 配置空间的寄存器以及图还是贴一下,便于参考
BAR Attributes:
1.2 PCI bus scan UEFI
1.2.1 Resource allocation in UEFI
UEFI 的代码不是本文的重点, 这里就贴一下 UEFI 阶段的PCIE bus enumeration 后, 资源的分配(主要针对举例的RC1), 这样便于跟后面OS阶段的 resource allocation 作对比:
使用的UEFI 代码是深度优先递归扫描的, 所以资源分配的时候:
1. 某bus下游设备,设备号大的会被先分配
2. 某个设备来讲,BAR 号大的先被分配
1.3 PCI bus scan flow in Linux
acpi_pci_root_add()
pci_acpi_scan_root()
root_ops->release_info = pci_acpi_generic_release_info;
root_ops->prepare_resources = pci_acpi_root_prepare_resources;
acpi_pci_root_create()
|-->ops->prepare_resources(info);
| |-->pci_acpi_root_prepare_resources(); //Get resource from ACPI(_CRS..)
|--> pci_acpi_root_add_resources(info);
|-->pci_create_root_bus() //注册host brg, 以及root pci bus!
| |-->pci_register_host_bridge()
|-->pci_scan_child_bus(bus);
| pci_scan_child_bus_extend(bus, 0);
| pci_scan_slot(bus, devfn); //递归扫描,广度优先
| pci_scan_single_device()
| |-->pci_scan_device(bus, devfn); //扫描slot,并创建pci dev
| | |-->pci_setup_device() //记录设备的BAR 资源
| | |-->pci_read_bases()
| |-->pci_device_add(dev, bus);//addPCIe类型设备(包括桥)
| pci_scan_bridge_extend (bus, dev, max, ); //Scan buses
| pci_add_new_bus(..)
| child=pci_alloc_child_bus(parent, dev, ..) //Create child bus for bridge
| child->dev.class = &pcibus_class;
| dev_set_name(&child->dev, "%04x:%02x", ...
| child->self = bridge; //Bus itself means its bridge
| child->bridge = get_device(&bridge->dev);
| for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
| child->resource[i] = &bridge->resource[...
| child->resource[i]->name = child->name;
| }
| device_register(&child->dev);
|-->pci_assign_unassigned_root_bus_resources()
| |--> __pci_bus_size_bridges()
| |--> pci_bridge_check_ranges()
| |-->pbus_size_io()
| |-->pbus_size_mem()
| |-->__pci_bus_assign_resources()
|-->pbus_assign_resources_sorted()
| __dev_sort_resources()
| pdev_sort_resources()
| __assign_resources_sorted()
| assign_requested_resources_sorted()
| pci_assign_resource()
| |--> __pci_assign_resource()
| |-->pci_bus_alloc_resource()
| pci_bus_alloc_from_region()
| pci_clip_resource_to_region()
| |-->pcibios_bus_to_resource()
| |--> pci_update_resource()
| |-->pci_std_update_resource() //更新设备的BAR
|--> pci_setup_bridge(); //将bridge 资源设定写回Config window
__pci_setup_bridge()
|-->pci_setup_bridge_io()
|-->pci_setup_bridge_mmio()
|-->pci_setup_bridge_mmio_pref()
1.4 Structure Definitions
1.4.1 struct acpi_pci_root_info {}
ACPI 初始化时用来获取 RC的 _CRS
struct acpi_pci_root_info {
struct acpi_pci_root *root;
struct acpi_device *bridge;
struct acpi_pci_root_ops *ops;
struct list_head resources;
char name[16];
};
1.4.2 struct acpi_pci_root {}
struct acpi_pci_root {
struct acpi_device * device;
struct pci_bus *bus;
u16 segment;
int bridge_type;
struct resource secondary; /* downstream bus range */
u32 osc_support_set; /* _OSC state of support bits */
u32 osc_control_set; /* _OSC state of control bits */
u32 osc_ext_support_set; /* _OSC state of extended support bits */
u32 osc_ext_control_set; /* _OSC state of extended control bits */
phys_addr_t mcfg_addr;
};
1.4.3 struct pci_host_bridge {}
struct pci_host_bridge {
struct device dev;
struct pci_bus *bus; /* Root bus */
struct pci_ops *ops;
struct pci_ops *child_ops;
void *sysdata;
int busnr;
int domain_nr;
struct list_head windows; /* resource_entry */
...
unsigned int ignore_reset_delay:1; /* For entire hierarchy */
unsigned int no_ext_tags:1; /* No Extended Tags */
unsigned int no_inc_mrrs:1; /* No Increase MRRS */
unsigned int native_aer:1; /* OS may use PCIe AER */
unsigned int native_pcie_hotplug:1; /* OS may use PCIe hotplug */
unsigned int native_shpc_hotplug:1; /* OS may use SHPC hotplug */
unsigned int native_pme:1; /* OS may use PCIe PME */
unsigned int native_ltr:1; /* OS may use PCIe LTR */
unsigned int native_dpc:1; /* OS may use PCIe DPC */
unsigned int native_cxl_error:1; /* OS may use CXL RAS/Events */
unsigned int preserve_config:1; /* Preserve FW resource setup */
unsigned int size_windows:1; /* Enable root bus sizing */
unsigned int msi_domain:1; /* Bridge wants MSI domain */
}
1.4.4 struct pci_bus {}
struct pci_bus {
struct list_head node; /* Node in list of buses */
struct pci_bus *parent; /* Parent bus this bridge is on */
struct list_head children; /* List of child buses */
struct list_head devices; /* List of devices on this bus */
struct pci_dev *self; /* Bridge device as seen by parent */
struct list_head slots; /* List of slots on this bus; protected by pci_slot_mutex */
struct resource *resource[PCI_BRIDGE_RESOURCE_NUM];
struct list_head resources; /* Address space routed to this bus */
…
struct resource busn_res; /* Bus numbers routed to this bus */
struct pci_ops *ops; /* Configuration access functions */
void *sysdata; /* Hook for sys-specific extension */
struct proc_dir_entry *procdir; /* Directory entry in /proc/bus/pci */
unsigned char number; /* Bus number */
unsigned char primary; /* Number of primary bridge */
unsigned char max_bus_speed; /* enum pci_bus_speed */
unsigned char cur_bus_speed; /* enum pci_bus_speed */
#ifdef CONFIG_PCI_DOMAINS_GENERIC
int domain_nr;
#endif
char name[48];
unsigned short bridge_ctl; /* Manage NO_ISA/FBB/et al behaviors */
pci_bus_flags_t bus_flags; /* Inherited by child buses */
struct device *bridge;
struct device dev;
struct bin_attribute *legacy_io; /* Legacy I/O for this bus */
struct bin_attribute *legacy_mem; /* Legacy mem */
unsigned int is_added:1;
unsigned int unsafe_warn:1; /* warned about RW1C config write */
};
示例:
1.4.5 struct pci_dev {}
/* The pci_dev structure describes PCI devices */
struct pci_dev {
struct list_head bus_list; /* Node in per-bus list */
struct pci_bus *bus; /* Bus this device is on */
struct pci_bus *subordinate; /* Bus this device bridges to */
void *sysdata; /* Hook for sys-specific extension */
struct proc_dir_entry *procent; /* Device entry in /proc/bus/pci */
struct pci_slot *slot; /* Physical slot this device is in */
unsigned int devfn; /* Encoded device & function index */
unsigned short vendor;
unsigned short device;
…
struct device dev; /* Generic device interface */
…
/* I/O and memory regions + expansion ROMs */
struct resource resource[DEVICE_COUNT_RESOURCE];
}
需要注意的是resource 的定义, 针对普通设备,和桥设备都有预定义,如下:
1.4.6 Resource Array[] - DEVICE_COUNT_RESOURCE
/* For PCI devices, the region numbers are assigned this way: */
enum {
/* #0-5: standard PCI resources */
PCI_STD_RESOURCES,
PCI_STD_RESOURCE_END = PCI_STD_RESOURCES + PCI_STD_NUM_BARS - 1,
/* #6: expansion ROM resource */
PCI_ROM_RESOURCE,
/* Device-specific resources */
#ifdef CONFIG_PCI_IOV
PCI_IOV_RESOURCES, // PCI_SRIOV_NUM_BARS 6
PCI_IOV_RESOURCE_END = PCI_IOV_RESOURCES + PCI_SRIOV_NUM_BARS - 1,
#endif
/* PCI-to-PCI (P2P) bridge windows */
#define PCI_BRIDGE_IO_WINDOW (PCI_BRIDGE_RESOURCES + 0)
#define PCI_BRIDGE_MEM_WINDOW (PCI_BRIDGE_RESOURCES + 1)
#define PCI_BRIDGE_PREF_MEM_WINDOW (PCI_BRIDGE_RESOURCES + 2)
/* CardBus bridge windows */
#define PCI_CB_BRIDGE_IO_0_WINDOW (PCI_BRIDGE_RESOURCES + 0)
#define PCI_CB_BRIDGE_IO_1_WINDOW (PCI_BRIDGE_RESOURCES + 1)
#define PCI_CB_BRIDGE_MEM_0_WINDOW (PCI_BRIDGE_RESOURCES + 2)
#define PCI_CB_BRIDGE_MEM_1_WINDOW (PCI_BRIDGE_RESOURCES + 3)
/* Total number of bridge resources for P2P and CardBus */
#define PCI_BRIDGE_RESOURCE_NUM 4
/* Resources assigned to buses behind the bridge */
PCI_BRIDGE_RESOURCES,
PCI_BRIDGE_RESOURCE_END = PCI_BRIDGE_RESOURCES +
PCI_BRIDGE_RESOURCE_NUM - 1,
/* Total resources associated with a PCI device */
PCI_NUM_RESOURCES,
/* Preserve this for compatibility */
DEVICE_COUNT_RESOURCE = PCI_NUM_RESOURCES,
};
示例:
[root@localhost linux]# cat /sys/devices/pci0001\:00/0001\:00\:01.0/resource
0x0000000000000000 0x0000000000000000 0x0000000000000000 //BAR 0
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 0x0000000000000000 //BAR 6: ROM
0x0000000000000000 0x0000000000000000 0x0000000000000000 //BAR 7: IOV0
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 0x0000000000000000
0x0000000000000000 0x0000000000000000 0x0000000000000000 //BAR 12:IOV5
0x0000000000000000 0x0000000000000000 0x0000000000000000 //BAR 13: IO window
0x0000740004000000 0x00007400041fffff 0x0000000000000200 //BAR 14: Mem window
0x0000740100000000 0x00007401001fffff 0x0000000000102201 //BAR 15: 64bits PMem Wind
0x0000000000000000 0x0000000000000000 0x0000000000000000
1.4.7 Resource Attributes
#define IORESOURCE_TYPE_BITS 0x00001f00 /* Resource type */
#define IORESOURCE_IO 0x00000100 /* PCI/ISA I/O ports */
#define IORESOURCE_MEM 0x00000200
#define IORESOURCE_REG 0x00000300 /* Register offsets */
#define IORESOURCE_IRQ 0x00000400
#define IORESOURCE_DMA 0x00000800
#define IORESOURCE_BUS 0x00001000
#define IORESOURCE_PREFETCH 0x00002000 /* No side effects */
#define IORESOURCE_READONLY 0x00004000
#define IORESOURCE_CACHEABLE 0x00008000
#define IORESOURCE_RANGELENGTH 0x00010000
#define IORESOURCE_SHADOWABLE 0x00020000
#define IORESOURCE_SIZEALIGN 0x00040000 /* size indicates alignment */
#define IORESOURCE_STARTALIGN 0x00080000 /* start field is alignment */
#define IORESOURCE_MEM_64 0x00100000
#define IORESOURCE_WINDOW 0x00200000 /* forwarded by bridge */
#define IORESOURCE_MUXED 0x00400000 /* Resource is software muxed */
#define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */
#define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */
/* IORESOURCE_SYSRAM specific bits. */
#define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */
#define IORESOURCE_SYSRAM_MERGEABLE 0x04000000 /* Resource can be merged. */
#define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */
#define IORESOURCE_DISABLED 0x10000000
#define IORESOURCE_UNSET 0x20000000 /* No address assigned yet */
#define IORESOURCE_AUTO 0x40000000
#define IORESOURCE_BUSY 0x80000000 /* Driver has marked this resource busy */
2. PCI resource sizing and assignment
Summary:
1. pci_acpi_root_prepare_resources():获取 PCI RC 在ACPI table中定义的PCI dev _CRS资源
然后将资源传递给host bridge window, 或者说是Root bus resources.
2. pci_create_root_bus() and pci_register_host_bridge() : 创建Root bus 和host bridge, 对于每个RC 都是唯一的
3. pci_setup_device(): 该函数用来初始化 PCI 设备(包括bridge 和endpoint device), 重要的就是 pci_read_bases() , 会去读PCI BAR 来收集当前设备的资源信息(也就是UEFI分配的地址)
4. __pci_bus_size_bridges(): 针对桥的资源进行整合,
a. 重要点是是否支持热插拔, 如果是的话, 要对桥的资源进行预留,
b. 下游是否有设备, 如果有, 对应的窗口要把下游设备需要的空间 size 计算在内
5. __pci_bus_assign_resources(): 这是一个递归过程,从Root bus 往下游重复
a. pbus_assign_resources_sorted(): 给当前bus的设备分配资源
• pci_std_update_resource() : 普通设备调用, 写BARx 寄存器
b. pci_setup_bridge(): 当bridge 下游设备资源都更新完后,配置bridge 窗口
给桥window配置 io/mem/pref_mem, 写config window 寄存器
• pci_setup_bridge_io()
• pci_setup_bridge_mmio()
• pci_setup_bridge_mmio_pref()
Example RC1:
后面结合实际的资源分配就是以下面的 Topology 为参考 ,
ACPI RC device 资源上报:
2.1 pci_acpi_root_prepare_resources()
acpi_pci_root_add()
pci_acpi_scan_root()
root_ops->release_info = pci_acpi_generic_release_info;
root_ops->prepare_resources = pci_acpi_root_prepare_resources;
acpi_pci_root_create()
|-->ops->prepare_resources(info);
| |-->pci_acpi_root_prepare_resources; //Get resource from ACPI(_CRS..)
//获取ACPI PCI RC device 上报的资源(_CRS)
pci_acpi_root_prepare_resources()
acpi_pci_probe_root_resources()
acpi_dev_get_resources()
__acpi_dev_get_resources(adev, list, .., METHOD_NAME__CRS);
当下面的函数执行完毕,ci->resources 链表上面挂载的就是 ACPI PCI device _CRS method 传过来的资源(_MIN, _MAX, _LEN, _TRA).
可以加上debug log 打印, 这样就一目了然,
static int pci_acpi_root_prepare_resources(struct acpi_pci_root_info *ci)
{
struct resource_entry *entry, *tmp;
int status;
status = acpi_pci_probe_root_resources(ci);
resource_list_for_each_entry_safe(entry, tmp, &ci->resources) {
if (!(entry->res->flags & IORESOURCE_WINDOW))
resource_list_destroy_entry(entry);
// pr_info(" entry->res->start=0x%llx, entry->res->end=0x%llx, offset=0x%llx\n",entry->res->start,entry->res->end, entry->offset);
}
return status;
}
例子:
PCI host bridge to bus 0001:00
pci_bus 0001:00: root bus resource [mem 0x740004000000-0x740013ffffff window] (bus address [0x04000000-0x13ffffff])
pci_bus 0001:00: root bus resource [mem 0x740100000000-0x77ffdfffffff window]
pci_bus 0001:00: root bus resource [bus 00-ff]
这里的两项资源其实是ACPI PCI Device RC1 _CRS 传过来的。
2.1.1 __acpi_dev_get_resources()
比如: __acpi_dev_get_resources(adev, list, .., METHOD_NAME__CRS);
TBD
2.2 pci_create_root_bus()
struct pci_bus *pci_create_root_bus(struct device *parent, int bus,
struct pci_ops *ops, void *sysdata, struct list_head *resources)
{
int error;
struct pci_host_bridge *bridge;
bridge = pci_alloc_host_bridge(0);
if (!bridge) return NULL;
bridge->dev.parent = parent;
list_splice_init(resources, &bridge->windows); // info->resources
bridge->sysdata = sysdata;
bridge->busnr = bus;
bridge->ops = ops;
error = pci_register_host_bridge(bridge);
return bridge->bus;
err_out:
put_device(&bridge->dev);
return NULL;
}
2.2.1 pci_register_host_bridge()
static int pci_register_host_bridge(struct pci_host_bridge *bridge)
{
struct device *parent = bridge->dev.parent;
struct resource_entry *window, *next, *n;
struct pci_bus *bus, *b;
resource_size_t offset, next_offset;
LIST_HEAD(resources);
struct resource *res, *next_res;
char addr[64], *fmt;
const char *name;
int err;
bus = pci_alloc_bus(NULL);
if (!bus)
return -ENOMEM;
bridge->bus = bus;
bus->sysdata = bridge->sysdata;
bus->ops = bridge->ops;
bus->number = bus->busn_res.start = bridge->busnr;
#ifdef CONFIG_PCI_DOMAINS_GENERIC
if (bridge->domain_nr == PCI_DOMAIN_NR_NOT_SET)
bus->domain_nr = pci_bus_find_domain_nr(bus, parent);
else
bus->domain_nr = bridge->domain_nr;
if (bus->domain_nr < 0) {
err = bus->domain_nr;
goto free;
}
#endif
b = pci_find_bus(pci_domain_nr(bus), bridge->busnr);
if (b) {
/* Ignore it if we already got here via a different bridge */
dev_dbg(&b->dev, "bus already known\n");
err = -EEXIST;
goto free;
}
//host bridge
dev_set_name(&bridge->dev, "pci%04x:%02x", pci_domain_nr(bus), bridge->busnr);
err = pcibios_root_bridge_prepare(bridge);
/* Temporarily move resources off the list */
list_splice_init(&bridge->windows, &resources);
err = device_add(&bridge->dev);
if (err) {
put_device(&bridge->dev);
goto free;
}
bus->bridge = get_device(&bridge->dev);
device_enable_async_suspend(bus->bridge);
pci_set_bus_of_node(bus);
pci_set_bus_msi_domain(bus);
if (bridge->msi_domain && !dev_get_msi_domain(&bus->dev) &&
!pci_host_of_has_msi_map(parent))
bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI;
if (!parent)
set_dev_node(bus->bridge, pcibus_to_node(bus));
bus->dev.class = &pcibus_class;
bus->dev.parent = bus->bridge;
dev_set_name(&bus->dev, "%04x:%02x", pci_domain_nr(bus), bus->number);
name = dev_name(&bus->dev);
err = device_register(&bus->dev); // /sys/class/pci_bus/0007\:00/
if (err)
goto unregister;
pcibios_add_bus(bus);
if (bus->ops->add_bus) {
err = bus->ops->add_bus(bus);
if (WARN_ON(err < 0))
dev_err(&bus->dev, "failed to add bus: %d\n", err);
}
/* Create legacy_io and legacy_mem files for this bus */
pci_create_legacy_files(bus);
if (parent) dev_info(parent, "PCI host bridge to bus %s\n", name);
else pr_info("PCI host bridge to bus %s\n", name);
if (nr_node_ids > 1 && pcibus_to_node(bus) == NUMA_NO_NODE)
dev_warn(&bus->dev, "Unknown NUMA node; performance will be reduced\n");
/* Coalesce contiguous windows */
resource_list_for_each_entry_safe(window, n, &resources) {
if (list_is_last(&window->node, &resources))
break;
next = list_next_entry(window, node);
offset = window->offset;
res = window->res;
next_offset = next->offset;
next_res = next->res;
if (res->flags != next_res->flags || offset != next_offset)
continue;
if (res->end + 1 == next_res->start) {
next_res->start = res->start;
res->flags = res->start = res->end = 0;
}
}
/* Add initial resources to the bus */
resource_list_for_each_entry_safe(window, n, &resources) {
offset = window->offset;
res = window->res;
if (!res->flags && !res->start && !res->end)
continue;
list_move_tail(&window->node, &bridge->windows);
if (res->flags & IORESOURCE_BUS)
pci_bus_insert_busn_res(bus, bus->number, res->end);
else
pci_bus_add_resource(bus, res, 0); //这里是Root Bus 指向Bridge的Res[]
if (offset) {
if (resource_type(res) == IORESOURCE_IO)
fmt = " (bus address [%#06llx-%#06llx])";
else
fmt = " (bus address [%#010llx-%#010llx])";
snprintf(addr, sizeof(addr), fmt, (unsigned long long)(res->start - offset),
(unsigned long long)(res->end - offset));
} else
addr[0] = '\0';
dev_info(&bus->dev, "root bus resource %pR%s\n", res, addr);
}
down_write(&pci_bus_sem);
list_add_tail(&bus->node, &pci_root_buses);
up_write(&pci_bus_sem);
return 0;
...
return err;
}
示例:
2.2.2 Example RC1 的扫描示意图
Linux log:
2.3 PCI device scan and setup - pci_setup_device()
Root bus 创建完成后,就要从Root bus 进行递归扫描:
1. 扫描Root bus的设备, 创建设备, 挂载到对应的链表, 同时通过 pci_setup_device() 记录设备的 BAR 资源
2. 如果bridge 设备,需要创建child bus and 递归扫描(后面)
int pci_setup_device(struct pci_dev *dev)
{
u32 class;
u16 cmd;
u8 hdr_type;
int err, pos = 0;
struct pci_bus_region region;
struct resource *res;
hdr_type = pci_hdr_type(dev);
dev->sysdata = dev->bus->sysdata;
dev->dev.parent = dev->bus->bridge;
dev->dev.bus = &pci_bus_type;
dev->hdr_type = hdr_type & 0x7f;
dev->multifunction = !!(hdr_type & 0x80);
dev->error_state = pci_channel_io_normal;
set_pcie_port_type(dev);
err = pci_set_of_node(dev);
pci_set_acpi_fwnode(dev);
pci_dev_assign_slot(dev);
/** Assume 32-bit PCI; let 64-bit PCI cards (which are far rarer)
* set this higher, assuming the system even supports it*/
dev->dma_mask = 0xffffffff;
dev_set_name(&dev->dev, "%04x:%02x:%02x.%d", pci_domain_nr(dev->bus),
dev->bus->number, PCI_SLOT(dev->devfn), PCI_FUNC(dev->devfn));
class = pci_class(dev);
dev->revision = class & 0xff;
dev->class = class >> 8; /* upper 3 bytes */
if (pci_early_dump)
early_dump_pci_device(dev);
...
/* "Unknown power state" */
dev->current_state = PCI_UNKNOWN;
/* Early fixups, before probing the BARs */
pci_fixup_device(pci_fixup_early, dev);
pci_set_removable(dev);
pci_info(dev, "[%04x:%04x] type %02x class %#08x\n",
dev->vendor, dev->device, dev->hdr_type, dev->class);
/* Device class may be changed after fixup */
class = dev->class >> 8;
…
switch (dev->hdr_type) { /* header type */
case PCI_HEADER_TYPE_NORMAL: /* standard header */
if (class == PCI_CLASS_BRIDGE_PCI)
goto bad;
pci_read_irq(dev);
pci_read_bases(dev, 6, PCI_ROM_ADDRESS);
pci_subsystem_ids(dev, &dev->subsystem_vendor, &dev->subsystem_device);
}
break;
case PCI_HEADER_TYPE_BRIDGE: /* bridge header */
/* The PCI-to-PCI bridge spec requires that subtractive
* decoding (i.e. transparent) bridge must have programming interface code of 0x01 */
pci_read_irq(dev);
dev->transparent = ((dev->class & 0xff) == 1);
pci_read_bases(dev, 2, PCI_ROM_ADDRESS1);
pci_read_bridge_windows(dev);
set_pcie_hotplug_bridge(dev);
pos = pci_find_capability(dev, PCI_CAP_ID_SSVID);
if (pos) {
pci_read_config_word(dev, pos + PCI_SSVID_VENDOR_ID, &dev->subsystem_vendor);
pci_read_config_word(dev, pos + PCI_SSVID_DEVICE_ID, &dev->subsystem_device);
}
break;
case PCI_HEADER_TYPE_CARDBUS: /* CardBus bridge header */
if (class != PCI_CLASS_BRIDGE_CARDBUS)
goto bad;
pci_read_irq(dev);
pci_read_bases(dev, 1, 0);
pci_read_config_word(dev, PCI_CB_SUBSYSTEM_VENDOR_ID, &dev->subsystem_vendor);
pci_read_config_word(dev, PCI_CB_SUBSYSTEM_ID, &dev->subsystem_device);
break;
..
}
2.3.1 pci_read_bases()
普通设备和bridge都会调用该函数:
1. 普通设备: pci_read_bases(dev, 6, PCI_ROM_ADDRESS);
2. bridge: pci_read_bases(dev, 2, PCI_ROM_ADDRESS1)
static void pci_read_bases(struct pci_dev *dev, unsigned int howmany, int rom)
{
unsigned int pos, reg;
if (dev->non_compliant_bars) return;
/* Per PCIe r4.0, sec 9.3.4.1.11, the VF BARs are all RO Zero */
if (dev->is_virtfn) return;
for (pos = 0; pos < howmany; pos++) {
struct resource *res = &dev->resource[pos]; //设备的resource数组,存放BAR的信息
reg = PCI_BASE_ADDRESS_0 + (pos << 2);
pos += __pci_read_base(dev, pci_bar_unknown, res, reg);
}
if (rom) {
struct resource *res = &dev->resource[PCI_ROM_RESOURCE];
dev->rom_base_reg = rom;
res->flags = IORESOURCE_MEM | IORESOURCE_PREFETCH | //why is the opROM BAR marked as prefetchable?
IORESOURCE_READONLY | IORESOURCE_SIZEALIGN;
__pci_read_base(dev, pci_bar_mem32, res, rom);
}
}
2.3.1.1 __pci_read_base()
读取UEFI 阶段配置的BAR的 MMIO 地址空间,需要注意的是:
1. 如果host bridge resource 有 offset(也就是 _TRA, 基地址), 那么设备res 需要加上这个offset
2. 就算UEFI阶段配置的地址不落在ACPI 上报给OS的区间内(比如重新安排了地址空间), 当前的函数还是会记录UEFI 阶段配置的地址空间(这个只是假设,做实验用)
int __pci_read_base(struct pci_dev *dev, enum pci_bar_type type,
struct resource *res, unsigned int pos)
{
..
pci_read_config_dword(dev, pos, &l);
pci_write_config_dword(dev, pos, l | mask);
pci_read_config_dword(dev, pos, &sz);
pci_write_config_dword(dev, pos, l);
if (type == pci_bar_unknown) {
res->flags = decode_bar(dev, l);
res->flags |= IORESOURCE_SIZEALIGN;
if (res->flags & IORESOURCE_IO) {
l64 = l & PCI_BASE_ADDRESS_IO_MASK;
sz64 = sz & PCI_BASE_ADDRESS_IO_MASK;
mask64 = PCI_BASE_ADDRESS_IO_MASK & (u32)IO_SPACE_LIMIT;
} else {
l64 = l & PCI_BASE_ADDRESS_MEM_MASK;
sz64 = sz & PCI_BASE_ADDRESS_MEM_MASK;
mask64 = (u32)PCI_BASE_ADDRESS_MEM_MASK;
}
} else {
if (l & PCI_ROM_ADDRESS_ENABLE)
res->flags |= IORESOURCE_ROM_ENABLE;
l64 = l & PCI_ROM_ADDRESS_MASK;
sz64 = sz & PCI_ROM_ADDRESS_MASK;
mask64 = PCI_ROM_ADDRESS_MASK;
}
if (res->flags & IORESOURCE_MEM_64) {
pci_read_config_dword(dev, pos + 4, &l);
pci_write_config_dword(dev, pos + 4, ~0);
pci_read_config_dword(dev, pos + 4, &sz);
pci_write_config_dword(dev, pos + 4, l);
l64 |= ((u64)l << 32);
sz64 |= ((u64)sz << 32);
mask64 |= ((u64)~0 << 32);
}
region.start = l64;
region.end = l64 + sz64 - 1;
pcibios_bus_to_resource(dev->bus, res, &region); //*res = &dev->resource[pos];
…
fail:
res->flags = 0;
out:
if (res->flags)
pci_info(dev, "reg 0x%x: %pR\n", pos, res);
return (res->flags & IORESOURCE_MEM_64) ? 1 : 0;
}
2.3.1.2 pcibios_bus_to_resource()
这里对资源区间的检查, 需要注意的是 设备的资源是否需要加上bridge的 offset(基地址)。
=>
=>
void pcibios_bus_to_resource(struct pci_bus *bus, struct resource *res,
struct pci_bus_region *region)
{
struct pci_host_bridge *bridge = pci_find_host_bridge(bus);
struct resource_entry *window;
resource_size_t offset = 0;
resource_list_for_each_entry(window, &bridge->windows) {
struct pci_bus_region bus_region;
if (resource_type(res) != resource_type(window->res))
continue;
bus_region.start = window->res->start - window->offset;
bus_region.end = window->res->end - window->offset;
if (region_contains(&bus_region, region)) {
offset = window->offset;
break;
}
}
res->start = region->start + offset;
res->end = region->end + offset;
}
EXPORT_SYMBOL(pcibios_bus_to_resource);
2.3.2 Example RC1 的扫描示意图
Linux log:
2.4 pci_add_new_bus()
static struct pci_bus *pci_alloc_child_bus(struct pci_bus *parent,
struct pci_dev *bridge, int busnr)
{
struct pci_bus *child;
struct pci_host_bridge *host;
int i;
int ret;
/* Allocate a new bus and inherit stuff from the parent */
child = pci_alloc_bus(parent);
child->parent = parent;
child->sysdata = parent->sysdata;
child->bus_flags = parent->bus_flags;
host = pci_find_host_bridge(parent);
if (host->child_ops) child->ops = host->child_ops;
else child->ops = parent->ops;
/* Initialize some portions of the bus device, but don't register
* it now as the parent is not properly set up yet. */
child->dev.class = &pcibus_class;
dev_set_name(&child->dev, "%04x:%02x", pci_domain_nr(child), busnr);
/* Set up the primary, secondary and subordinate bus numbers */
child->number = child->busn_res.start = busnr;
child->primary = parent->busn_res.start;
child->busn_res.end = 0xff;
..
child->self = bridge;
child->bridge = get_device(&bridge->dev);
child->dev.parent = child->bridge;
/* Set up default resource pointers and names */
for (i = 0; i < PCI_BRIDGE_RESOURCE_NUM; i++) {
child->resource[i] = &bridge->resource[PCI_BRIDGE_RESOURCES+i]; //指向相同的资源!
child->resource[i]->name = child->name;
}
bridge->subordinate = child;
add_dev:
pci_set_bus_msi_domain(child);
ret = device_register(&child->dev);
WARN_ON(ret < 0);
pcibios_add_bus(child);
if (child->ops->add_bus) {
ret = child->ops->add_bus(child);
if (WARN_ON(ret < 0))
dev_err(&child->dev, "failed to add bus: %d\n", ret);
}
/* Create legacy_io and legacy_mem files for this bus */
pci_create_legacy_files(child);
return child;
}
2.4.1 Example RC1 的扫描示意图
2.5 PCI bridge sizing - __pci_bus_size_bridges()
主要是针对bridge下游设备资源的收集:
1. 正常的下游设备的资源保留
2. 假如支持热插拔, 那么针对每个bridge都需要保留设定的资源(IO/MMIO/PMMIO)
#define DEFAULT_HOTPLUG_IO_SIZE (256)
#define DEFAULT_HOTPLUG_MMIO_SIZE (2*1024*1024)
#define DEFAULT_HOTPLUG_MMIO_PREF_SIZE (2*1024*1024)
/* hpiosize=nn can override this */
unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE;
/*pci=hpmmiosize=nnM overrides non-prefetchable MMIO size,
* pci=hpmmioprefsize=nnM overrides prefetchable MMIO size;
* pci=hpmemsize=nnM overrides both*/
unsigned long pci_hotplug_mmio_size = DEFAULT_HOTPLUG_MMIO_SIZE;
unsigned long pci_hotplug_mmio_pref_size = DEFAULT_HOTPLUG_MMIO_PREF_SIZE;
2.5.1 Example RC1 的扫描示意图
2.6 PCI resource assignment - __pci_bus_assign_resources()
//1. 从Root bus 开始, 先给root bus 上面的设备分配资源
//2. 再对root bus 上面的bridge 下游pci_bus 上面挂载的设备进行分配资源
- 如果是普通设备, 分配后,同步写回config space的BAR - pci_update_resource()
- 如果是bridge 设备,先给bridge 下游bus上面的设备分配资源,然后递归下游设备的资源分配后
- 同样也要写回 config space window BAR - pci_setup_bridge()
void __pci_bus_assign_resources(const struct pci_bus *bus,
struct list_head *realloc_head,
struct list_head *fail_head)
{
struct pci_bus *b;
struct pci_dev *dev;
pbus_assign_resources_sorted(bus, realloc_head, fail_head);
list_for_each_entry(dev, &bus->devices, bus_list) {
pdev_assign_fixed_resources(dev);
b = dev->subordinate;
if (!b)
continue;
__pci_bus_assign_resources(b, realloc_head, fail_head);
switch (dev->hdr_type) {
case PCI_HEADER_TYPE_BRIDGE:
if (!pci_is_enabled(dev))
pci_setup_bridge(b);
break;
case PCI_HEADER_TYPE_CARDBUS:
pci_setup_cardbus(b);
break;
default:
pci_info(dev, "not setting up bridge for bus %04x:%02x\n",
pci_domain_nr(b), b->number);
break;
}
}
}
2.6.1 pci_assign_resource()
int pci_assign_resource(struct pci_dev *dev, int resno)
{
struct resource *res = dev->resource + resno;
resource_size_t align, size;
int ret;
if (res->flags & IORESOURCE_PCI_FIXED)
return 0;
res->flags |= IORESOURCE_UNSET;
align = pci_resource_alignment(dev, res);
if (!align) {
pci_info(dev, "BAR %d: can't assign %pR (bogus alignment)\n",
resno, res);
return -EINVAL;
}
size = resource_size(res);
ret = _pci_assign_resource(dev, resno, size, align);
…
res->flags &= ~IORESOURCE_UNSET;
res->flags &= ~IORESOURCE_STARTALIGN;
pci_info(dev, "BAR %d: assigned %pR\n", resno, res);
if (resno < PCI_BRIDGE_RESOURCES)
pci_update_resource(dev, resno);
return 0;
}
2.6.2 __pci_assign_resource() examples
2.6.2.1 __pci_assign_resource() for RootBus
2.6.2.2 __pci_assign_resource() for normal device
2.6.2.3 __pci_assign_resource() for bridges
2.6.3 Example RC1 的扫描示意图
1. Allocating Res for first level bridges
2. Allocating for end normal devices
更新dev resource[]
写回device BARs
3. Allocating Res for bridges - pci_setup_bridge()
写回bridge window
3. 结论
1. Linux 的 mmio 资源需不需要 重新收集和分配?
A: 需要重新收集和分配的
2. 如果重新分配, Linux 如何获取某个 RC的资源区间(开始,结束,大小)?
A: 通过 ACPI 中 RC device 的 _CRS(_MIN, _MAX, _LEN, _TRA)获取, 具体参考 pci_acpi_root_prepare_resources(); 设备自身 BAR 的当前值则通过 pci_read_bases() 读取
3. Linux 的mmio 资源分配和 UEFI的资源分配的关系?
A: 独立的,逻辑类似
4. hotplug 的支持与否对资源分配的影响?
A: 默认会预留,可以通过 Boot Args 取消预留。
4. Backup
4.1 Hotplug size 默认预留size 和不预留
左边: Default
右边: --args="pci=hpmemsize=0 pci=hpiosize=0 pci=earlydump"
Summary: If the hotplug sizes are set to 0, resource assignment in Linux is similar to UEFI's, with no resources reserved for bridges
4.2 Hotplug size 默认预留size 和不预留, 然后热插入
左边: Default
右边: --args="pci=hpmemsize=0 pci=hpiosize=0 pci=earlydump"
分别插入SSD, 可以发现,预留资源的享受了预分配,没有预留资源的就要进行按需分配