Linux 下 pcie 初始化设备枚举流程代码分析
1、简介
以 rk3568 pcie 代码为例,简要介绍一下 pcie 初始化设备枚举的过程。其中比较重要的函数是 pci_scan_child_bus_extend 和 pci_scan_bridge_extend,这两个函数是递归的核心。简要函数调用流程如下:
2、pci_scan_child_bus_extend
- 该函数入参 bus 为 pci_bus 结构,表示一个总线资源。该函数首次被调用时,传递下来的是 root bus 结构
- 该函数作用为扫描并创建当前 bus (入参)上的所有 pci 设备(可能是 endpoint 设备,也可能是桥设备)
- 当前 bus 扫描结束后,调用 pci_scan_bridge_extend 尝试遍历下一级 bus
- 返回值为新的 subordinate number(通过不断递归,得到当前 bus 下最深一级的 bus 号)
/**
 * pci_scan_child_bus_extend() - Scan devices below a bus
 * @bus: Bus to scan for devices
 * @available_buses: Total number of buses available (%0 does not try to
 *		     extend beyond the minimal)
 *
 * Scans devices below @bus including subordinate buses. Returns new
 * subordinate number including all the found devices. Passing
 * @available_buses causes the remaining bus space to be distributed
 * equally between hotplug-capable bridges to allow future extension of the
 * hierarchy.
 *
 * Note: this is an abridged excerpt; "......" marks code omitted by the
 * walkthrough (local declarations, bridge counting, final bookkeeping).
 */
static unsigned int pci_scan_child_bus_extend(struct pci_bus *bus,
					      unsigned int available_buses)
{
	......

	/* Go find them, Rover! */
	/*
	 * Walk every slot on this bus; devfn advances by 8, i.e. one slot
	 * (eight functions) per iteration. Per this walkthrough, each device
	 * found gets a pci_dev that is linked onto the pci_bus.
	 */
	for (devfn = 0; devfn < 256; devfn += 8) {
		nr_devs = pci_scan_slot(bus, devfn);

		/*
		 * The Jailhouse hypervisor may pass individual functions of a
		 * multi-function device to a guest without passing function 0.
		 * Look for them as well.
		 */
		if (jailhouse_paravirt() && nr_devs == 0) {
			for (fn = 1; fn < 8; fn++) {
				/*
				 * Creates and initialises the PCI device
				 * (endpoint or bridge). Per this walkthrough
				 * that includes, but is not limited to, BAR
				 * setup, interrupt resources and enabling
				 * device capabilities.
				 */
				dev = pci_scan_single_device(bus, devfn + fn);
				if (dev)
					dev->multifunction = 1;
			}
		}
	}

	/* Reserve buses for SR-IOV capability */
	/* See the SR-IOV chapter of this series for the reserved bus numbers. */
	used_buses = pci_iov_bus_range(bus);
	max += used_buses;

	......

	/*
	 * Scan bridges that are already configured. We don't touch them
	 * unless they are misconfigured (which will be done in the second
	 * scan below).
	 */
	/* The loop visits only the bridges on this bus, never endpoints. */
	for_each_pci_bridge(dev, bus) {
		cmax = max;
		/*
		 * Pass 0: only note bridges that BIOS/bootloader has already
		 * configured; a compromise that keeps every architecture
		 * working.
		 */
		max = pci_scan_bridge_extend(bus, dev, max, 0, 0);

		/*
		 * Reserve one bus for each bridge now to avoid extending
		 * hotplug bridges too much during the second scan below.
		 */
		used_buses++;
		/*
		 * cmax is the value before the scan and max the value after
		 * it, so the number of extra buses consumed is max - cmax,
		 * the same form the second loop below uses. The operands
		 * were previously swapped (cmax - max).
		 */
		if (max - cmax > 1)
			used_buses += max - cmax - 1;
	}

	/* Scan bridges that need to be reconfigured */
	/* Again, only the bridges on this bus are visited, never endpoints. */
	for_each_pci_bridge(dev, bus) {
		unsigned int buses = 0;

		if (!hotplug_bridges && normal_bridges == 1) {
			/*
			 * There is only one bridge on the bus (upstream
			 * port) so it gets all available buses which it
			 * can then distribute to the possible hotplug
			 * bridges below.
			 */
			buses = available_buses;
		} else if (dev->is_hotplug_bridge) {
			/*
			 * Distribute the extra buses between hotplug
			 * bridges if any.
			 */
			buses = available_buses / hotplug_bridges;
			buses = min(buses, available_buses - used_buses + 1);
		}

		cmax = max;
		/*
		 * Pass 1: this is where the walk really recurses into the
		 * next-level bus. "buses" is the bus-number budget handed
		 * to this bridge, not a bus number itself.
		 */
		max = pci_scan_bridge_extend(bus, dev, cmax, buses, 1);
		/* One bus is already accounted so don't add it again */
		if (max - cmax > 1)
			used_buses += max - cmax - 1;
	}

	......
}
3、pci_scan_single_device
pci_scan_single_device 函数,是初始化 pcie 设备的重中之重。本篇其余部分讲的都是 pcie 设备枚举的过程,只有这个函数是配置函数。
/*
 * pci_scan_single_device() - look up or create the device at (@bus, @devfn).
 *
 * Returns the pci_dev already registered for that slot/function if there is
 * one; otherwise scans the function, adds the resulting device to @bus and
 * returns it. Returns NULL when the scan yields no device.
 */
struct pci_dev *pci_scan_single_device(struct pci_bus *bus, int devfn)
{
	struct pci_dev *found = pci_get_slot(bus, devfn);

	if (found) {
		/*
		 * Already known: release it with pci_dev_put() (presumably
		 * balancing a reference taken by pci_get_slot()) and hand
		 * back the existing device.
		 */
		pci_dev_put(found);
		return found;
	}

	/*
	 * Per this walkthrough: builds the pci_dev for the function and
	 * initialises its BAR and interrupt resources.
	 */
	found = pci_scan_device(bus, devfn);

	/* Per this walkthrough: initialises the device's capabilities. */
	if (found)
		pci_device_add(found, bus);

	return found;
}
pci_device_add()
  +-> pci_init_capabilities()
        +-> pci_ea_init
        +-> pci_configure_ari
        +-> pci_iov_init
        +-> ......
4、pci_scan_bridge_extend
- 该函数入参 bus 为 pci_bus 结构,表示一个总线资源。该函数首次被调用时,传递下来的是 root bus 结构
- 该函数作用为扫描并创建当前 bus (入参)的下级 bus
- 在 pci_add_new_bus 函数中会去创建新的 bus 结构
- dev 结构,是 pci 设备的 pci_dev 结构。注意,这里的 pci 设备只会是 pci 桥设备,不会是 endpoint 设备
- 返回值为新的 subordinate number(通过不断递归,得到当前 bus 下最深一级的 bus 号)
/** pci_scan_bridge_extend() - Scan buses behind a bridge* @bus: Parent bus the bridge is on* @dev: Bridge itself* @max: Starting subordinate number of buses behind this bridge* @available_buses: Total number of buses available for this bridge and* the devices below. After the minimal bus space has* been allocated the remaining buses will be* distributed equally between hotplug-capable bridges.* @pass: Either %0 (scan already configured bridges) or %1 (scan bridges* that need to be reconfigured.** If it's a bridge, configure it and scan the bus behind it.* For CardBus bridges, we don't scan behind as the devices will* be handled by the bridge driver itself.** We need to process bridges in two passes -- first we scan those* already configured by the BIOS and after we are done with all of* them, we proceed to assigning numbers to the remaining buses in* order to avoid overlaps between old and new bus numbers.** Return: New subordinate number covering all buses behind this bridge.*/
static int pci_scan_bridge_extend(struct pci_bus *bus, struct pci_dev *dev,int max, unsigned int available_buses,int pass)
{......pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses);primary = buses & 0xFF;secondary = (buses >> 8) & 0xFF;subordinate = (buses >> 16) & 0xFF;....../* 这里的 if 分支,含义是在 BIOS/Boot 没有配置的情况下,当前桥设备 pci_dev 的配置空间读出非 0 */if ((secondary || subordinate) && !pcibios_assign_all_busses() &&!is_cardbus && !broken) {unsigned int cmax;/** Bus already configured by firmware, process it in the* first pass and just note the configuration.*/if (pass)goto out;/** The bus might already exist for two reasons: Either we* are rescanning the bus or the bus is reachable through* more than one bridge. The second case can happen with* the i450NX chipset.*//* 这里会去为下一级 bus 创建 pci_bus 结构,下一级 bus 的 bus 号为 secondary */child = pci_find_bus(pci_domain_nr(bus), secondary);if (!child) {child = pci_add_new_bus(bus, dev, secondary);if (!child)goto out;child->primary = primary;pci_bus_insert_busn_res(child, secondary, subordinate);child->bridge_ctl = bctl;}/* 递归入口,这里的 child 已经下一级的 bus 了 */cmax = pci_scan_child_bus(child);if (cmax > subordinate)pci_warn(dev, "bridge has subordinate %02x but max busn %02x\n",subordinate, cmax);/* Subordinate should equal child->busn_res.end */if (subordinate > max)max = subordinate;} else {/* 这里的 else 分支,含义是在 BIOS/Boot 已经配置的情况下,或者当前桥设备 pci_dev 的配置空间读出为 0 *//** We need to assign a number to this bus which we always* do in the second pass.*/if (!pass) {if (pcibios_assign_all_busses() || broken || is_cardbus)/** Temporarily disable forwarding of the* configuration cycles on all bridges in* this bus segment to avoid possible* conflicts in the second pass between two* bridges programmed with overlapping bus* ranges.*//* * 这里是为了解决 bus 号冲突问题。因为 BIOS 已经配置好桥设备的 bus 资源,* 但因为现在操作系统又在重新配置,可能会和 BIOS 原先的配置有冲突,* 所以这里对 bus 资源先进行了一个复位操作,全写 0 */pci_write_config_dword(dev, PCI_PRIMARY_BUS,buses & ~0xffffff);goto out;}/* Clear errors */pci_write_config_word(dev, PCI_STATUS, 0xffff);/* Read bus numbers from EA Capability (if present) */fixed_buses = pci_ea_fixed_busnrs(dev, &fixed_sec, 
&fixed_sub);if (fixed_buses)next_busnr = fixed_sec;elsenext_busnr = max + 1; /* 更新下一级 bus 的 bus 号 *//** Prevent assigning a bus number that already exists.* This can happen when a bridge is hot-plugged, so in this* case we only re-scan this bus.*//* 这里会去为下一级 bus 创建 pci_bus 结构,下一级 bus 的 bus 号为 next_busnr*/child = pci_find_bus(pci_domain_nr(bus), next_busnr);if (!child) {child = pci_add_new_bus(bus, dev, next_busnr);if (!child)goto out;pci_bus_insert_busn_res(child, next_busnr,bus->busn_res.end);}max++;if (available_buses)available_buses--;/* 这里会去更新当前桥设备的 pri、sec、sub 寄存器(这里的 sub 还默认是 0xff) */buses = (buses & 0xff000000)| ((unsigned int)(child->primary) << 0)| ((unsigned int)(child->busn_res.start) << 8)| ((unsigned int)(child->busn_res.end) << 16);....../* We need to blast all three values with a single write */pci_write_config_dword(dev, PCI_PRIMARY_BUS, buses);if (!is_cardbus) {child->bridge_ctl = bctl;/* 递归入口,这里的 child 已经下一级的 bus 了 */max = pci_scan_child_bus_extend(child, available_buses);} else {......}/** Set subordinate bus number to its real value.* If fixed subordinate bus number exists from EA* capability then use it.*/if (fixed_buses)max = fixed_sub;pci_bus_update_busn_res_end(child, max);/* 递归结束,会根据递归得到的 max 值去修改 sub 寄存器的值 */pci_write_config_byte(dev, PCI_SUBORDINATE_BUS, max);}
......
}
5、举例
以上面这张图举例,Linux 下的 PCIe 设备枚举顺序为:
- 调用 pci_scan_child_bus_extend 创建 bus 2 上的设备,创建出 root bus 设备(2,0,0)
- 调用 pci_scan_bridge_extend 创建下一级 bus 3
- 再次调用 pci_scan_child_bus_extend,创建 bus 3 上的设备,创建出 upstream port(3,0,0)[这里开始第一次递归]
- 再调用 pci_scan_bridge_extend 创建下一级 bus 4
- 再次调用 pci_scan_child_bus_extend,创建 bus 4 上的设备,创建出 downstream port(4,1,0)、(4,2,0)…(4,18,0)[这里开始第二次递归]
要注意,这里会一次性创建出当前 bus 上所有扫描到的设备
- 再调用 pci_scan_bridge_extend 创建下一级 bus 5
- 再次调用 pci_scan_child_bus_extend,创建 bus 5 上的设备,创建出 endpoint 设备(5,0,0)[这里开始第一次递归返回,返回的位置就是 for_each_pci_bridge 这个循环]
如果 (5,0,0)是一个 switch 的话,当尝试创建 bus 6 时,实际上就等同于创建 upstream port (3,0,0)的过程,这时会接着往下深度遍历(不会往右遍历)。遍历结束,根据遍历的返回值(也就是递归的返回值),原图中的 bus 6,这时就有可能变成 bus 8、bus 9…
- 回到 bus 4 上的 for_each_pci_bridge 循环,对下一个 downstream port 再调用 pci_scan_bridge_extend 创建下一级 bus 6
- 再次调用 pci_scan_child_bus_extend,创建 bus 6 上的设备,创建出 endpoint 设备(6,0,0)
- …