NCCL Path模块提供接口为Topo模块产生的的系统拓扑图的节点提供路径初始化。Path模块使用贪心算法(构造最少跳数且最大带宽的路径)对系统拓扑图中每一个节点到其他节点的路径进行初始化,为后续Search模块的逻辑拓扑搜索做准备。
连接和路径
路径和链接的概念不相同,链接是指相邻的节点使用何种方式连接(如PCI),路径是指任意两节点如何到达(如PHB,算PCIe互联,但路径上需经过CPU RC,因此受CPU RC P2P能力影响,需要特别表明)。Path模块主要考虑路径的初始化。
路径
路径的类型可以分为以下10种。
// Local (myself)
#define PATH_LOC 0
// Connection traversing NVLink
#define PATH_NVL 1
// Connection through NVLink using an intermediate GPU
#define PATH_NVB 2
// Connection traversing at most a single PCIe bridge
#define PATH_PIX 3
// Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge)
#define PATH_PXB 4
// Connection between a GPU and a NIC using an intermediate GPU. Used to enable rail-local, aggregated network send/recv operations.
#define PATH_PXN 5
// Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU)
#define PATH_PHB 6
// Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI)
#define PATH_SYS 7
// Connection through the network
#define PATH_NET 8
// Disconnected
#define PATH_DIS 9
节点的路径类型由Path模块的ncclTopoSetPaths设定,以下是设定路径类型的代码片段,展现了如果通过路径中间的链接类型确定路径类型的逻辑,注解在代码注释里。
// Start with path type = link type. PATH and LINK types are supposed to match.
// Don't consider LINK_NET as we only care about the NIC->GPU path.
int type = link->type == LINK_NET ? LINK_LOC : link->type;
// Consider a path going through multiple PCI switches as PATH_PXB
if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
// Consider a path going through the CPU as PATH_PHB
if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
// Consider a path going through a single NVLink as PATH_NVB
if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
// Update path types
remPath->type = std::max(path->type, type);
接口
Path模块的关键的接口是ncclTopoComputePaths接口,ncclTopoSetPaths接口和ncclTopoTrimSystem接口。正是通过这些接口完成系统拓扑图中节点的路径信息初始化。
ncclTopoComputePaths
ncclTopoComputePaths接口对所有节点(除了PCI)使用贪心算法完成节点路径Path的初始化。
ncclResult_t ncclTopoComputePaths(struct ncclTopoSystem* system, struct ncclComm* comm) {
// Remove all the paths that were computed before
for (int t=0; t<NCCL_TOPO_NODE_TYPES; t++) ncclTopoRemovePathType(system, t);
// Set direct paths to CPUs. We need them in many cases.
for (int c=0; c<system->nodes[CPU].count; c++) {
NCCLCHECK(ncclTopoSetPaths(system->nodes[CPU].nodes+c, system));
}
// Set direct paths to GPUs.
for (int g=0; g<system->nodes[GPU].count; g++) {
NCCLCHECK(ncclTopoSetPaths(system->nodes[GPU].nodes+g, system));
}
// Set direct paths to NICs.
for (int n=0; n<system->nodes[NET].count; n++) {
NCCLCHECK(ncclTopoSetPaths(system->nodes[NET].nodes+n, system));
}
// Set direct paths to NVSwitches.
for (int n=0; n<system->nodes[NVS].count; n++) {
NCCLCHECK(ncclTopoSetPaths(system->nodes[NVS].nodes+n, system));
}
// Update path for GPUs when we don't want to / can't use GPU Direct P2P
for (int g=0; g<system->nodes[GPU].count; g++) {
for (int p=0; p<system->nodes[GPU].count; p++) {
int p2p;
NCCLCHECK(ncclTopoCheckP2p(system, system->nodes[GPU].nodes[p].id, system->nodes[GPU].nodes[g].id, &p2p, NULL, NULL));
if (p2p == 0) {
// Divert all traffic through the CPU
int cpu;
NCCLCHECK(getLocalCpu(system, g, &cpu));
NCCLCHECK(addInterStep(system, CPU, cpu, GPU, p, GPU, g));
}
}
if (comm == NULL) continue;
// Remove GPUs we can't (or don't want to) communicate with through P2P or SHM
struct ncclPeerInfo* dstInfo = comm->peerInfo+system->nodes[GPU].nodes[g].gpu.rank;
for (int p=0; p<system->nodes[GPU].count; p++) {
if (p == g) continue;
struct ncclPeerInfo* srcInfo = comm->peerInfo+system->nodes[GPU].nodes[p].gpu.rank;
int p2p;
NCCLCHECK(ncclTransports[TRANSPORT_P2P]->canConnect(&p2p, system, NULL, srcInfo, dstInfo));
if (p2p == 0) {
int shm;
NCCLCHECK(ncclTransports[TRANSPORT_SHM]->canConnect(&shm, system, NULL, srcInfo, dstInfo));
if (shm == 0) {
// Mark this peer as inaccessible. We'll trim it later.
system->nodes[GPU].nodes[p].paths[GPU][g].type = PATH_NET;
}
}
}
}
// Update paths for NICs (no GPU Direct, PXN, ...)
for (int n=0; n<system->nodes[NET].count; n++) {
struct ncclTopoNode* netNode = system->nodes[NET].nodes+n;
for (int g=0; g<system->nodes[GPU].count; g++) {
// Check whether we can access the NIC through another NVLink-connected GPU (PXN)
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
if (ncclPxnDisable(comm) != 1) {
int localGpuIndex;
NCCLCHECK(ncclTopoGetLocalGpu(system, netNode->id, &localGpuIndex));
if (localGpuIndex != g && localGpuIndex != -1) {
// PXN = PCI + NVLink.
struct ncclTopoNode* peerNode = system->nodes[GPU].nodes+localGpuIndex;
// Only use PXN for NIC n if remote GPU p ...
if (peerNode->paths[NET][n].type <= PATH_PXB && // Is connected to the NIC through PCI
peerNode->paths[GPU][g].type <= PATH_NVL && // Is connected to us through NVLink
NCCL_TOPO_ID_SYSTEM_ID(peerNode->id) == NCCL_TOPO_ID_SYSTEM_ID(gpu->id) && // Is on the same node as us
(peerNode->paths[NET][n].bw > gpu->paths[NET][n].bw || // Has either higher BW to that NIC
gpu->paths[NET][n].type > PATH_PXB)) // or avoids going through a CPU
// We can use that GPU as relay to communicate with that NIC.
// Only enabling it in the GPU->NIC direction for now to favor
// receiving locally and sending remotely (consistent with net.cc)
NCCLCHECK(addInterStep(system, GPU, localGpuIndex, GPU, g, NET, n));
}
}
if (gpu->paths[NET][n].type < PATH_PHB) {
// Update path when we dont want to / can't use GPU Direct RDMA.
int gdr;
NCCLCHECK(ncclTopoCheckGdr(system, system->nodes[GPU].nodes[g].id, netNode->id, 0, &gdr));
if (gdr == 0) {
// We cannot use GPU Direct RDMA, divert all traffic through the CPU local to the GPU
int localCpu;
NCCLCHECK(getLocalCpu(system, g, &localCpu));
NCCLCHECK(addInterStep(system, CPU, localCpu, NET, n, GPU, g));
NCCLCHECK(addInterStep(system, CPU, localCpu, GPU, g, NET, n));
}
}
}
}
return ncclSuccess;
}
ncclTopoSetPaths
ncclTopoSetPaths接口将baseNode节点作为树的Root,由baseNode节点按照树形的方式展开,进行逐层遍历(树的广度优先搜索),搜索过程中如发现当前的节点的下一层节点通过当前节点去到Root有更优的路径(跳数小且带宽大)则更新当前节点的下一层节点到Root的路径。
// Set paths from all other nodes to baseNodes
static ncclResult_t ncclTopoSetPaths(struct ncclTopoNode* baseNode, struct ncclTopoSystem* system) {
// Allocate paths for this type of nodes if needed
if (baseNode->paths[baseNode->type] == NULL) {
NCCLCHECK(ncclCalloc(baseNode->paths+baseNode->type, system->nodes[baseNode->type].count));
}
// breadth-first search to set all paths to that node in the system
struct ncclTopoNodeList nodeList;
struct ncclTopoNodeList nextNodeList;
nodeList.count = 1; nodeList.list[0] = baseNode;
nextNodeList.count = 0;
// Set the basenNode loop back path
struct ncclTopoLinkList* basePath;
NCCLCHECK(getPath(system, baseNode, baseNode->type, baseNode->id, &basePath));
basePath->count = 0;
basePath->bw = LOC_BW;
basePath->type = PATH_LOC;
while (nodeList.count) {
nextNodeList.count = 0;
// Iterate over all nodes
for (int n=0; n<nodeList.count; n++) {
struct ncclTopoNode* node = nodeList.list[n];
struct ncclTopoLinkList* path;
// path: node --> ... --> baseNode
NCCLCHECK(getPath(system, node, baseNode->type, baseNode->id, &path));
// Iterate over all links, update the path if needed and add the remNode to the list for the next iteration
// Links with larger bandwidth sitted in the front of the list
for (int l=0; l<node->nlinks; l++) {
// link: node -> remNode
struct ncclTopoLink* link = node->links+l;
struct ncclTopoNode* remNode = link->remNode;
// Allocate paths for this type of nodes if needed
if (remNode->paths[baseNode->type] == NULL) {
NCCLCHECK(ncclCalloc(remNode->paths+baseNode->type, system->nodes[baseNode->type].count));
for (int i=0; i<system->nodes[baseNode->type].count; i++) remNode->paths[baseNode->type][i].type = PATH_DIS;
}
// remPath: remNode -> ... -> baseNode
struct ncclTopoLinkList* remPath;
NCCLCHECK(getPath(system, remNode, baseNode->type, baseNode->id, &remPath));
// remPath: remNode -> baseNode = remNode -> node -> baseNode, so bw is min(node -> remNode, node -> baseNode)
float bw = std::min(path->bw, link->bw);
// allow routing through a GPU only as 1 hop, otherwise skip path setting
if (node != baseNode && node->type == GPU &&
(ncclParamNvbDisable() || link->type != LINK_NVL || remNode->type != GPU || path->count > 1)) continue;
// remPath->bw == 0: remPath is not set yet, should be setted
// remPath->count > path->count: remPath is longer, could be shorter
// remPath->bw < bw: remPath is with less bandwidth, remPath need update
if ((remPath->bw == 0 || remPath->count > path->count) && remPath->bw < bw) {
// Find reverse link: remNode -> node
for (int l=0; l<remNode->nlinks; l++) {
if (remNode->links[l].remNode == node && remNode->links[l].type == link->type) {
// first step of remPath is: remNode -> node
remPath->list[0] = remNode->links+l;
break;
}
}
if (remPath->list[0] == NULL) {
WARN("Failed to find reverse path from remNode %d/%lx nlinks %d to node %d/%lx",
remNode->type, remNode->id, remNode->nlinks, node->type, node->id);
return ncclInternalError;
}
// Copy the rest of the path, remPath = remNode -> node -> ... -> baseNode
for (int i=0; i<path->count; i++) remPath->list[i+1] = path->list[i];
remPath->count = path->count + 1;
remPath->bw = bw;
// Start with path type = link type. PATH and LINK types are supposed to match.
// Don't consider LINK_NET as we only care about the NIC->GPU path.
int type = link->type == LINK_NET ? LINK_LOC : link->type;
// Differentiate between one and multiple PCI switches
// Consider a path going through multiple PCI switches as PATH_PXB
if (node->type == PCI && remNode->type == PCI) type = PATH_PXB;
// Consider a path going through the CPU as PATH_PHB
if (link->type == LINK_PCI && (node->type == CPU || link->remNode->type == CPU)) type = PATH_PHB;
// Consider a path going through a single NVLink as PATH_NVB
if (node->type == GPU && path->type == PATH_NVL && type == PATH_NVL && remPath->count > 1) type = PATH_NVB;
// Update path types
remPath->type = std::max(path->type, type);
// Add to the list for the next iteration if not already in the lists
int i;
// Find the index of remNode in nextNodeList
for (i=0; i<nextNodeList.count; i++) if (nextNodeList.list[i] == remNode) break;
// If not found, add it to the list
if (i == nextNodeList.count) nextNodeList.list[nextNodeList.count++] = remNode;
}
}
}
// searh next level
memcpy(&nodeList, &nextNodeList, sizeof(nodeList));
}
return ncclSuccess;
}
ncclTopoTrimSystem
ncclTopoTrimSystem接口移除无法访问的GPU节点,如果所有GPU均在同一机器中则移除NIC节点(单机无须NIC参与),从而精简图节点进而减少搜索空间。
// Trim the topology to remove GPUs that are not accessible, remove all NICs if all GPUs are on the same domain
ncclResult_t ncclTopoTrimSystem(struct ncclTopoSystem* system, struct ncclComm* comm) {
int *domains;
int64_t *ids;
NCCLCHECK(ncclCalloc(&domains, system->nodes[GPU].count));
NCCLCHECK(ncclCalloc(&ids, system->nodes[GPU].count));
int myDomain = 0;
for (int g=0; g<system->nodes[GPU].count; g++) {
struct ncclTopoNode* gpu = system->nodes[GPU].nodes+g;
domains[g] = g;
ids[g] = gpu->id;
for (int p=0; p<g; p++) {
// If we have a local path to another GPU, we are in the same domain
if (gpu->paths[GPU][p].type < PATH_NET) {
domains[g] = std::min(domains[g], domains[p]);
}
}
if (gpu->gpu.rank == comm->rank) myDomain = domains[g];
}
int ngpus = system->nodes[GPU].count;
for (int i=0; i<ngpus; i++) {
if (domains[i] == myDomain) continue;
struct ncclTopoNode* gpu = NULL;
int g;
for (g=0; g<system->nodes[GPU].count /* This one varies over the loops */; g++) {
gpu = system->nodes[GPU].nodes+g;
if (gpu->id == ids[i]) break; else gpu=NULL;
}
if (gpu == NULL) {
WARN("Could not find id %lx", ids[i]);
free(domains);
free(ids);
return ncclInternalError;
}
// Remove all paths to/from this GPU
NCCLCHECK(ncclTopoRemoveNode(system, GPU, g));
}
// Remove all NICs if all GPUs are on the same domain
if (system->nodes[GPU].count == comm->nRanks) {
for (int n=system->nodes[NET].count-1; n>=0; n--)
NCCLCHECK(ncclTopoRemoveNode(system, NET, n));
}
free(domains);
free(ids);
return ncclSuccess;
}