概述
NCCL通过复杂的拓扑分析和性能模型来决定rank之间使用Ring、Tree还是CollNet连接。这个决策过程涉及硬件拓扑检测、性能建模、算法选择等多个步骤。
算法选择的核心流程
1. 拓扑图计算(ncclTopoCompute)
// src/graph/search.cc:967-1017
/*
 * ncclTopoCompute (abridged excerpt): adjusts the search settings stored in
 * `graph` based on the topology described by `system`.
 *
 * Reads:  system->nodes[GPU].count, system->nodes[NET].count, graph->pattern,
 *         graph->minChannels, graph->maxChannels.
 * Writes: graph->pattern (single-GPU case) and graph->minChannels (split-NVLink case).
 *
 * Helper calls are wrapped in NCCLCHECK; presumably that macro returns early
 * with the helper's error code on failure -- TODO confirm against its definition.
 *
 * NOTE(review): no return statement is visible here even though the function is
 * declared to return ncclResult_t, and crossNic / minTypeInter / maxTypeInter are
 * computed but never used in the lines shown. The rest of the upstream function
 * appears to have been omitted from this excerpt.
 */
ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph) {
int ngpus = system->nodes[GPU].count;
// 1. Decide whether cross-NIC connections are used.
// crossNic takes the value of ncclParamCrossNic() only when more than one NET
// node exists AND the pattern is RING, BALANCED_TREE or SPLIT_TREE; otherwise it is 0.
// (&& binds tighter than ?:, so the whole conjunction is the ternary's condition.)
int crossNic = (system->nodes[NET].count > 1) &&
(graph->pattern == NCCL_TOPO_PATTERN_RING ||
graph->pattern == NCCL_TOPO_PATTERN_BALANCED_TREE ||
graph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? ncclParamCrossNic() : 0;
// 2. Get the minimum/maximum path type between GPUs.
// The defaults below are kept as-is unless there is more than one GPU, in which
// case the intra-node bounds are overwritten by the two helper calls.
int minTypeIntra = PATH_LOC, minTypeInter = PATH_PIX;
int maxTypeIntra = PATH_SYS, maxTypeInter = PATH_SYS;
if (ngpus > 1) {
NCCLCHECK(ncclTopoGetGpuMinPath(system, GPU, &minTypeIntra));
NCCLCHECK(ncclTopoGetGpuMaxPath(system, GPU, &maxTypeIntra));
}
// 3. Special-case pattern selection.
// With a single GPU, every pattern other than RING is replaced by TREE.
if (ngpus == 1) {
if (graph->pattern != NCCL_TOPO_PATTERN_RING)
graph->pattern = NCCL_TOPO_PATTERN_TREE;
}
// 4. NVLink split detection.
int splitNvLink;
NCCLCHECK(ncclTopoSplitNvLink(system, &splitNvLink));
if (graph->pattern == NCCL_TOPO_PATTERN_RING && splitNvLink) {
// Two sockets, each with NVLink, joined by a slower link (e.g. QPI)
// -- NOTE(review): wording follows the original comment; confirm against ncclTopoSplitNvLink.
// The Tree algorithm is likely to do better here but needs at least 2 channels,
// so the Ring graph's minimum is raised from 1 to 2 when maxChannels allows it.
if (graph->maxChannels >= 2 && graph->minChannels == 1)
graph->minChannels = 2;
}
}
2. 性能调优模型(ncclTopoTuneModel)
// src/graph/tuning.cc:213-300
ncclResult_t ncclTopoTuneModel(struct ncclComm* comm, int minCompCap, int maxCompCap, struct ncclTopoGraph** graphs) {
int nNodes = comm->nNodes;
int nRanks = comm->nRanks;
// 1. 为不同算法设置线程数
comm->maxThreads[NCCL_ALGO_RING][NCCL_PROTO_SIMPLE] = simpleDefaultThreads;
comm->maxThreads[NCCL_ALGO_TREE][NCCL_PROTO_SIMPLE] = NCCL_SIMPLE_MAX_NTHREADS;
comm->maxThreads[NCCL_ALGO_COLLNET_DIRECT][NCCL_PROTO_SIMPLE] = NCCL_MAX_NTHREADS;
// 2. 计算硬件相关参数
int compCapIndex = minCompCap >= 100 ? BLACKWELL_COMPCAP_IDX :
(minCompCap >= 90 ? HOPPER_COMPCAP_IDX :
minCompCap >= 80 ? AMPERE_COMPCAP_IDX : VOLTA_COMPCAP_IDX);
// 3. 为每个集合通信操作和算法组合计算性能
for (int coll=0; coll<NCCL_NUM_FUNCTIONS; coll++) {
for (int a=0; a<NCCL_NUM_ALGORITHMS; a++) {
// 算法限制检查
if ((coll == ncclFuncBroadcast || coll == ncclFuncReduce) && a != NCCL_ALGO_RING) continue;
if ((coll == ncclFuncReduceScatter || coll == ncclFuncAllGather)
&& a != NCCL_ALGO_PAT && a != NCCL_ALGO_RING
&& a != NCCL_ALGO_NVLS && a != NCCL_ALGO_COLLNET_DIRECT) continue;
for (int p=0; p<NCCL_NUM_PROTOCOLS; p++) {
// 协议限制检查
if ((a == NCCL_ALGO_NVLS || a == NCCL_ALGO_NVLS_TREE) && p != NCCL_PROTO_SIMPLE) continue;
// 计算带宽和延迟
float bw = nNodes <= 2 || collnet ? graphs[a]->bwIntra : graphs[a]->bwInter;
float busBw = graphs[a]->nChannels * bw;
// 各种模型细化
if (a == NCCL_ALGO_RING && p == NCCL_PROTO_LL) {
busBw = st

最低0.47元/天 解锁文章
334

被折叠的 条评论
为什么被折叠?



