通过Search模块针对预设的拓扑进行搜索之后,可得到不同拓扑的通道以及每个通道涉及的GPU或者NET节点。Connect模块将这些节点根据逻辑拓扑类型连接起来,完成逻辑拓扑构建。
数据结构
ncclTopoRanks是Connect模块主要的数据结构,用于收集各Rank的拓扑信息。
// Per-rank topology summary. Each rank fills its own copy in ncclTopoPreset;
// ncclTopoPostset then reads the copies of all ranks to link the nodes together.
// Arrays are indexed by channel unless noted otherwise.
struct ncclTopoRanks {
int ringRecv[MAXCHANNELS]; // First rank of this node's intra-node ring order (ringIntra[0])
int ringSend[MAXCHANNELS]; // Last rank of this node's intra-node ring order (ringIntra[localRanks-1])
int ringPrev[MAXCHANNELS]; // Rank just before this rank in the intra-node ring order, -1 if this rank is first
int ringNext[MAXCHANNELS]; // Rank just after this rank in the intra-node ring order, -1 if this rank is last
int treeToParent[MAXCHANNELS]; // Local rank that gets linked up to the parent node (treeIntra[0])
int treeToChild0[MAXCHANNELS]; // Local rank that gets linked down to the first child node (treeIntra[0] or treeIntra[1], depending on the tree pattern)
int treeToChild1[MAXCHANNELS]; // Local rank that gets linked down to the second child node (treeIntra[0] or treeIntra[1], depending on the tree pattern)
int nvlsHeads[MAXCHANNELS]; // Distinct NVLS head ranks in discovery order (not per channel); only the first nvlsHeadNum entries are set
int nvlsHeadNum; // Number of valid entries in nvlsHeads
};
接口
Connect模块的主要接口是ncclTopoPreset和ncclTopoPostset。ncclTopoPreset用于初始化ncclTopoRanks结构并完成Intra节点相关信息构建;ncclTopoPostset则在完成ncclTopoPreset并通过BootstrapNet接口完成跨节点ncclTopoRanks数据同步后,完成逻辑拓扑的构建。
ncclTopoPreset
各Rank通过ncclTopoPreset根据搜索出的ncclTopoGraph信息配置ncclTopoRanks。该函数主要完成ncclTopoRanks的初始化操作和Intra节点的连接工作,最后将已配置好的前nChannels个Channel整体复制一份到其后的nChannels个Channel中,为Channel数翻倍做准备(comm->nChannels本身要到ncclTopoPostset中才更新为两倍)。comm->nChannels此前是通过comm->nChannels = std::min(treeGraph.nChannels, ringGraph.nChannels);初始化的,因为ringGraph.maxChannels = MAXCHANNELS/2;所以翻倍后的Channel数不会超过MAXCHANNELS。Channel数即并行度,Channel数越多通信数据切片越细并行度越高,可同时使用的处理器越多。
// Resets this rank's channels and records its intra-node position in topoRanks.
//
// comm      : communicator whose first comm->nChannels channels are (re)initialised
// graphs    : searched graphs, indexed by NCCL_ALGO_*; the intra[] order of each
//             channel is read as nChannels consecutive groups of localRanks entries
// topoRanks : output; per-channel ring/tree info for this rank plus the list of
//             distinct NVLS heads
//
// After the per-channel setup the first nChannels channels are copied into the
// next nChannels slots. comm->nChannels itself is not modified here.
// Always returns ncclSuccess.
ncclResult_t ncclTopoPreset(struct ncclComm* comm, struct ncclTopoGraph** graphs, struct ncclTopoRanks* topoRanks) {
  const int myRank = comm->rank;
  const int nLocal = comm->topo->nodes[GPU].count;
  const int nNvls = comm->MNNVL ? comm->clique.size : nLocal;
  const int nCh = comm->nChannels;

  topoRanks->nvlsHeadNum = 0;

  for (int ch = 0; ch < nCh; ++ch) {
    struct ncclChannel* chan = &comm->channels[ch];

    // Mark every connection of this channel as "not connected".
    chan->ring.prev = -1;
    chan->ring.next = -1;
    chan->tree.up = -1;
    chan->collnetChain.up = -1;
    for (int k = 0; k < NCCL_MAX_TREE_ARITY; ++k) {
      chan->tree.down[k] = -1;
      chan->collnetChain.down[k] = -1;
    }
    chan->collnetDirect.out = -1;
    chan->collnetDirect.headRank = -1;
    chan->collnetDirect.nHeads = 0;
    chan->collnetDirect.shift = 0;
    for (int k = 0; k < NCCL_MAX_DIRECT_ARITY; ++k) {
      chan->collnetDirect.heads[k] = -1;
      chan->collnetDirect.up[k] = -1;
      chan->collnetDirect.down[k] = -1;
    }
    // heads[] has one more entry than up[]/down[].
    chan->collnetDirect.heads[NCCL_MAX_DIRECT_ARITY] = -1;

    // Intra-node rank order of this channel for each algorithm.
    struct ncclTopoGraph* treeGraph = graphs[NCCL_ALGO_TREE];
    int* ringOrder = graphs[NCCL_ALGO_RING]->intra + ch*nLocal;
    int* treeOrder = treeGraph->intra + ch*nLocal;
    int* chainOrder = graphs[NCCL_ALGO_COLLNET_CHAIN]->intra + ch*nLocal;

    // Which local position talks to each child node depends on the tree pattern.
    const int child0Slot = (treeGraph->pattern == NCCL_TOPO_PATTERN_TREE) ? 0 : 1;
    const int child1Slot = (treeGraph->pattern == NCCL_TOPO_PATTERN_SPLIT_TREE) ? 1 : 0;

    // Only the entries that name this rank matter.
    for (int pos = 0; pos < nLocal; ++pos) {
      const int atFront = (pos == 0);
      const int atBack = (pos == nLocal-1);

      if (ringOrder[pos] == myRank) {
        topoRanks->ringRecv[ch] = ringOrder[0];
        topoRanks->ringSend[ch] = ringOrder[nLocal-1];
        // The -1 placeholders at both ends are filled in once nodes are linked.
        topoRanks->ringPrev[ch] = atFront ? -1 : ringOrder[pos-1];
        topoRanks->ringNext[ch] = atBack ? -1 : ringOrder[pos+1];
      }

      if (treeOrder[pos] == myRank) {
        topoRanks->treeToParent[ch] = treeOrder[0];
        topoRanks->treeToChild0[ch] = treeOrder[child0Slot];
        topoRanks->treeToChild1[ch] = treeOrder[child1Slot];
        // Inside the node the tree is a chain following the intra order.
        chan->tree.up = atFront ? -1 : treeOrder[pos-1];
        chan->tree.down[0] = atBack ? -1 : treeOrder[pos+1];
      }

      if (chainOrder[pos] == myRank) {
        // The front of the chain uses nRanks as its "up" value.
        chan->collnetChain.up = atFront ? comm->nRanks : chainOrder[pos-1];
        chan->collnetChain.down[0] = atBack ? -1 : chainOrder[pos+1];
      }
    }
  }

  // Copy the configured channels into the following nCh slots.
  memcpy(comm->channels + nCh, comm->channels, nCh*sizeof(struct ncclChannel));

  // Collect the first rank of every NVLS channel, skipping ranks already listed.
  struct ncclTopoGraph* nvlsGraph = graphs[NCCL_ALGO_NVLS];
  for (int ch = 0; ch < nvlsGraph->nChannels; ++ch) {
    const int head = nvlsGraph->intra[ch * nNvls];
    int k = 0;
    while (k < topoRanks->nvlsHeadNum && topoRanks->nvlsHeads[k] != head) ++k;
    if (k == topoRanks->nvlsHeadNum) {
      topoRanks->nvlsHeads[topoRanks->nvlsHeadNum++] = head;
    }
  }

  return ncclSuccess;
}
ncclTopoPostset
调用ncclTopoPostset之前,各Rank已通过Bootstrap的AllGather获取所有Rank的ncclTopoRanks信息,并完成了comm->nNodes,comm->rankToNode,comm->rankToLocalRank,comm->nodeRanks,comm->maxLocalRanks以及firstRanks的初始化。ncclTopoPostset据此将Inter节点连接起来,完成逻辑拓扑的构建,最后根据用户配置的环境变量如NCCL_MIN_NRINGS/NCCL_MAX_NRINGS等信息调整Channel数,即并行度,作为后续Kernel调度时的超参。
// Allocates the scratch tables ncclTopoPostset works on.
// Tables indexed by node get nNodes*MAXCHANNELS entries, tables indexed by rank
// get nranks*MAXCHANNELS entries. Returns at the first failing allocation; tables
// allocated before that point stay allocated and remain owned by the caller.
// NOTE(review): the caller relies on a failed ncclCalloc leaving the target
// pointer untouched (still NULL) — confirm against ncclCalloc's implementation.
static ncclResult_t topoPostsetAllocTables(int nNodes, int nranks,
    int** ringRecv, int** ringSend, int** ringPrev, int** ringNext,
    int** treeToParent, int** treeToChild0, int** treeToChild1, int** nvlsHeads) {
  NCCLCHECK(ncclCalloc(ringRecv, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(ringSend, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(ringPrev, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(ringNext, nranks*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(treeToParent, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(treeToChild0, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(treeToChild1, nNodes*MAXCHANNELS));
  NCCLCHECK(ncclCalloc(nvlsHeads, nNodes*MAXCHANNELS));
  return ncclSuccess;
}

// Does the actual work of ncclTopoPostset on caller-provided scratch tables.
// It never frees the tables, so it may return early on any error.
static ncclResult_t topoPostsetConnect(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent,
    int* ringRecv, int* ringSend, int* ringPrev, int* ringNext,
    int* treeToParent, int* treeToChild0, int* treeToChild1, int* nvlsHeads) {
  int nranks = comm->nRanks;
  int nNodes = comm->nNodes;
  int nChannels = comm->nChannels;
  int minHeadNum = INT_MAX;
  int shared = parent && parent->nvlsSupport && parent->config.splitShare;

  // Alternate rings to avoid crossing rails
  if (graphs[NCCL_ALGO_RING]->crossNic && (nChannels % 2) == 0) {
    for (int r=0; r<comm->nRanks; r++) {
      // for odd nodes, exchange rings
      if (comm->rankToNode[r] % 2 == 1) {
        // exchange 2 channels at a time
        for (int c=0; c<nChannels; c+=2) {
          exchangeValues(allTopoRanks[r]->ringRecv+c, allTopoRanks[r]->ringRecv+(c^1));
          exchangeValues(allTopoRanks[r]->ringSend+c, allTopoRanks[r]->ringSend+(c^1));
          exchangeValues(allTopoRanks[r]->ringPrev+c, allTopoRanks[r]->ringPrev+(c^1));
          exchangeValues(allTopoRanks[r]->ringNext+c, allTopoRanks[r]->ringNext+(c^1));
        }
      }
    }
  }

  // Gather data from all ranks into channel-major tables
  for (int c=0; c<nChannels;c++) {
    // Nodewise copy: one entry per node, taken from that node's first rank
    for (int n=0; n<nNodes; n++) {
      int r = firstRanks[n];
      ringRecv[c*nNodes+n] = allTopoRanks[r]->ringRecv[c];
      ringSend[c*nNodes+n] = allTopoRanks[r]->ringSend[c];
      treeToParent[c*nNodes+n] = allTopoRanks[r]->treeToParent[c];
      treeToChild0[c*nNodes+n] = allTopoRanks[r]->treeToChild0[c];
      treeToChild1[c*nNodes+n] = allTopoRanks[r]->treeToChild1[c];
    }
    // Rankwise copy
    for (int r=0; r<nranks; r++) {
      ringPrev[c*nranks+r] = allTopoRanks[r]->ringPrev[c];
      ringNext[c*nranks+r] = allTopoRanks[r]->ringNext[c];
    }
  }

  // Only as many NVLS heads as the node with the fewest heads are used
  for (int n = 0; n < nNodes; n++) {
    int r = firstRanks[n];
    if (minHeadNum > allTopoRanks[r]->nvlsHeadNum)
      minHeadNum = allTopoRanks[r]->nvlsHeadNum;
  }
  for (int c = 0; c < minHeadNum; c++) {
    // Nodewise copy
    for (int n = 0; n < nNodes; n++) {
      int r = firstRanks[n];
      nvlsHeads[c * nNodes + n] = allTopoRanks[r]->nvlsHeads[c];
    }
  }

  // Connect rings and trees. This should also duplicate the channels.
  NCCLCHECK(connectRings(comm, ringRecv, ringSend, ringPrev, ringNext)); // modify ringPrev/ringNext for head/tail
  NCCLCHECK(connectTrees(comm, treeToParent, treeToChild0, treeToChild1, treePatterns));

  // Duplicate ringPrev/ringNext for ncclBuildRing
  memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
  memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));

  // Set ring prev/next for my rank
  for (int c=0; c<nChannels; c++) {
    struct ncclChannel* channel0 = comm->channels+c;
    struct ncclChannel* channel1 = channel0+nChannels;
    channel0->ring.prev = channel1->ring.prev = ringPrev[c*nranks+comm->rank];
    channel0->ring.next = channel1->ring.next = ringNext[c*nranks+comm->rank];
  }

  // Duplication should be complete now
  nChannels = comm->nChannels = std::min(MAXCHANNELS,nChannels*2);

  // Setup CollNet
  if (comm->collNetSupport == 1) {
    struct ncclTopoGraph* collNetGraph = graphs[NCCL_ALGO_COLLNET_DIRECT];
    // Add more channels to saturate intra-node bandwidth, except the 1 PPN case
    if (collNetGraph->bwIntra > collNetGraph->bwInter && comm->nRanks > comm->nNodes) {
      int collNetNchannels = std::min(MAXCHANNELS, nChannels+nChannels/2);
      nChannels = comm->nChannels = copyChannels(comm, nChannels, collNetNchannels, ringPrev, ringNext);
    }
    NCCLCHECK(connectCollNet(comm, collNetGraph));
  }

  // Might increase Ring channels if we have high BW intra-node
  // Use 4 compute channels per search channel to reach peak BW on <8 PPN
  if (comm->minCompCap == 90 && comm->nNodes > 1 && graphs[NCCL_ALGO_RING]->bwIntra > 45.0 && nChannels < 16) {
    nChannels = comm->nChannels = copyChannels(comm, nChannels, 2*nChannels, ringPrev, ringNext);
  }

  // Honor NCCL_MIN_NRINGS/NCCL_MAX_NRINGS.
  // We permit combining max, then min, to only use the first channels, then duplicate them.
  if (comm->sharedRes->owner != comm) {
    /* child comm #channels cannot exceed top parent #channels. */
    nChannels = comm->nChannels = std::min(std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs), comm->sharedRes->tpNChannels);
    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::min(std::max(ncclMinNchannels(), comm->config.minCTAs), comm->sharedRes->tpNChannels), ringPrev, ringNext);
  } else {
    nChannels = comm->nChannels = std::min(std::min(ncclMaxNchannels(), nChannels), comm->config.maxCTAs);
    nChannels = comm->nChannels = copyChannels(comm, nChannels, std::max(ncclMinNchannels(), comm->config.minCTAs), ringPrev, ringNext);
  }

  comm->collChannels = comm->nChannels;
#if CUDART_VERSION >= 12010
  // Support maximal channel usage for aggregation
  if (shared && comm->nvlsChannels > parent->nvlsResources->nChannels) {
    comm->nvlsChannels = parent->nvlsResources->nChannels;
  }
  if (comm->nChannels < comm->nvlsChannels) {
    nChannels = comm->nChannels = copyChannels(comm, comm->nChannels, comm->nvlsChannels, ringPrev, ringNext);
  }
  NCCLCHECK(connectNvls(comm, nvlsHeads, minHeadNum));
#endif
  if (shared && comm->nChannels > parent->sharedRes->tpNChannels) {
    nChannels = comm->nChannels = parent->sharedRes->tpNChannels;
    comm->collChannels = std::min(comm->collChannels, comm->nChannels);
  }

  // Create rings array and check all is fine
  NCCLCHECK(ncclBuildRings(nChannels, rings, comm->rank, comm->nRanks, ringPrev, ringNext));
  return ncclSuccess;
}

// Links the nodes together once the ncclTopoRanks of every rank are available,
// then settles the final channel count.
//
// comm         : communicator being set up; its channels, nChannels and
//                collChannels (and possibly nvlsChannels) are updated
// firstRanks   : for each node, the rank whose ncclTopoRanks represents that node
// treePatterns : forwarded to connectTrees
// allTopoRanks : ncclTopoRanks of every rank, indexed by rank; ring entries may be
//                swapped pairwise in place for odd nodes when crossNic is set
// rings        : output of ncclBuildRings
// graphs       : searched graphs, indexed by NCCL_ALGO_*
// parent       : parent communicator, or NULL
//
// Returns ncclSuccess, or the first error reported by an allocation or by one of
// the connect/build steps. The scratch tables are released on every path; the
// previous implementation leaked them whenever a step failed.
ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, int* treePatterns, struct ncclTopoRanks** allTopoRanks, int* rings, struct ncclTopoGraph** graphs, struct ncclComm* parent) {
  int *ringRecv = NULL, *ringSend = NULL, *ringPrev = NULL, *ringNext = NULL;
  int *treeToParent = NULL, *treeToChild0 = NULL, *treeToChild1 = NULL, *nvlsHeads = NULL;

  ncclResult_t ret = topoPostsetAllocTables(comm->nNodes, comm->nRanks,
      &ringRecv, &ringSend, &ringPrev, &ringNext,
      &treeToParent, &treeToChild0, &treeToChild1, &nvlsHeads);
  if (ret == ncclSuccess) {
    ret = topoPostsetConnect(comm, firstRanks, treePatterns, allTopoRanks, rings, graphs, parent,
        ringRecv, ringSend, ringPrev, ringNext,
        treeToParent, treeToChild0, treeToChild1, nvlsHeads);
  }

  // free(NULL) is a no-op, so tables that were never allocated are fine here.
  free(ringRecv);
  free(ringSend);
  free(ringPrev);
  free(ringNext);
  free(treeToParent);
  free(treeToChild0);
  free(treeToChild1);
  free(nvlsHeads);
  return ret;
}
connectRings
connectRings根据ringSend和ringRecv完成Inter节点的ringPrev和ringNext连接。Intra节点的连接在ncclTopoPreset时完成。
// Joins the per-node ring segments into one closed ring per channel.
//
// ringRecv / ringSend : per channel, nNodes entries giving the first / last rank
//                       of each node's segment
// ringPrev / ringNext : per channel, nRanks entries; the entries of the segment
//                       ends are overwritten here, everything else is left alone
//
// For node n, the first rank's predecessor becomes the last rank of node n-1 and
// the last rank's successor becomes the first rank of node n+1, both wrapping
// around. Always returns ncclSuccess.
static ncclResult_t connectRings(struct ncclComm* comm, int* ringRecv, int* ringSend, int* ringPrev, int* ringNext) {
  const int channelCount = comm->nChannels;
  const int nodeCount = comm->nNodes;

  for (int ch = 0; ch < channelCount; ++ch) {
    // Slices of the tables that belong to this channel.
    int* segHead = ringRecv + ch*comm->nNodes;
    int* segTail = ringSend + ch*comm->nNodes;
    int* prevOf = ringPrev + ch*comm->nRanks;
    int* nextOf = ringNext + ch*comm->nRanks;

    for (int node = 0; node < nodeCount; ++node) {
      const int nodeBefore = (node == 0) ? nodeCount-1 : node-1;
      const int nodeAfter = (node+1 == nodeCount) ? 0 : node+1;

      // Head of this segment receives from the tail of the previous node.
      prevOf[segHead[node]] = segTail[nodeBefore];
      // Tail of this segment sends to the head of the next node.
      nextOf[segTail[node]] = segHead[nodeAfter];
    }
  }
  return ncclSuccess;
}
connectTrees
connectTrees完成Inter节点的连接,通过ncclGetDtree函数查询Double Binary Tree的下标,通过setTreeUp和setTreeDown函数将树连接起来。树的Intra节点连接亦在ncclTopoPreset时完成。
// Links this rank into the inter-node double binary tree of every channel.
//
// treeToParent / treeToChild0 / treeToChild1 : per channel, nNodes entries naming
//     the rank on each node that links to the parent / first child / second child
// treePatterns : not used by this function
//
// Channel c carries the first tree and channel c+nChannels the second one. Only
// a rank that is its node's parent link or one of its child links gets inter-node
// connections; every channel gets the same approximate depth.
// Returns the first error from ncclGetDtree/setTreeUp/setTreeDown, else ncclSuccess.
static ncclResult_t connectTrees(struct ncclComm* comm, int* treeToParent, int* treeToChild0, int* treeToChild1, int* treePatterns) {
  const int nChannels = comm->nChannels;
  const int nNodes = comm->nNodes;
  const int myNode = comm->node;
  const int myRank = comm->rank;

  // Not exact, but a good approximation in most cases.
  const int treeDepth = comm->nRanks/nNodes - 1 + log2i(nNodes);

  // Node-level parent/children and child type of this node in both trees.
  int up0, down0a, down0b, kind0;
  int up1, down1a, down1b, kind1;
  NCCLCHECK(ncclGetDtree(nNodes, myNode, &up0, &down0a, &down0b, &kind0, &up1, &down1a, &down1b, &kind1));

  for (int ch = 0; ch < nChannels; ++ch) {
    struct ncclChannel* first = &comm->channels[ch];
    struct ncclChannel* second = &comm->channels[ch + nChannels];
    int* toParent = treeToParent + ch*nNodes;
    int* toChild0 = treeToChild0 + ch*nNodes;
    int* toChild1 = treeToChild1 + ch*nNodes;

    if (toParent[myNode] == myRank) {
      // The child type selects which table holds the parent node's rank for us.
      NCCLCHECK(setTreeUp(&first->tree, kind0 == 0 ? toChild0 : toChild1, up0));
      NCCLCHECK(setTreeUp(&second->tree, kind1 == 0 ? toChild0 : toChild1, up1));
    }
    if (toChild0[myNode] == myRank) {
      NCCLCHECK(setTreeDown(&first->tree, toParent, down0a));
      NCCLCHECK(setTreeDown(&second->tree, toParent, down1a));
    }
    if (toChild1[myNode] == myRank) {
      NCCLCHECK(setTreeDown(&first->tree, toParent, down0b));
      NCCLCHECK(setTreeDown(&second->tree, toParent, down1b));
    }

    if (toParent[myNode] == myRank || toChild0[myNode] == myRank || toChild1[myNode] == myRank) {
      INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", ch, first->tree.up, myRank, first->tree.down[0], first->tree.down[1], first->tree.down[2]);
      INFO(NCCL_GRAPH, "Tree %d : %d -> %d -> %d/%d/%d", ch+nChannels, second->tree.up, myRank, second->tree.down[0], second->tree.down[1], second->tree.down[2]);
    }

    first->tree.depth = treeDepth;
    second->tree.depth = treeDepth;
  }
  return ncclSuccess;
}
ncclGetDtree
ncclGetDtree用于查询Double Binary Tree的节点下标信息。输入nranks和rank,输出两份up节点和down节点的下标,分别对应Double Binary Tree的两棵树。其内部实现是通过两次调用ncclGetBtree完成,每次调用获得一棵树的up节点和down节点下标。
Double Binary Tree的想法主要是利用好节点之间的双向带宽,比如做Reduce操作,所有节点将数据汇总到根节点,例子中第一棵树是奇数节点向偶数节点发送数据,第二棵树是偶数节点向奇数节点发送数据,如此这般两棵树的数据传输方向相反,在物理链路上是可以同时执行的,以此提高总线带宽利用率。
因此,ncclGetDtree主要便是要构造两棵可以上下行带宽错开的树,该函数针对nranks的奇偶性对第二棵树提供了不同的生成方式:对于奇数个节点,如图例所示,只需将rank数列往大了移动一个数值即可,偶数加一为奇数,奇数加一为偶数,如此奇偶位置互换;对于偶数个节点,则是生成镜像树,第一棵树的某一个节点和镜像树对应节点下标加起来是一个奇数恒定值(图例里是11),奇数=奇数+偶数,镜像位置节点的下标奇偶性必然不同,确保生成的树可以充分利用上下行带宽。
以上算法满足构造一棵奇数节点均是叶子节点而另一棵树奇数节点均是非叶子节点的两棵树,便于充分利用总线的上行和下行带宽。笔者认为只要能生成两棵上下行带宽错开的树即可,此处只是一种算法。
/* Build a double binary tree.
 * The first tree is the plain btree produced by ncclGetBtree.
 * The second tree is the mirror image of the first one when nranks is even
 * (rank r maps to nranks-1-r) :
 *
 * 0---------------8                   3----------------11
 *          ______/ \                 / \______
 *         4         \               /         7
 *       /   \        \             /        /   \
 *     2       6       10         1        5      9
 *    / \     / \     /  \       / \      / \    / \
 *   1   3   5   7   9   11     0   2    4   6  8   10
 *
 * and the first tree shifted by one rank when nranks is odd
 * (rank r maps to (r+1) % nranks) :
 *
 * 0---------------8            1---------------9
 *          ______/ \______              ______/ \______
 *         4               12           5                0
 *       /   \            /           /   \            /
 *     2       6       10           3       7       11
 *    / \     / \     /  \         / \     / \     /  \
 *   1   3   5   7   9   11       2   4   6   8  10   12
 *
 * Outputs for tree 0 : s0 (parent), d0_0/d0_1 (children), parentChildType0.
 * Outputs for tree 1 : s1 (parent), d1_0/d1_1 (children), parentChildType1.
 * A missing parent or child is reported as -1. Always returns ncclSuccess.
 */
ncclResult_t ncclGetDtree(int nranks, int rank, int* s0, int* d0_0, int* d0_1, int* parentChildType0, int* s1, int* d1_0, int* d1_1, int* parentChildType1) {
  // Tree 0 is the btree as is.
  ncclGetBtree(nranks, rank, s0, d0_0, d0_1, parentChildType0);

  // Tree 1: translate our rank into the btree's numbering, look it up there,
  // then translate the answers back.
  const int oddCount = (nranks % 2 == 1);
  const int btreeRank = oddCount ? (rank-1+nranks) % nranks : nranks-1-rank;

  int parent, childA, childB;
  ncclGetBtree(nranks, btreeRank, &parent, &childA, &childB, parentChildType1);

  if (oddCount) {
    // Undo the shift; -1 ("none") passes through unchanged.
    *s1 = (parent == -1) ? -1 : (parent+1) % nranks;
    *d1_0 = (childA == -1) ? -1 : (childA+1) % nranks;
    *d1_1 = (childB == -1) ? -1 : (childB+1) % nranks;
  } else {
    // Undo the mirroring; -1 ("none") passes through unchanged.
    *s1 = (parent == -1) ? -1 : nranks-1-parent;
    *d1_0 = (childA == -1) ? -1 : nranks-1-childA;
    *d1_1 = (childB == -1) ? -1 : nranks-1-childB;
  }
  return ncclSuccess;
}
ncclGetBtree
ncclGetBtree是Binary Tree的生成部分,输入nranks和rank,输出up节点下标和两个down节点下标。生成树算法将第一个下标作为Root节点,剩下的其他节点构造成一棵满二叉树,满二叉树的根节点向Root节点连接,完成Binary Tree的构造。
如下图所示,给定一个nranks,这棵二叉树便确定了;再给定一个rank,即可根据规律将该rank的up和down节点下标计算出来。
代码注释中总结了查找up节点和down节点的规律。除开0000,树的非叶子节点在忽略末尾多余的0的前提下,均以10结尾,而叶子节点则不以10结尾。忽略末尾多余的0,01节点和11节点的parent即up节点是10(特殊情况:01节点找up节点10时若发现up节点下标大于等于nranks,则使用00替代);忽略末尾多余的0,10节点的两个孩子分别是01和11,如果节点不以10结尾则意味着是叶子节点。根据以上规律,给定一个rank,将其转换成二进制后即可换算出up和down的下标。
/* Btree which alternates leaves and nodes.
 * Assumes root is 0, which conveniently builds a tree on powers of two,
 * (because we have pow2-1 ranks) which lets us manipulate bits.
 * Find first non-zero bit, then :
 * Find the parent :
 *   xx01[0] -> xx10[0] (1,5,9 below) or xx00[0] if xx10[0] is out of bounds (13 below)
 *   xx11[0] -> xx10[0] (3,7,11 below)
 * Find the children :
 *   xx10[0] -> xx01[0] (2,4,6,8,10,12) or -1 (1,3,5,7,9,11,13)
 *   xx10[0] -> xx11[0] (2,4,6,8,10) or xx101[0] (12) or xx1001[0] ... or -1 (1,3,5,7,9,11,13)
 *
 * Illustration :
 * 0---------------8
 *          ______/ \______
 *         4               12
 *       /   \            /  \
 *     2       6       10     \
 *    / \     / \     /  \     \
 *   1   3   5   7   9   11    13
 *
 * Parameters :
 *   nranks, rank    : size of the tree and the rank to look up
 *   u               : out, parent of rank, -1 for the root
 *   d0, d1          : out, first and second child of rank, -1 when absent
 *   parentChildType : out, 0 if rank is smaller than its parent (first child),
 *                     1 otherwise; 0 for the root, which has no parent
 * Always returns ncclSuccess.
 */
ncclResult_t ncclGetBtree(int nranks, int rank, int* u, int* d0, int* d1, int* parentChildType) {
  // Lowest set bit of rank. For rank 0 no bit matches and the loop stops at the
  // first power of two that is >= nranks.
  int bit;
  for (bit=1; bit<nranks; bit<<=1) {
    if (bit & rank) break;
  }

  if (rank == 0) {
    *u = -1;
    *d0 = -1;
    // Child rank is > 0 so it has to be our child 1, not 0.
    *d1 = nranks > 1 ? bit >> 1 : -1;
    // The root is nobody's child. Still write a defined value so that callers
    // reading the type never see an indeterminate one.
    *parentChildType = 0;
    return ncclSuccess;
  }

  // Parent: clear our lowest set bit and set the next higher one; if that is
  // out of bounds, fall back to just clearing our lowest set bit.
  int up = (rank ^ bit) | (bit << 1);
  if (up >= nranks) up = (rank ^ bit);
  // if smaller than the parent, we are his first child, otherwise we're his second
  *parentChildType = (rank < up) ? 0 : 1;
  *u = up;

  int lowbit = bit >> 1;
  // lowbit == 0 means rank is odd, i.e. a leaf.
  // down0 is always within bounds
  int down0 = lowbit == 0 ? -1 : rank-lowbit;
  int down1 = lowbit == 0 ? -1 : rank+lowbit;
  // Make sure down1 is within bounds: keep halving the distance until the
  // child fits, or report no second child once the distance reaches 0.
  while (down1 >= nranks) {
    lowbit >>= 1;
    down1 = lowbit == 0 ? -1 : rank+lowbit;
  }
  *d0 = down0; *d1 = down1;

  return ncclSuccess;
}