作者|KIDGINBROOK
更新|潘丽晨
上节介绍了所有节点建立bootstrap网络连接的过程,接下来介绍拓扑分析。
由于GPU机器架构是多种多样的,一台机器上可能有多个网卡,多个GPU卡,卡间连接也各不相同,因此需要对机器内设备连接拓扑进行分析,以使性能在各种拓扑结构下都尽可能好。
接着上回继续看initTransportsRank。
// Excerpt of initTransportsRank: hashes the unique id, calls bootstrapInit,
// then allocates the buffer used by the first AllGather and fills in the
// entry that belongs to the calling rank.
// Any failing call wrapped in NCCLCHECK is reported through that macro.
static ncclResult_t initTransportsRank(struct ncclComm* comm, ncclUniqueId* commId) {
// We use 3 AllGathers
// 1. { peerInfo, comm }
// 2. ConnectTransport[nranks], ConnectValue[nranks]
// 3. { nThreads, nrings, compCap, prev[MAXCHANNELS], next[MAXCHANNELS] }
int rank = comm->rank;
int nranks = comm->nRanks;
// Hash over the NCCL_UNIQUE_ID_BYTES bytes of the unique id; it is handed to
// fillInfo below, which adds it to the host hash and the pid hash.
uint64_t commHash = getHash(commId->internal, NCCL_UNIQUE_ID_BYTES);
TRACE(NCCL_INIT, "comm %p, commHash %lx, rank %d nranks %d - BEGIN", comm, commHash, rank, nranks);
NCCLCHECK(bootstrapInit(commId, rank, nranks, &comm->bootstrap));
// AllGather1 - begin
// Element type of the first AllGather: the peer description plus the
// ncclComm pointer of the rank that produced it.
struct {
struct ncclPeerInfo peerInfo;
struct ncclComm* comm;
} *allGather1Data;
// Request nranks entries; only the entry at index `rank` is written in the
// part of the function shown here.
NCCLCHECK(ncclCalloc(&allGather1Data, nranks));
allGather1Data[rank].comm = comm;
struct ncclPeerInfo* myInfo = &allGather1Data[rank].peerInfo;
NCCLCHECK(fillInfo(comm, myInfo, commHash));
// (the remainder of the function is elided in this excerpt)
...
}
创建nranks个allGather1Data,然后通过fillInfo填充当前rank的peerInfo。ncclPeerInfo是rank的一些基本信息,比如rank号,位于哪台机器的哪个进程等。
// Basic description of one rank. fillInfo() populates an instance for the
// calling rank; the per-field notes below describe what fillInfo() stores.
struct ncclPeerInfo {
// Copied from comm->rank.
int rank;
// Device reported by cudaGetDevice() at the time fillInfo() runs.
int cudaDev;
// Output of ncclGpuGdrSupport(): 1 when a GPU buffer could be registered
// with a network device that advertises NCCL_PTR_CUDA, 0 otherwise.
int gdrSupport;
// getHostHash() + commHash.
uint64_t hostHash;
// getPidHash() + commHash.
uint64_t pidHash;
// st_dev of /dev/shm as returned by stat().
dev_t shmDev;
// Copied from comm->busId.
int64_t busId;
};
/*
 * Describe the calling rank in `info`.
 *
 * comm     - communicator supplying the rank number and the bus id.
 * info     - destination record; every field is written on success.
 * commHash - value added to both the host hash and the pid hash.
 *
 * Returns ncclSuccess when every step succeeded; failures of the CUDA,
 * system and NCCL calls are reported through CUDACHECK, SYSCHECK and
 * NCCLCHECK respectively.
 */
static ncclResult_t fillInfo(struct ncclComm* comm, struct ncclPeerInfo* info, uint64_t commHash) {
  info->rank = comm->rank;
  CUDACHECK(cudaGetDevice(&info->cudaDev));

  /* Both identity hashes are offset by the communicator hash. */
  info->hostHash = getHostHash() + commHash;
  info->pidHash = getPidHash() + commHash;

  /*
   * Record the MAJOR:MINOR device number backing /dev/shm. It is used to
   * decide whether SHM can serve inter-process communication when running
   * inside a container.
   */
  struct stat shmStat;
  SYSCHECK(stat("/dev/shm", &shmStat), "stat");
  info->shmDev = shmStat.st_dev;

  info->busId = comm->busId;

  /* Last step: probe whether GPU memory can be registered with the NIC. */
  NCCLCHECK(ncclGpuGdrSupport(&info->gdrSupport));

  return ncclSuccess;
}
fillInfo获取当前卡的rank,PCIe busId,/dev/shm的设备号,填充到ncclPeerInfo,然后通过ncclGpuGdrSupport查看是否支持gdr。rdma在通信前需要注册一段内存,使得网卡知道虚拟地址和物理地址的映射;但是如果每次通信都需要将data从显存拷贝到内存再通信的话,效率就比较低。
而IB提供了peer memory的接口,使得ib网卡可以访问其他PCIe空间;nv基于peer memory实现了自己的驱动,使得rdma可以直接注册显存。这样通信就可以避免host和device之间的内存拷贝,IB可以直接dma显存,即gdr(GPUDirect RDMA)。
// Probe whether GPU memory can be registered with a network device.
//
// gdrSupport - out: set to 1 when a GPU_BUF_SIZE buffer allocated with
//              cudaMalloc could be registered (NCCL_PTR_CUDA) through both a
//              send comm and a recv comm of the probed device; 0 otherwise.
//
// Only the first device whose properties advertise NCCL_PTR_CUDA is probed;
// the loop breaks after it whether or not registration succeeded.
//
// NOTE(review): NCCLCHECK/CUDACHECK are not defined in this excerpt. If they
// return early on failure, the comms and the GPU buffer acquired before the
// failing call are not released, and a failure on the recv-side registration
// would leave ncclDebugNoWarn set to NCCL_NET - confirm against the macros.
static ncclResult_t ncclGpuGdrSupport(int* gdrSupport) {
int netDevs;
NCCLCHECK(ncclNetDevices(&netDevs));
// Default answer: not supported.
*gdrSupport = 0;
for (int dev=0; dev<netDevs; dev++) {
// Find a net device which is GDR-capable
ncclNetProperties_t props;
NCCLCHECK(ncclNet->getProperties(dev, &props));
if ((props.ptrSupport & NCCL_PTR_CUDA) == 0) continue;
// Allocate memory on the GPU and try to register it on the NIC.
void *lComm = NULL, *sComm = NULL, *rComm = NULL;
ncclNetHandle_t handle;
void* gpuPtr = NULL;
void* mHandle = NULL;
// Build a listen/send/recv trio on this one device: the send side connects
// to the handle produced by the listen call, the recv side is accepted from
// the listen comm.
NCCLCHECK(ncclNetListen(dev, &handle, &lComm));
NCCLCHECK(ncclNetConnect(dev, &handle, &sComm));
NCCLCHECK(ncclNetAccept(lComm, &rComm));
CUDACHECK(cudaMalloc(&gpuPtr, GPU_BUF_SIZE));
// ncclDebugNoWarn is set for the duration of the registration attempt and
// cleared right after it.
// NOTE(review): presumably this silences NET warnings because a failed
// registration is an expected outcome of the probe - confirm.
ncclDebugNoWarn = NCCL_NET;
// A failure of this first registration is tolerated (gdrSupport stays 0).
if (ncclNetRegMr(sComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle) == ncclSuccess) {
NCCLCHECK(ncclNetDeregMr(sComm, mHandle));
// Repeat through the recv comm; here a failure goes through NCCLCHECK
// instead of being tolerated.
NCCLCHECK(ncclNetRegMr(rComm, gpuPtr, GPU_BUF_SIZE, NCCL_PTR_CUDA, &mHandle));
NCCLCHECK(ncclNetDeregMr(rComm, mHandle));
*gdrSupport = 1;
}
ncclDebugNoWarn = 0;
// Release the probe resources: GPU buffer first, then recv, send and listen comms.
CUDACHECK(cudaFree(gpuPtr));
NCCLCHECK(ncclNetCloseRecv(rComm));
NCCLCHECK(ncclNetCloseSend(sComm));
NCCLCHECK(ncclNetCloseListen(lComm));
// One probed device is enough; remaining devices are not examined.
break;
}
return ncclSuccess;
}
这里会依次遍历网卡并获取网卡的信息,找到第一个支持NCCL_PTR_CUDA的网卡后,尝试在该网卡上注册一段显存来确定是否支持gdr,然后退出循环。由第一节可以知道这里的ncclNet就是ncclNetIb。
/*
 * Tell whether the peer-memory module's sysfs version file is present.
 *
 * The filesystem is consulted on the first call only; the outcome is kept
 * in a function-local static and reused afterwards. The ibDev argument is
 * not consulted.
 *
 * Returns ncclSuccess when the file exists, ncclSystemError otherwise.
 */
ncclResult_t ncclIbGdrSupport(int ibDev) {
  /* -1: not probed yet, 0: file absent, 1: file present */
  static int probeState = -1;

  if (probeState == -1) {
    if (access("/sys/kernel/mm/memory_peers/nv_mem/version", F_OK) == -1) {
      probeState = 0;
    } else {
      probeState = 1;
    }
  }

  return (probeState == 0) ? ncclSystemError : ncclSuccess;
}
/*
 * Fill `props` with the properties of IB device number `dev`.
 *
 * Name, PCI path, GUID, speed, port and maximum comm count are taken from
 * the ncclIbDevs table. Host pointers are always advertised; CUDA pointers
 * are advertised as well when ncclIbGdrSupport() reports success.
 *
 * Always returns ncclSuccess.
 */
ncclResult_t ncclIbGetProperties(int dev, ncclNetProperties_t* props) {
  /* Identification of the device. */
  props->name    = ncclIbDevs[dev].devName;
  props->pciPath = ncclIbDevs[dev].pciPath;
  props->guid    = ncclIbDevs[dev].guid;

  /* Pointer kinds the device can register. */
  props->ptrSupport = NCCL_PTR_HOST;
  if (ncclIbGdrSupport(dev) == ncclSuccess) {
    props->ptrSupport |= NCCL_PTR_CUDA;
  } else {
    INFO(NCCL_NET,"NET/IB : GPU Direct RDMA Disabled for HCA %d '%s' (no module)", dev, ncclIbDevs[dev].devName);
  }

  /* Link characteristics. */
  props->speed    = ncclIbDevs[dev].speed;
  props->port     = ncclIbDevs[dev].port + ncclIbDevs[dev].realPort;
  props->maxComms = ncclIbDevs[dev].maxQp;

  return ncclSuccess;
}
这里主要是获取网卡名,PCIe路径,guid等信息,然后通过是否存在/sys/kernel/mm/memory_peers/nv_mem/version来判断是否安装了nv_peermem(即nv的驱动);如果安装了,则设置props->ptrSupport |= NCCL_PTR_CUDA,表示可以注册显存。