MPI and NCCL: verifying the results of a multi-process, multi-GPU, single-node example

This article shows how to write a C++ program that uses MPI for inter-process communication and CUDA together with NCCL to run an AllReduce operation across multiple GPU devices, synchronizing data efficiently. The program first hashes the hostname to pick a local GPU for each process, then initializes GPU memory in every process, performs the AllReduce through NCCL, and synchronizes the results.

Some comments have been added to the code; they are a bit rough.

Structure of this example:

1. Source code

#include <stdio.h>
#include <iostream>
#include "cuda_runtime.h"
#include "nccl.h"
#include "mpi.h"
#include <unistd.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/time.h>

#define SOCKET_SIZE 1

#if SOCKET_SIZE
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/tcp.h>
#include <netdb.h>
#include <fcntl.h>
#include <poll.h>
#endif

using namespace std;

#define MPI_CHECK(cmd) do {                          \
  int e = cmd;                                      \
  if( e != MPI_SUCCESS ) {                          \
    printf("Failed: MPI error %s:%d '%d'\n",        \
        __FILE__,__LINE__, e);   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)


#define CUDA_CHECK(cmd) do {                         \
  cudaError_t e = cmd;                              \
  if( e != cudaSuccess ) {                          \
    printf("Failed: Cuda error %s:%d '%s'\n",             \
        __FILE__,__LINE__,cudaGetErrorString(e));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)


#define NCCL_CHECK(cmd) do {                         \
  ncclResult_t r = cmd;                             \
  if (r!= ncclSuccess) {                            \
    printf("Failed, NCCL error %s:%d '%s'\n",             \
        __FILE__,__LINE__,ncclGetErrorString(r));   \
    exit(EXIT_FAILURE);                             \
  }                                                 \
} while(0)

/* DJB2a is a simple hash algorithm designed by the computer scientist
 * Daniel J. Bernstein. It is widely used in hash tables and similar data
 * structures. The algorithm walks over every character of the input string
 * and combines it with a constant (usually 33) to compute the hash value.
 * It is fast and has few collisions in practice, but it is not suitable
 * for cryptographic purposes.
**/
static uint64_t getHostHash(const char* string) {
  // Based on DJB2a, result = result * 33 ^ char
  uint64_t result = 5381;
  for (int c = 0; string[c] != '\0'; c++){
    result = ((result << 5) + result) ^ string[c];
  }
  return result;
}

static void getHostName(char* hostname, int maxlen) {
  gethostname(hostname, maxlen);// declared in /usr/include/unistd.h

  for (int i=0; i< maxlen; i++) {
    if (hostname[i] == '.') {
        hostname[i] = '\0';
        return;
    }
  }
}

float max__(float x, float y)
{
  return x>y? x:y;
}

float sum__(float x, float y)
{
  return x + y;
}

void print_vector(float* A, int n)
{
  for(int i=0; i<n; i++)
    printf("%.2f ", A[i]);
}

void init_dev_vectors(float* A_d, float* B_d, int n, int rank, long long seed)
{
  float * A = (float*)malloc(n*sizeof(float));
  float * B = (float*)malloc(n*sizeof(float));
  //float * M = (float*)malloc(n*sizeof(float));//max[i] = max(A[i], B[i]);
  //float * S = (float*)malloc(n*sizeof(float));//sum[i] = sum(A[i], B[i]);
  srand(seed);

  for(int i=0; i<n; i++)
  {
    A[i] = (rand()%100)/100.0f;
    B[i] = (rand()%100)/100.0f;
    //M[i] = max__(A[i], B[i]);
    //S[i] = sum__(A[i], B[i]);
  }

  printf("\nrank = %d, sendbuff =\n", rank);  print_vector(A, n);
//  printf("\nrank = %d, Sum =\n", rank);  print_vector(S, n);

  CUDA_CHECK(cudaMemcpy(A_d, A, n*sizeof(float), cudaMemcpyHostToDevice));
  CUDA_CHECK(cudaMemcpy(B_d, B, n*sizeof(float), cudaMemcpyHostToDevice));

  free(A);
  free(B);
}

void fetch_dev_vector(float* A_d, int n, int rank)
{
  float* A = (float*)malloc(n*sizeof(float));
  CUDA_CHECK(cudaMemcpy(A, A_d, n*sizeof(float), cudaMemcpyDeviceToHost));
  printf("rank = %d, recvbuff =\n", rank);
  print_vector(A, n);
  free(A);
}

void  get_seed(long long &seed)
{
  struct timeval tv;
  gettimeofday(&tv, NULL);
  seed = (long long)tv.tv_sec * 1000*1000 + tv.tv_usec;// seconds and microseconds combined into a microsecond timestamp
  printf("microseconds: %lld\n", seed);
}


int main(int argc, char* argv[])
{
  int size = 16*16;//32*1024*1024;
  int myRank, nRanks, localRank = 0;

  //initializing MPI
  printf("argc = %d\n", argc);
  MPI_CHECK(MPI_Init(&argc, &argv));// from here on the program runs as multiple processes; the count is the 4 in "mpirun -np 4 ./a.out"
  MPI_CHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));// rank of this process: myRank = 0, 1, ..., nRanks-1
  MPI_CHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));// total number of MPI processes launched, e.g. nRanks == 4
  cout<< "nRanks="<< nRanks<<endl;

  //calculating localRank based on hostname which is used in selecting a GPU
  uint64_t hostHashs[nRanks];// the hash of each rank's hostname, stored in hostHashs[myRank]
  cout<<"nRanks = "<<nRanks<<endl;
  char hostname[1024];
  getHostName(hostname, 1024);

  cout<<"Host Name is "<<hostname<<endl;
  hostHashs[myRank] = getHostHash(hostname);
  printf("myRank = %d, hostHash = %lx\n", myRank, hostHashs[myRank]);

  MPI_CHECK(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, hostHashs, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD));
  //if(myRank==0)
  if(1)
  {
    for(int i=0; i<nRanks; i++)
      printf("myRank = %d, hostHash[%d] = %lx\n", myRank, i, hostHashs[i]);
  }

  for (int p=0; p<nRanks; p++) {
     if (p == myRank) break;
     if (hostHashs[p] == hostHashs[myRank]) {
      printf("p=%d\n", p);
      localRank++;// counts how many lower ranks share this host, i.e. which local GPU this process should use
     }
  }

  printf("myRank = %d, localRank-- = %d\n", myRank, localRank);
/* TCP / RDMA (Remote Direct Memory Access) / GDR (GPU Direct RDMA):
 * GDR is a technique that allows high-performance direct data transfers
 * between GPU memories over an RDMA-capable network. Data can move between
 * GPUs across the network without first being staged in host memory, which
 * reduces transfer latency and CPU involvement and therefore improves
 * transfer efficiency.
**/
  ncclUniqueId id;
  ncclComm_t comm;
  float *sendbuff, *recvbuff;
  cudaStream_t s;

  //get NCCL unique ID at rank 0 and broadcast it to all others
  if (myRank == 0)
  {
    cout<<"start:  id is"<<endl;
    for(int i=0; i<128; i++)
    {
      if(id.internal[i]=='\0')break;
        printf("%d",id.internal[i]);
    }
    cout<<"start end"<<endl;
    ncclGetUniqueId(&id);// ncclGetUniqueId obtains an Internet socket address, i.e. this machine's IP and port; rank 0 acts as the server
    cout<<"  end:  id is "<<endl;
    for(int i=0; i<128; i++)
    {
      if(id.internal[i]=='\0') break;
      printf("%d",id.internal[i]);
    }
    cout<<"end end"<<endl;
#if SOCKET_SIZE
    cout<<"sizeof(sockaddr_in6) = "<<sizeof(sockaddr_in6)<<endl;
#endif
  }

  MPI_CHECK(MPI_Bcast((void *)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD));// broadcast rank 0's (the root's) socket address to all other processes
  //printf("LL:: MPI_Bcast()\n");fflush(stdout);

  //picking a GPU based on localRank, allocate device buffers
  CUDA_CHECK(cudaSetDevice(localRank));// each process selects its own GPU and allocates two device buffers on it: sendbuff and recvbuff
  CUDA_CHECK(cudaMalloc(&sendbuff, size * sizeof(float)));
  CUDA_CHECK(cudaMalloc(&recvbuff, size * sizeof(float)));
  CUDA_CHECK(cudaStreamCreate(&s));// create this process's own CUDA stream

  long long  seed = 0;
  get_seed(seed);
  init_dev_vectors(sendbuff, recvbuff, size, myRank, seed);

  //initializing NCCL
  NCCL_CHECK(ncclCommInitRank(&comm, nRanks, id, myRank));// create a new communicator (multi-thread / multi-process version)
  /**********************************************************************************************************
   * ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank)
   * Creates a new communicator (multi thread/process version).
   * rank must be between 0 and nranks-1 and unique within a communicator clique.
   * Each rank is associated to a CUDA device, which has to be set before calling ncclCommInitRank.
   * ncclCommInitRank implicitly synchronizes with other ranks,
   * hence it must be called by different threads/processes or use ncclGroupStart/ncclGroupEnd.
   **********************************************************************************************************/

  //communicating using NCCL
  NCCL_CHECK(ncclAllReduce((const void*)sendbuff, (void*)recvbuff, size, ncclFloat, /*ncclMax*/ ncclSum, comm, s));

  //completing NCCL operation by synchronizing on the CUDA stream
  CUDA_CHECK(cudaStreamSynchronize(s));
  if(myRank == 1)
    fetch_dev_vector(recvbuff, size, myRank);
  //free device buffers
  CUDA_CHECK(cudaFree(sendbuff));
  CUDA_CHECK(cudaFree(recvbuff));

  //finalizing NCCL
  ncclCommDestroy(comm);

  //finalizing MPI
  MPI_CHECK(MPI_Finalize());

  printf("[MPI Rank %d] Success \n", myRank);
  return 0;
}


2. Build

2.1 Makefile


LD_FLAGS := -lnccl -L/usr/local/cuda/lib64 -lcudart -I/usr/local/cuda/include

MPI_FLAGS := -I /usr/lib/x86_64-linux-gnu/openmpi/include -L /usr/lib/x86_64-linux-gnu/openmpi/lib -lmpi -lmpi_cxx

EXE :=  multiProcess_multiDevice_oneServer_allreduce
#singleProcess_multiDevice_oneServer_allreduce
all: $(EXE)

singleProcess_multiDevice_oneServer_allreduce: singleProcess_multiDevice_oneServer_allreduce.cpp
	g++ -g $< -o $@ $(LD_FLAGS)
# singleProcess_multiDevice_oneServer_allreduce

multiProcess_multiDevice_oneServer_allreduce: multiProcess_multiDevice_oneServer_allreduce.cpp
	g++ -g $< -o $@ $(LD_FLAGS) $(MPI_FLAGS)
# ../../ex_openmpi/local/bin/mpirun -np 2 ./oneServer_multiDevice_multiThread

mpi_test: mpi_test.cpp
	g++ -g $< -o $@ $(LD_FLAGS) $(MPI_FLAGS)

.PHONY: clean
clean:
	-rm $(EXE)



2.2 Build

$ make

3. Run

../../ex_openmpi/local/bin/mpirun -np 2 ./multiProcess_multiDevice_oneServer_allreduce

4. Result

With ncclSum, the AllReduce has the following mathematical effect: on every rank, recvbuff satisfies

 recvbuff[i] = sendbuff_rank0[i] + sendbuff_rank1[i] + ... + sendbuff_rank(n-1)[i]
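
To check this numerically, one option is to recompute the expected sum on the host with MPI and compare it with the buffer copied back from the GPU. The sketch below is not part of the original program: verify_allreduce_sum is a hypothetical helper, assumed to be called after cudaStreamSynchronize(s), and it assumes a host copy of the data that was uploaded to sendbuff is still available (the program above frees its host buffers inside init_dev_vectors, so it would need to keep one around).

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include "cuda_runtime.h"
#include "mpi.h"

// Hypothetical helper, not in the original program: recompute the expected
// AllReduce(sum) result on the host with MPI_Allreduce and compare it with
// the NCCL result copied back from the device.
static int verify_allreduce_sum(const float* host_send, const float* recvbuff_d,
                                int n, int rank)
{
  float* expected = (float*)malloc(n * sizeof(float));
  float* actual   = (float*)malloc(n * sizeof(float));

  // host-side reference: expected[i] = sum over all ranks of host_send[i]
  MPI_Allreduce(host_send, expected, n, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

  // fetch the NCCL result from the device
  cudaMemcpy(actual, recvbuff_d, n * sizeof(float), cudaMemcpyDeviceToHost);

  int mismatches = 0;
  for (int i = 0; i < n; i++) {
    if (fabsf(expected[i] - actual[i]) > 1e-4f)  // allow for float rounding
      mismatches++;
  }
  printf("rank %d: %d of %d elements differ from the host reference\n",
         rank, mismatches, n);

  free(expected);
  free(actual);
  return mismatches == 0 ? 0 : -1;
}

Called on every rank right after the stream synchronization, each process should report 0 differing elements when the NCCL sum matches the host-side MPI reference.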

