#!/bin/bash
# Download, build, and install OpenMPI from source (installs to /usr/local by default)
MPI_VERSION="3.0.4"
wget -O /tmp/openmpi-${MPI_VERSION}.tar.gz https://www.open-mpi.org/software/ompi/v3.0/downloads/openmpi-${MPI_VERSION}.tar.gz
tar xzf /tmp/openmpi-${MPI_VERSION}.tar.gz -C /tmp
cd /tmp/openmpi-${MPI_VERSION}
./configure --enable-orterun-prefix-by-default
make -j $(nproc) all && sudo make install
sudo ldconfig
mpirun --version
With that, the environment is set up.
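Before moving on, it is worth a quick check that the CUDA toolkit is visible alongside the freshly installed OpenMPI, since the example below mixes MPI, OpenMP, and CUDA; the path here assumes a default CUDA install under /usr/local/cuda:

/usr/local/cuda/bin/nvcc --version   # CUDA compiler used by the Makefile below
nvidia-smi                           # driver version and visible GPUs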
#include <cstdio>
#include <omp.h>
#include <mpi.h>
#include <helper_timer.h>
using namespace std;
__global__ void vecAdd_kernel(float *c, const float* a, const float* b);
void init_buffer(float *data, const int size);
class Operator
{
private:
    int _index;
    cudaStream_t stream;
    StopWatchInterface *p_timer;

    static void CUDART_CB Callback(cudaStream_t stream, cudaError_t status, void* userData);
    void print_time();

public:
    Operator() {
        cudaStreamCreate(&stream);
        sdkCreateTimer(&p_timer);
    }
    ~Operator() {
        cudaStreamDestroy(stream);
        sdkDeleteTimer(&p_timer);
    }

    void set_index(int idx) { _index = idx; }
    void async_operation(float *h_c, const float *h_a, const float *h_b,
                         float *d_c, float *d_a, float *d_b,
                         const int size, const int bufsize);
}; // Operator
void CUDART_CB Operator::Callback(cudaStream_t stream, cudaError_t status, void* userData) {
    Operator* this_ = (Operator*) userData;
    this_->print_time();
}
void Operator::print_time() {
    sdkStopTimer(&p_timer); // end timer
    float elapsed_time_msed = sdkGetTimerValue(&p_timer);
    printf("stream %2d - elapsed %.3f ms \n", _index, elapsed_time_msed);
}
void Operator::async_operation(float *h_c, const float *h_a, const float *h_b,
                               float *d_c, float *d_a, float *d_b,
                               const int size, const int bufsize)
{
    // start timer
    sdkStartTimer(&p_timer);

    // copy host -> device
    cudaMemcpyAsync(d_a, h_a, bufsize, cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(d_b, h_b, bufsize, cudaMemcpyHostToDevice, stream);

    // launch cuda kernel
    dim3 dimBlock(256);
    dim3 dimGrid(size / dimBlock.x);
    vecAdd_kernel<<< dimGrid, dimBlock, 0, stream >>>(d_c, d_a, d_b);

    // copy device -> host
    cudaMemcpyAsync(h_c, d_c, bufsize, cudaMemcpyDeviceToHost, stream);

    // register callback function
    cudaStreamAddCallback(stream, Operator::Callback, this, 0);
}
int main(int argc, char *argv[])
{
    float *h_a, *h_b, *h_c;
    float *d_a, *d_b, *d_c;
    int size = 1 << 24;
    int bufsize = size * sizeof(float);
    int num_operator = 4;

    if (argc != 1)
        num_operator = atoi(argv[1]);

    // initialize timer
    StopWatchInterface *timer;
    sdkCreateTimer(&timer);

    // initialize MPI and query the number of ranks
    int np, rank;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &np);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (rank == 0)
        printf("Number of processes: %d\n", np);

    // each rank works on its own slice of the problem
    bufsize /= np;
    size /= np;

    // allocate pinned host memory
    cudaMallocHost((void**)&h_a, bufsize);
    cudaMallocHost((void**)&h_b, bufsize);
    cudaMallocHost((void**)&h_c, bufsize);

    // initialize host values
    srand(2019);
    init_buffer(h_a, size);
    init_buffer(h_b, size);
    init_buffer(h_c, size);

    // allocate device memories
    cudaMalloc((void**)&d_a, bufsize);
    cudaMalloc((void**)&d_b, bufsize);
    cudaMalloc((void**)&d_c, bufsize);

    printf("Number of operations: %d\n", num_operator);

    // create list of operation elements
    Operator *ls_operator = new Operator[num_operator];

    sdkStartTimer(&timer);

    // execute each operator on its corresponding slice of data
    omp_set_num_threads(num_operator);
    #pragma omp parallel
    {
        int i = omp_get_thread_num();
        int offset = i * size / num_operator;
        printf("Launched GPU task (%d, %d)\n", rank, i);

        ls_operator[i].set_index(i);
        ls_operator[i].async_operation(&h_c[offset], &h_a[offset], &h_b[offset],
                                       &d_c[offset], &d_a[offset], &d_b[offset],
                                       size / num_operator, bufsize / num_operator);
    }
    cudaDeviceSynchronize();

    sdkStopTimer(&timer);

    // print out a sample of the result
    int print_idx = 256;
    printf("compared a sample result...\n");
    printf("host: %.6f, device: %.6f\n", h_a[print_idx] + h_b[print_idx], h_c[print_idx]);

    // compute and print the performance
    float elapsed_time_msed = sdkGetTimerValue(&timer);
    float bandwidth = 3.f * bufsize / elapsed_time_msed / 1e6; // bufsize is already in bytes
    printf("Time= %.3f msec, bandwidth= %f GB/s\n", elapsed_time_msed, bandwidth);

    sdkDeleteTimer(&timer);

    // terminate operators
    delete [] ls_operator;

    // free device memory
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);

    // free pinned host memory
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);

    MPI_Finalize();
    return 0;
}
void init_buffer(float *data, const int size)
{
    for (int i = 0; i < size; i++)
        data[i] = rand() / (float)RAND_MAX;
}

__global__ void vecAdd_kernel(float *c, const float* a, const float* b)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // repeat the addition many times so each kernel has a measurable runtime
    for (int i = 0; i < 50000; i++)
        c[idx] = a[idx] + b[idx];
}
The code above is the complete implementation; save it as a .cu file (for example simpleMPI.cu, the name the Makefile below expects).
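Before turning to the Makefile, a one-off build can also be done by hand with a command along these lines; it assumes OpenMPI was installed into the default /usr/local prefix as above, that helper_timer.h is available under the CUDA samples' common/inc directory, and the -gencode value should be adjusted to your GPU:

nvcc -ccbin g++ -m64 -Xcompiler -fopenmp \
    -I/usr/local/cuda/samples/common/inc -I/usr/local/include \
    -gencode arch=compute_70,code=sm_70 \
    -o simpleMPI simpleMPI.cu -lmpi -lgomp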
CUDA_PATH=/usr/local/cuda
HOST_COMPILER ?= g++
NVCC=${CUDA_PATH}/bin/nvcc -ccbin ${HOST_COMPILER}
MPICC ?= mpicc
TARGET=simpleMPI
INCLUDES = -I${CUDA_PATH}/samples/common/inc -I/usr/local/include/
NVCC_FLAGS=-m64 -Xcompiler -fopenmp -rdc=true -lcudadevrt -lmpi # --resource-usage
IS_CUDA_11:=${shell expr `$(NVCC) --version | grep compilation | grep -Eo -m 1 '[0-9]+.[0-9]' | head -1` \>= 11.0}
# Gencode arguments
SMS = 35 37 50 52 60 61 70 75
ifeq "$(IS_CUDA_11)" "1"
SMS = 52 60 61 70 75 80 86
endif
$(foreach sm, ${SMS}, $(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
LIBRARIES += -lgomp
ALL_CCFLAGS += ${NVCC_FLAGS}
all : ${TARGET}

simpleMPI: simpleMPI.cu
	$(EXEC) $(NVCC) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES) $(INCLUDES)

enable_mps:
	export CUDA_VISIBLE_DEVICES=0
	sudo nvidia-smi -c 3 -i 0
	sudo nvidia-cuda-mps-control -d

disable_mps:
	echo "quit" | sudo nvidia-cuda-mps-control
	sudo nvidia-smi -c 0 -i 0

nvprof: simpleMPI
	mpirun -np ${PROCS} nvprof -f -o $+.%q{OMPI_COMM_WORLD_RANK}_${STREAMS}.nvvp ./$+ ${STREAMS}

clean:
	rm -f ${TARGET} *.o *.nvvp
Save the above as the Makefile (recipe lines must be indented with tabs). You can then build and run directly:
make
./simpleMPI 4
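To see the MPS targets from the Makefile in action, a typical session looks roughly like this; the process and stream counts are example values, and enable_mps needs sudo since it switches GPU 0 into EXCLUSIVE_PROCESS mode:

make enable_mps                  # set exclusive compute mode and start the MPS daemon on GPU 0
mpirun -np 2 ./simpleMPI 4       # 2 MPI ranks, each launching 4 streams (operators)
make PROCS=2 STREAMS=4 nvprof    # optional: record one nvprof timeline per rank
make disable_mps                 # stop MPS and restore the default compute mode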