Multi-machine, multi-GPU test: multiple nodes with one GPU each (squaring numbers with CUDA + MPI)

Directory structure
Makefile, simpleCu.cu, simpleMPI.c, simpleMPI.h
Makefile

MPICC=/usr/local/mpich/bin/mpicxx
NVCC=/usr/local/cuda-10.2/bin/nvcc

MPI_INCLUDE= -I /usr/local/mpich/include
MPI_LIBS= -L /usr/local/mpich/lib -lmpich

CUDA_INCLUDE= -I /usr/local/cuda-10.2/include
CUDA_LIBS= -L /usr/local/cuda-10.2/lib64 -lcudart 

CFILES=simpleMPI.c
CUFILES=simpleCu.cu
OBJECTS=simpleMPI.o simpleCu.o

all:
# compile the MPI host code with the MPI compiler wrapper
	$(MPICC) -c $(CFILES) -o simpleMPI.o
# compile the CUDA code with nvcc
	$(NVCC) -c $(CUFILES) -o simpleCu.o
# link with the MPI wrapper; the CUDA runtime library goes after the objects that need it
	$(MPICC) $(OBJECTS) $(CUDA_LIBS) -o simpleMPI
# alternatively, link with nvcc and add the MPI libraries instead:
#	$(NVCC) $(OBJECTS) $(MPI_LIBS) -o simpleMPI
run:
# two processes on the local machine; a real multi-node run needs a host list (e.g. mpirun -f hostfile -n 2 ./simpleMPI)
	mpirun -n 2 ./simpleMPI
clean:
	rm -f simpleMPI *.o

simpleCu.cu

#include <stdio.h>
#include <stdlib.h>
#include "simpleMPI.h"

// Each thread squares one element of its rank's chunk.
__global__ void simpleMPIKernel(float *input, float *output)
{
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    output[tid] = powf(input[tid], 2.0f);   // square in single precision
}


// Fill a host buffer with random integer values in [1, 10].
void initData(float *data, int dataSize)
{
    for (int i = 0; i < dataSize; i++)
    {
        data[i] = (float)(rand()%10+1);
    }
}


// Print a labelled dump of the whole array (used on the root rank).
void printTotalData(const char *name, float *data, int dataSize)
{  
    printf("%s\n",name);
    for (int i = 0; i < dataSize; i++)
    {
        printf("%10f",data[i]);
    }
	printf("\n");
}

// Print the chunk held by one rank.
void printNodeData(int commRank, float *data, int dataSize)
{
    printf("Rank %d:\n", commRank);
    for (int i = 0; i < dataSize; i++)
    {
        printf("%10f",data[i]);
    }
	printf("\n");
}

// Square one rank's chunk on the GPU: copy the data to the device, run the
// kernel, and copy the result back into the same host buffer.
void computeGPU(float *hostData, int blockSize, int gridSize)
{
    int dataSize = blockSize * gridSize;

    float *deviceInputData = NULL;
    cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float));

    float *deviceOutputData = NULL;
    cudaMalloc((void **)&deviceOutputData, dataSize * sizeof(float));

    cudaMemcpy(deviceInputData, hostData, dataSize * sizeof(float), cudaMemcpyHostToDevice);

    simpleMPIKernel<<<gridSize, blockSize>>>(deviceInputData, deviceOutputData);

    cudaMemcpy(hostData, deviceOutputData, dataSize *sizeof(float), cudaMemcpyDeviceToHost);

    cudaFree(deviceInputData);
    cudaFree(deviceOutputData);
}
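The CUDA runtime calls in computeGPU ignore their return values, so a failed allocation or copy would go unnoticed. Below is a minimal error-checking sketch; the CUDA_CHECK macro is not part of the original sample, just one common way to wrap the runtime API:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

// Hypothetical helper, not in the original source: abort with a message
// when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                  \
    do {                                                                  \
        cudaError_t err_ = (call);                                        \
        if (err_ != cudaSuccess)                                          \
        {                                                                 \
            fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                    cudaGetErrorString(err_), __FILE__, __LINE__);        \
            exit(EXIT_FAILURE);                                           \
        }                                                                 \
    } while (0)

// Example use inside computeGPU:
//   CUDA_CHECK(cudaMalloc((void **)&deviceInputData, dataSize * sizeof(float)));
//   CUDA_CHECK(cudaMemcpy(deviceInputData, hostData,
//                         dataSize * sizeof(float), cudaMemcpyHostToDevice));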

simpleMPI.c

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#include "simpleMPI.h"

int main(int argc, char *argv[])
{
	// each rank handles gridSize * blockSize = 5 elements
	int blockSize = 5;
	int gridSize = 1;
	int dataSizePerNode = gridSize * blockSize;

	MPI_Init(&argc, &argv);

	int commSize, commRank;
	MPI_Comm_size(MPI_COMM_WORLD, &commSize);
	MPI_Comm_rank(MPI_COMM_WORLD, &commRank);
	
	int dataSizeTotal = dataSizePerNode * commSize;
	float *dataRoot = NULL;

	// only the root rank allocates and fills the full input array
	if (commRank == 0)
	{
		printf("Running on %d MPI processes\n", commSize);
		dataRoot =(float*)malloc(dataSizeTotal*sizeof(float));
		initData(dataRoot, dataSizeTotal);
		printTotalData("Initial:",dataRoot,dataSizeTotal);	
	}

	// scatter one chunk to each rank, square it on that rank's GPU,
	// then gather the results back on the root rank
	float *dataNode = (float*)malloc(dataSizePerNode * sizeof(float));
	MPI_Scatter(dataRoot, dataSizePerNode, MPI_FLOAT, dataNode, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD);
	computeGPU(dataNode, blockSize, gridSize);
	printNodeData(commRank, dataNode, dataSizePerNode);
	MPI_Gather(dataNode, dataSizePerNode, MPI_FLOAT, dataRoot, dataSizePerNode, MPI_FLOAT, 0, MPI_COMM_WORLD);

	if (commRank == 0)
	{
	printTotalData("Result:",dataRoot,dataSizeTotal);	
	free(dataRoot);
	}

	free(dataNode);

	if (commRank == 0)
	{
		printf("PASSED\n");
	}

	MPI_Finalize();

	return 0;

}
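main never selects a GPU, so every rank runs on device 0, which is exactly what the one-GPU-per-node setup in the title needs. If several ranks ended up on the same multi-GPU node, one common pattern is to map the rank onto a device. The sketch below is an assumption, not part of the original sample; a round-robin mapping of the global rank only works cleanly when ranks are placed on nodes in order, and a robust version would derive a node-local rank (for example via MPI_Comm_split_type with MPI_COMM_TYPE_SHARED):

#include <cuda_runtime.h>

// Sketch only: pick a GPU for this rank so that ranks sharing a node do not
// all use device 0. Call once after MPI_Comm_rank, before computeGPU.
static void selectDeviceForRank(int commRank)
{
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    if (deviceCount > 0)
    {
        cudaSetDevice(commRank % deviceCount);  // round-robin assignment
    }
}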

simpleMPI.h

// Declarations shared between the MPI host code (simpleMPI.c) and the CUDA
// code (simpleCu.cu).
#ifndef _SIMPLEMPI_H
#define _SIMPLEMPI_H

void initData(float *data, int dataSize);
void printTotalData(const char *name,float *data, int dataSize);
void printNodeData(int commRank,float *data, int dataSize);
void computeGPU(float *hostData, int blockSize, int gridSize);

#endif
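This header links cleanly here because mpicxx compiles simpleMPI.c as C++, which matches the C++ linkage of the nvcc-built simpleCu.o. If the host file were instead compiled with a plain C compiler (for example mpicc), the shared declarations would need C linkage. The guarded header below is a sketch of that alternative setup, not something the Makefile above requires:

#ifndef _SIMPLEMPI_H
#define _SIMPLEMPI_H

#ifdef __cplusplus
extern "C" {    /* give these functions C linkage when seen by a C++ compiler */
#endif

void initData(float *data, int dataSize);
void printTotalData(const char *name, float *data, int dataSize);
void printNodeData(int commRank, float *data, int dataSize);
void computeGPU(float *hostData, int blockSize, int gridSize);

#ifdef __cplusplus
}
#endif

#endif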

Run results:
(screenshot of the program output: the initial values, each rank's squared chunk, the gathered result, and PASSED)
