Matrix Multiplication with CUDA + MPI (single file)

Makefile

CC = /usr/bin/gcc
NVCC = /usr/local/cuda-10.2/bin/nvcc
CFLAGS = -fopenmp -g -w -O4 -I..
MPI = -I /usr/local/mpich/include 
LIB = -L /usr/local/mpich/lib -lmpich -lopa -lmpl -lpthread  

all: 
	${NVCC}  ${MPI} ${LIB} matvec.cu -o newfloatmatvec
run:
	./newfloatmatvec 4 4 4 4 1 -v -p
clean:
	-rm newfloat*
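
The run target above starts the binary directly, i.e. with a single MPI rank. To exercise the row decomposition across several ranks, the program has to be started through the MPICH launcher. A possible extra target is sketched below; it assumes mpirun sits under the same MPICH prefix used in LIB (the exact launcher path is not given above):

runmpi:
	/usr/local/mpich/bin/mpirun -np 4 ./newfloatmatvec 4 4 4 4 1 -v -p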

matvec.cu

#include <stdio.h>
#include <stdlib.h>   //for malloc, free, rand, atoi, exit
#include <string.h>   //for strcmp
#include <cuda.h>
#include "mpi.h"

//------------------------------------------------------------------------------------------------------------------------------------------

#define BLOCKSIZE 2

//--------------------------------------------------------------------------------------------------------------------------------------------

int CheckDevice(int);

//--------------------------------------------------------------------------------------------------------------------------------------------

//Macro that checks the return value of a CUDA call and reports any error
#define CUDA_SAFE_CALL(call)                                                         \
            do{                                                                      \
                 cudaError_t err = call;                                             \
                 if(err != cudaSuccess)                                              \
                 {                                                                   \
                        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                         __FILE__, __LINE__, cudaGetErrorString( err) );             \
                         exit(1);                                                    \
                 }                                                                   \
               } while (0)

//----------------------------------------------------------------------------------------------------------------------------------------

void initial(int *matrix, int size)
{
	for (int i = 0; i < size; i++)
	{
		matrix[i] = (int)(rand() % 10 + 1);
	}
}

void printMatrix(int *matrix, int row, int col)
{
	int *p = matrix;
	for (int y = 0; y < row; y++)
	{
		for (int x = 0; x < col; x++)
		{
			printf("%5d", p[x]);
		}
		p = p + col;
		printf("\n");
	}
	
}



//Kernel that multiplies this rank's block of rows of matrix A with the full matrix B
__global__ void MatrixVectorMultiplication(int *Matrix, int *Vector, int *Solution, int A_row, int A_col, int B_row, int B_col, int VectorLength, int ScatterSize, int ThreadDim, int MyRank, int NumberofProcessors)
{
	/*
	int tidx = threadIdx.x;
	for (int i = 0; i < RowsNo / NumberofProcessors; i++) {
		for (tidx = 0; tidx < ColsNo2; tidx++) {
			int sum = 0.0;
			for (int k = 0; k < RowsNo2; k++)
				sum = sum + Matrix[i * ColsNo + k] * Vector[k * ColsNo2 + tidx];

			Solution[i * ColsNo2 + tidx] = sum;
		}

	}
	*/

	int ix = threadIdx.x + blockDim.x * blockIdx.x;	//column index of the result
	int iy = threadIdx.y + blockDim.y * blockIdx.y;	//row index within this rank's block of rows

	//Each rank only holds ScatterSize rows of A, so the row bound is ScatterSize,
	//and the column bound is the number of columns of B (= columns of the result).
	if (ix < B_col && iy < ScatterSize)
	{
		int sum = 0;
		for (int k = 0; k < B_row; k++)
		{
			sum += Matrix[iy * A_col + k] * Vector[k * B_col + ix];
		}
		Solution[iy * B_col + ix] = sum;
	}
}//End of Matrix Vector Multiplication Device Function
//---------------------------------------------------------------------------------------------------------------------------------------



int main(int argc, char **argv)
{
	int MyRank, NumberOfProcessors;
	int Root = 0, Index;
	volatile int Status = 1;
	int *MatrixA = 0, *ResultVector = 0, *MatrixB = 0;
	int *MyMatrixA, *MyResultMatrix;
	int *DeviceMyMatrixA, *DeviceMyResultVector, *DeviceMatrixB, *CPUResultVector;
	int RowsNo, ColsNo, RowsNo2, ColsNo2, ScatterSize, IndexCol, IndexValue, DeviceStatus;
	volatile int resultsize,matrixbsize, pinned;
	int print = 0;
	int verify = 0;


	//MPI Initialization
	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
	MPI_Comm_size(MPI_COMM_WORLD, &NumberOfProcessors);



	//Checking whether a valid number of arguments has been passed
	if (argc < 6)
	{
		if (MyRank == Root)
			printf("Usage: mpirun -n <number of processes> ./newfloatmatvec <rows of matrix A> <columns of matrix A> <rows of matrix B> <columns of matrix B> <1 for pinned memory, 2 for unpinned> [-v to verify the result] [-p to print the result]\n");
		MPI_Finalize();
		exit(-1);
	}
	if ((argc >= 7 && strcmp(argv[6], "-v") == 0)) {
		verify = 1;
	}

	if ((argc == 8 && strcmp(argv[7], "-p") == 0) || (argc == 7 && strcmp(argv[6], "-p") == 0)) {
		print = 1;
	}


	//Assigning values to RowsNo, ColsNo, RowsNo2, ColsNo2 and pinned from the arguments passed
	RowsNo = atoi(argv[1]);
	ColsNo = atoi(argv[2]);
	RowsNo2 = atoi(argv[3]);
	ColsNo2 = atoi(argv[4]);
	pinned = atoi(argv[5]);

	resultsize = RowsNo * ColsNo2;
	matrixbsize = RowsNo2 * ColsNo2;
	if (MyRank == 0)
		printf("Resultant Matrix Number of Elements is %d\n\n\n", resultsize);


	int elements;

	//Checking that the number of columns of matrix A equals the number of rows of matrix B
	if (ColsNo != RowsNo2)
	{
		if (MyRank == Root)
			printf("Invalid input: the number of columns of matrix A must equal the number of rows of matrix B \n");
		MPI_Finalize();
		exit(-1);
	}

	if (RowsNo < NumberOfProcessors)
	{
		if (MyRank == Root)
			printf("The matrix must have at least as many rows as there are processes \n");
		MPI_Finalize();
		exit(-1);
	}

	//Checking if Matrix can be distributed evenly to all the nodes
	if (RowsNo % NumberOfProcessors != 0)
	{
		if (MyRank == Root)
			printf("The Rows of the matrix can not be distributed evenly among processors \n");
		MPI_Finalize();
		exit(-1);
	}

	//Root node initializes the two matrices and the result matrix
	if (MyRank == Root)
	{
		MatrixA = (int *)malloc(RowsNo * ColsNo * sizeof(int));
		MatrixB = (int *)malloc(RowsNo2 * ColsNo2 * sizeof(int));
		ResultVector = (int *)malloc(RowsNo * ColsNo2 * sizeof(int));

		initial(MatrixA, RowsNo * ColsNo);
		initial(MatrixB, RowsNo2 * ColsNo2);

		printf("Matrix A initialized\n");
		printMatrix(MatrixA, RowsNo, ColsNo);
		printf("Matrix B initialized\n");
		printMatrix(MatrixB, RowsNo2, ColsNo2);
	}
		
	MPI_Barrier(MPI_COMM_WORLD);
//	MPI_Bcast(Status, 1, MPI_INT, Root, MPI_COMM_WORLD);

	//Allocating memory for matrix B on all nodes except the root node
	if (MyRank != Root)
		MatrixB = (int *)malloc(matrixbsize * sizeof(int));

	//Broadcasting matrix B to all the nodes from the root node
	MPI_Bcast(MatrixB, matrixbsize, MPI_INT, Root, MPI_COMM_WORLD);

	//Calculating the Scatter size of the Matrix
	ScatterSize = RowsNo / NumberOfProcessors;

	elements = (RowsNo*ColsNo2) / NumberOfProcessors;

	//Allocating host memory for MyMatrixA and MyResultMatrix on all nodes
	MyMatrixA = (int *)malloc(ScatterSize * ColsNo * sizeof(int));
	if (MyMatrixA == NULL)
		Status = 0;

	MyResultMatrix = (int *)malloc(elements * sizeof(int));
	if (MyResultMatrix == NULL)
		Status = 0;

	//Distributing the rows of matrix A among all the nodes
	MPI_Scatter(MatrixA, ScatterSize * ColsNo, MPI_INT, MyMatrixA, ScatterSize * ColsNo, MPI_INT, Root, MPI_COMM_WORLD);

	DeviceStatus = CheckDevice(MyRank);

	if (DeviceStatus == 0)
	{
		//No CUDA device found on this node: compute this rank's block of rows on the CPU
		printf("Processor with rank %d computing its partial matrix product on the CPU \n", MyRank);
		for (Index = 0; Index < ScatterSize; Index++)
		{
			for (IndexCol = 0; IndexCol < ColsNo2; IndexCol++)
			{
				IndexValue = 0;
				for (int k = 0; k < ColsNo; k++)
					IndexValue += MyMatrixA[Index * ColsNo + k] * MatrixB[k * ColsNo2 + IndexCol];
				MyResultMatrix[Index * ColsNo2 + IndexCol] = IndexValue;
			}
		}
	}else{

		if (pinned == 1) {
			cudaSetDevice(MyRank);
			printf("Pinned mode\n");
			//Allocating pinned (page-locked) host memory; with unified addressing the device can access these buffers directly
			CUDA_SAFE_CALL(cudaHostAlloc((void **)&DeviceMyMatrixA, ScatterSize * ColsNo * sizeof(int), cudaHostAllocDefault));
			CUDA_SAFE_CALL(cudaHostAlloc((void **)&DeviceMatrixB, matrixbsize * sizeof(int), cudaHostAllocDefault));
			CUDA_SAFE_CALL(cudaHostAlloc((void **)&DeviceMyResultVector, elements * sizeof(int), cudaHostAllocDefault));

			//Copying the data into the pinned buffers
			cudaMemcpyAsync((void *)DeviceMyMatrixA, (void *)MyMatrixA, ScatterSize * ColsNo * sizeof(int), cudaMemcpyHostToDevice);
			cudaMemcpyAsync((void *)DeviceMatrixB, (void *)MatrixB, matrixbsize * sizeof(int), cudaMemcpyHostToDevice);
		}

		else {

			cudaSetDevice(MyRank);
			printf("Unpinned mode\n");
			//Allocating memory on the device
			CUDA_SAFE_CALL(cudaMalloc((void **)&DeviceMyMatrixA, ScatterSize * ColsNo * sizeof(int)));
			CUDA_SAFE_CALL(cudaMalloc((void **)&DeviceMatrixB, matrixbsize * sizeof(int)));
			CUDA_SAFE_CALL(cudaMalloc((void **)&DeviceMyResultVector, elements * sizeof(int)));

			//Copying the data from host to device
			cudaMemcpy((void *)DeviceMyMatrixA, (void *)MyMatrixA, ScatterSize * ColsNo * sizeof(int), cudaMemcpyHostToDevice);
			cudaMemcpy((void *)DeviceMatrixB, (void *)MatrixB, matrixbsize * sizeof(int), cudaMemcpyHostToDevice);

		}


		cudaSetDevice(MyRank);
		//Launching the kernel that computes this rank's block of the result matrix
		//BLOCKSIZE x BLOCKSIZE threads per block; the grid covers ColsNo2 columns and ScatterSize rows
		dim3 block(BLOCKSIZE, BLOCKSIZE);
		dim3 grid((ColsNo2 + block.x - 1) / block.x, (ScatterSize + block.y - 1) / block.y);
		MatrixVectorMultiplication <<<grid, block>>> (DeviceMyMatrixA, DeviceMatrixB, DeviceMyResultVector, RowsNo, ColsNo, RowsNo2, ColsNo2, ColsNo, ScatterSize, BLOCKSIZE, MyRank, NumberOfProcessors);
		printf("Result block (%d x %d) launched with <<<(%d,%d),(%d,%d)>>> \n", ScatterSize, ColsNo2, grid.x, grid.y, block.x, block.y);

		//Copying the partial result from device to host (cudaMemcpy waits for the kernel on the default stream)
		cudaMemcpy((void *)MyResultMatrix, (void *)DeviceMyResultVector, elements * sizeof(int), cudaMemcpyDeviceToHost);
	}

	MPI_Barrier(MPI_COMM_WORLD);

	//Root process gathers the partial results from all nodes to form the final result matrix
	MPI_Gather(MyResultMatrix, elements, MPI_INT, ResultVector, elements, MPI_INT, Root, MPI_COMM_WORLD);

	//Verification: recompute the full product on the CPU and compare element by element
	if (MyRank == 0 && verify == 1) {
		int errors = 0;
		CPUResultVector = (int *)malloc(RowsNo * ColsNo2 * sizeof(int));
		for (int i = 0; i < RowsNo; i++) {
			for (int j = 0; j < ColsNo2; j++) {
				int sum = 0;
				for (int k = 0; k < RowsNo2; k++)
					sum = sum + MatrixA[i * ColsNo + k] * MatrixB[k * ColsNo2 + j];

				CPUResultVector[i * ColsNo2 + j] = sum;
			}
		}

		for (Index = 0; Index < ColsNo2 * RowsNo; Index++)
		{
			if (ResultVector[Index] != CPUResultVector[Index])
			{
				errors++;
				printf("Error in computation at index %d: values are %d and %d\n", Index, ResultVector[Index], CPUResultVector[Index]);
			}
		}
		printf("\n\n\nVerification done, %d mismatches\n\n", errors);
		free(CPUResultVector);
	}


	//Root process prints the resulting matrix if -p was specified
	if (MyRank == Root && print == 1)
	{
		printMatrix(ResultVector, RowsNo, ColsNo2);
	}

	if (MyRank == 0)
		printf("\n\n Computation Done .....\n Exiting \n\n");

	//Freeing the host memory (MatrixA and ResultVector exist only on the root)
	if (MyRank == Root)
	{
		free(MatrixA);
		free(ResultVector);
	}
	free(MyMatrixA);
	free(MatrixB);
	free(MyResultMatrix);

	/*//Freeing the device memory
	CUDA_SAFE_CALL( cudaFree( DeviceMyMatrixA ) );
	CUDA_SAFE_CALL( cudaFree( DeviceMatrixB ) );
	CUDA_SAFE_CALL( cudaFree( DeviceMyResultVector ) );*/

	MPI_Finalize();

	return(0);
}
//End of Main function
//---------------------------------------------------------------------------------------------------------------------------------------

int CheckDevice(int MyRank)
{
	int DeviceCount, Device;
	struct cudaDeviceProp Properties;

	cudaGetDeviceCount(&DeviceCount);
	if (DeviceCount >= 1)
	{
		cudaGetDevice(&Device);
		cudaGetDeviceProperties(&Properties, Device);
		printf("Processor with  rank %d has the Device by name %s and computation is done on this device \n", MyRank, Properties.name);
	}

	return(DeviceCount);

}//End of function
//--------------------------------------------------------------------------------------------------------------
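
The listing above calls cudaSetDevice(MyRank) directly, which fails on any node where a rank number is not smaller than the number of visible GPUs. Below is a minimal sketch of a more defensive rank-to-device mapping, assuming a simple round-robin over the devices on the node is acceptable (the helper name pickDeviceForRank is made up for illustration):

```c
#include <stdio.h>
#include <cuda_runtime.h>

//Hypothetical helper: map an MPI rank to a CUDA device by round-robin.
//Returns the chosen device id, or -1 if no usable device is available.
int pickDeviceForRank(int rank)
{
	int count = 0;
	if (cudaGetDeviceCount(&count) != cudaSuccess || count < 1)
		return -1;                      //caller falls back to the CPU path

	int device = rank % count;          //several ranks may share one GPU
	if (cudaSetDevice(device) != cudaSuccess)
		return -1;

	printf("Rank %d mapped to CUDA device %d of %d\n", rank, device, count);
	return device;
}
```

A rank that gets -1 back can take the same CPU fallback path that CheckDevice() already guards.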






Below is an example of an MPI-based matrix-vector multiplication:

```c
#include <stdio.h>
#include <mpi.h>

#define N 4

int main(int argc, char** argv) {
    int rank, size;
    int A[N][N], x[N], y[N], local_y[N];
    int i, j;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Initialize A and x
    if (rank == 0) {
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                A[i][j] = i * N + j;
            }
            x[i] = i;
        }
    }

    // Distribute x to all processes
    MPI_Bcast(x, N, MPI_INT, 0, MPI_COMM_WORLD);

    // Scatter the rows of A to all processes
    int local_n = N / size;
    int local_A[local_n][N];          // local_n rows of length N (row-major)
    MPI_Scatter(A, N * local_n, MPI_INT, local_A, N * local_n, MPI_INT, 0, MPI_COMM_WORLD);

    // Compute local_y
    for (i = 0; i < local_n; i++) {
        local_y[i] = 0;
        for (j = 0; j < N; j++) {
            local_y[i] += local_A[i][j] * x[j];
        }
    }

    // Gather local_y into y
    MPI_Gather(local_y, local_n, MPI_INT, y, local_n, MPI_INT, 0, MPI_COMM_WORLD);

    // Print y on rank 0
    if (rank == 0) {
        printf("y = [");
        for (i = 0; i < N; i++) {
            printf(" %d", y[i]);
        }
        printf(" ]\n");
    }

    MPI_Finalize();
    return 0;
}
```

In this example, MPI_Bcast, MPI_Scatter and MPI_Gather are used to broadcast the vector x, distribute the rows of matrix A across the processes and collect the per-process results, respectively. Each process computes its local part of the product and stores it in the local_y array; process 0 then gathers all partial results and prints y. Note that the example assumes the dimension of A is divisible by the number of processes; if it is not, the distribution needs to be adjusted, for example as sketched below.
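For the case where N is not divisible by the number of processes, one common adjustment (not shown in the original example) is to switch to MPI_Scatterv/MPI_Gatherv with per-rank counts and displacements. A minimal sketch under that assumption, reusing the names N, A, x and y from the example above and assuming the number of processes does not exceed N:

```c
#include <stdio.h>
#include <mpi.h>

#define N 5   // deliberately not divisible by a typical process count

int main(int argc, char** argv) {
    int rank, size;
    int A[N][N], x[N], y[N];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);   // assumed: size <= N

    if (rank == 0) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) A[i][j] = i * N + j;
            x[i] = i;
        }
    }
    MPI_Bcast(x, N, MPI_INT, 0, MPI_COMM_WORLD);

    // Per-rank row counts and element displacements:
    // the first N % size ranks receive one extra row.
    int rows[size], counts[size], displs[size], offset = 0;
    for (int r = 0; r < size; r++) {
        rows[r]   = N / size + (r < N % size ? 1 : 0);
        counts[r] = rows[r] * N;       // whole rows, counted in elements
        displs[r] = offset;
        offset   += counts[r];
    }

    int local_A[rows[rank]][N], local_y[rows[rank]];
    MPI_Scatterv(A, counts, displs, MPI_INT,
                 local_A, counts[rank], MPI_INT, 0, MPI_COMM_WORLD);

    for (int i = 0; i < rows[rank]; i++) {
        local_y[i] = 0;
        for (int j = 0; j < N; j++) local_y[i] += local_A[i][j] * x[j];
    }

    // Gather counts/displacements are expressed in rows of y.
    int ycounts[size], ydispls[size];
    for (int r = 0; r < size; r++) { ycounts[r] = rows[r]; ydispls[r] = displs[r] / N; }
    MPI_Gatherv(local_y, rows[rank], MPI_INT,
                y, ycounts, ydispls, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("y = [");
        for (int i = 0; i < N; i++) printf(" %d", y[i]);
        printf(" ]\n");
    }

    MPI_Finalize();
    return 0;
}
```

The same counts/displacements idea carries over to the CUDA + MPI program above if the requirement that RowsNo be divisible by the number of processes is to be dropped.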
