Matrix Multiplication with CUDA + MPI (single file)

Makefile

CC = /usr/bin/gcc
NVCC = /usr/local/cuda-10.2/bin/nvcc
CFLAGS = -fopenmp -g -w -O4 -I..
MPI = -I /usr/local/mpich/include 
LIB = -L /usr/local/mpich/lib -lmpich -lopa -lmpl -lpthread  

all: 
	${NVCC}  ${MPI} ${LIB} matvec.cu -o newfloatmatvec
run:
	./newfloatmatvec 4 4 4 4 1 -v -p
clean:
	-rm newfloat*
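
The run target above starts the binary directly, i.e. with a single MPI rank. To exercise the row decomposition across several ranks, the program has to be started through the MPICH launcher. A possible extra target is sketched below; it assumes mpirun sits under the same MPICH prefix used in LIB (the exact launcher path is not given above):

runmpi:
	/usr/local/mpich/bin/mpirun -np 4 ./newfloatmatvec 4 4 4 4 1 -v -p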

matvec.cu

#include <stdio.h>
#include <stdlib.h>   //for malloc, free, rand, atoi, exit
#include <string.h>   //for strcmp
#include <cuda.h>
#include "mpi.h"

//------------------------------------------------------------------------------------------------------------------------------------------

#define BLOCKSIZE 2

//--------------------------------------------------------------------------------------------------------------------------------------------

int CheckDevice(int);

//--------------------------------------------------------------------------------------------------------------------------------------------

//Macro that checks the return value of a CUDA call and reports any error
#define CUDA_SAFE_CALL(call)                                                         \
            do{                                                                      \
                 cudaError_t err = call;                                             \
                 if(err != cudaSuccess)                                              \
                 {                                                                   \
                        fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",\
                         __FILE__, __LINE__, cudaGetErrorString( err) );             \
                         exit(1);                                                    \
                 }                                                                   \
               } while (0)

//----------------------------------------------------------------------------------------------------------------------------------------

void initial(int *matrix, int size)
{
	for (int i = 0; i < size; i++)
	{
		matrix[i] = (int)(rand() % 10 + 1);
	}
}

void printMatrix(int *matrix, int row, int col)
{
	int *p = matrix;
	for (int y = 0; y < row; y++)
	{
		for (int x = 0; x < col; x++)
		{
			printf("%5d", p[x]);
		}
		p = p + col;
		printf("\n");
	}
	
}



//Kernel that multiplies this rank's block of rows of matrix A with the full matrix B
__global__ void MatrixVectorMultiplication(int *Matrix, int *Vector, int *Solution, int A_row, int A_col, int B_row, int B_col, int VectorLength, int ScatterSize, int ThreadDim, int MyRank, int NumberofProcessors)
{
	/*
	int tidx = threadIdx.x;
	for (int i = 0; i < RowsNo / NumberofProcessors; i++) {
		for (tidx = 0; tidx < ColsNo2; tidx++) {
			int sum = 0.0;
			for (int k = 0; k < RowsNo2; k++)
				sum = sum + Matrix[i * ColsNo + k] * Vector[k * ColsNo2 + tidx];

			Solution[i * ColsNo2 + tidx] = sum;
		}

	}
	*/

	int ix = threadIdx.x + blockDim.x * blockIdx.x;	//column index of the result
	int iy = threadIdx.y + blockDim.y * blockIdx.y;	//row index within this rank's block of rows

	//Each rank only holds ScatterSize rows of A, so the row bound is ScatterSize,
	//and the column bound is the number of columns of B (= columns of the result).
	if (ix < B_col && iy < ScatterSize)
	{
		int sum = 0;
		for (int k = 0; k < B_row; k++)
		{
			sum += Matrix[iy * A_col + k] * Vector[k * B_col + ix];
		}
		Solution[iy * B_col + ix] = sum;
	}
}//End of Matrix Vector Multiplication Device Function
//---------------------------------------------------------------------------------------------------------------------------------------



int main(int argc, char **argv)
{
	int MyRank, NumberOfProcessors;
	int Root = 0, Index;
	volatile int Status = 1;
	int *MatrixA = 0, *ResultVector = 0, *MatrixB = 0;
	int *MyMatrixA, *MyResultMatrix;
	int *DeviceMyMatrixA, *DeviceMyResultVector, *DeviceMatrixB, *CPUResultVector;
	int RowsNo, ColsNo, RowsNo2, ColsNo2, ScatterSize, IndexCol, IndexValue, DeviceStatus;
	volatile int resultsize,matrixbsize, pinned;
	int print = 0;
	int verify = 0;


	//MPI Initialization
	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
	MPI_Comm_size(MPI_COMM_WORLD, &NumberOfProcessors);



	//Checking whether a valid number of arguments has been passed
	if (argc < 6)
	{
		if (MyRank == Root)
			printf("Usage: mpirun -n <number of processes> ./newfloatmatvec <rows of matrix A> <columns of matrix A> <rows of matrix B> <columns of matrix B> <1 for pinned memory, 2 for unpinned> [-v to verify the result] [-p to print the result]\n");
		MPI_Finalize();
		exit(-1);
	}
	if ((argc >= 7 && strcmp(argv[6], "-v") == 0)) {
		verify = 1;
	}

	if ((argc == 8 && strcmp(argv[7], "-p") == 0) || (argc == 7 && strcmp(argv[6], "-p") == 0)) {
		print = 1;
	}


	//Assigning values to RowsNo, ColsNo, RowsNo2, ColsNo2 and pinned from the arguments passed
	RowsNo = atoi(argv[1]);
	ColsNo = atoi(argv[2]);
	RowsNo2 = atoi(argv[3]);
	ColsNo2 = atoi(argv[4]);
	pinned = atoi(argv[5]);

	resultsize = RowsNo * ColsNo2;
	matrixbsize = RowsNo2 * ColsNo2;
	if (MyRank == 0)
		printf("Resultant Matrix Number of Elements is %d\n\n\n", resultsize);


	int elements;

	//Checking that the number of columns of matrix A equals the number of rows of matrix B
	if (ColsNo != RowsNo2)
	{
		if (MyRank == Root)
			printf("Invalid input: the number of columns of matrix A must equal the number of rows of matrix B \n");
		MPI_Finalize();
		exit(-1);
	}

	if (RowsNo < NumberOfProcessors)
	{
		if (MyRank == Root)
			printf("The matrix must have at least as many rows as there are processes \n");
		MPI_Finalize();
		exit(-1);
	}

	//Checking if Matrix can be distributed evenly to all the nodes
	if (RowsNo % NumberOfProcessors != 0)
	{
		if (MyRank == Root)
			printf("The Rows of the matrix can not be distributed evenly among processors \n");
		MPI_Finalize();
		exit(-1);
	}

	//Root node initializes the two matrices and the result matrix
	if (MyRank == Root)
	{
		MatrixA = (int *)malloc(RowsNo * ColsNo * sizeof(int));
		MatrixB = (int *)malloc(RowsNo2 * ColsNo2 * sizeof(int));
		ResultVector = (int *)malloc(RowsNo * ColsNo2 * sizeof(int));

		initial(MatrixA, RowsNo * ColsNo);
		initial(MatrixB, RowsNo2 * ColsNo2);

		printf("Matrix A initialized\n");
		printMatrix(MatrixA, RowsNo, ColsNo);
		printf("Matrix B initialized\n");
		printMatrix(MatrixB, RowsNo2, ColsNo2);
	}
		
	MPI_Barrier(MPI_COMM_WORLD);
//	MPI_Bcast(Status, 1, MPI_INT, Root, MPI_COMM_WORLD);

	//Allocating memory for matrix B on all nodes except the root node
	if (MyRank != Root)
		MatrixB = (int *)malloc(matrixbsize * sizeof(int));

	//Broadcasting matrix B to all the nodes from the root node
	MPI_Bcast(MatrixB, matrixbsize, MPI_INT, Root, MPI_COMM_WORLD);

	//Calculating the Scatter size of the Matrix
	ScatterSize = RowsNo / NumberOfProcessors;

	elements = (RowsNo*ColsNo2) / NumberOfProcessors;

	//Allocating host memory for MyMatrixA and MyResultMatrix on all nodes
	MyMatrixA = (int *)malloc(ScatterSize * ColsNo * sizeof(int));
	if (MyMatrixA == NULL)
		Status = 0;

	MyResultMatrix = (int *)malloc(elements * sizeof(int));
	if (MyResultMatrix == NULL)
		Status = 0;

	//Distributing the rows of matrix A among all the nodes
	MPI_Scatter(MatrixA, ScatterSize * ColsNo, MPI_INT, MyMatrixA, ScatterSize * ColsNo, MPI_INT, Root, MPI_COMM_WORLD);

	DeviceStatus = CheckDevice(MyRank);

	if (DeviceStatus == 0)
	{
		//No CUDA device found on this node: compute this rank's block of rows on the CPU
		printf("Processor with rank %d computing its partial matrix product on the CPU \n", MyRank);
		for (Index = 0; Index < ScatterSize; Index++)
		{
			for (IndexCol = 0; IndexCol < ColsNo2; IndexCol++)
			{
				IndexValue = 0;
				for (int k = 0; k < ColsNo; k++)
					IndexValue += MyMatrixA[Index * ColsNo + k] * MatrixB[k * ColsNo2 + IndexCol];
				MyResultMatrix[Index * ColsNo2 + IndexCol] = IndexValue;
			}
		}
	}else{

		if (pinned == 1) {
			cudaSetDevice(MyRank);
			printf("Pinned mode\n");
			//Allocating pinned (page-locked) host memory; with unified addressing the device can access these buffers directly
			CUDA_SAFE_CALL(cudaHostAlloc((void **)&DeviceMyMatrixA, ScatterSize * ColsNo * sizeof(int), cudaHostAllocDefault));
			CUDA_SAFE_CALL(cudaHostAlloc((void **)&DeviceMatrixB, matrixbsize * sizeof(int), cudaHostAllocDefault));
			CUDA_SAFE_CALL(cudaHostAlloc((void **)&DeviceMyResultVector, elements * sizeof(int), cudaHostAllocDefault));

			//Copying the data into the pinned buffers
			cudaMemcpyAsync((void *)DeviceMyMatrixA, (void *)MyMatrixA, ScatterSize * ColsNo * sizeof(int), cudaMemcpyHostToDevice);
			cudaMemcpyAsync((void *)DeviceMatrixB, (void *)MatrixB, matrixbsize * sizeof(int), cudaMemcpyHostToDevice);
		}

		else {

			cudaSetDevice(MyRank);
			printf("Unpinned mode\n");
			//Allocating memory on the device
			CUDA_SAFE_CALL(cudaMalloc((void **)&DeviceMyMatrixA, ScatterSize * ColsNo * sizeof(int)));
			CUDA_SAFE_CALL(cudaMalloc((void **)&DeviceMatrixB, matrixbsize * sizeof(int)));
			CUDA_SAFE_CALL(cudaMalloc((void **)&DeviceMyResultVector, elements * sizeof(int)));

			//Copying the data from host to device
			cudaMemcpy((void *)DeviceMyMatrixA, (void *)MyMatrixA, ScatterSize * ColsNo * sizeof(int), cudaMemcpyHostToDevice);
			cudaMemcpy((void *)DeviceMatrixB, (void *)MatrixB, matrixbsize * sizeof(int), cudaMemcpyHostToDevice);

		}


		cudaSetDevice(MyRank);
		//Launching the kernel that computes this rank's block of the result matrix
		//BLOCKSIZE x BLOCKSIZE threads per block; the grid covers ColsNo2 columns and ScatterSize rows
		dim3 block(BLOCKSIZE, BLOCKSIZE);
		dim3 grid((ColsNo2 + block.x - 1) / block.x, (ScatterSize + block.y - 1) / block.y);
		MatrixVectorMultiplication <<<grid, block>>> (DeviceMyMatrixA, DeviceMatrixB, DeviceMyResultVector, RowsNo, ColsNo, RowsNo2, ColsNo2, ColsNo, ScatterSize, BLOCKSIZE, MyRank, NumberOfProcessors);
		printf("Result block (%d x %d) launched with <<<(%d,%d),(%d,%d)>>> \n", ScatterSize, ColsNo2, grid.x, grid.y, block.x, block.y);

		//Copying the partial result from device to host (cudaMemcpy waits for the kernel on the default stream)
		cudaMemcpy((void *)MyResultMatrix, (void *)DeviceMyResultVector, elements * sizeof(int), cudaMemcpyDeviceToHost);
	}

	MPI_Barrier(MPI_COMM_WORLD);

	//Root process gathers the partial results from all nodes to form the final result matrix
	MPI_Gather(MyResultMatrix, elements, MPI_INT, ResultVector, elements, MPI_INT, Root, MPI_COMM_WORLD);

	//Verification: recompute the full product on the CPU and compare element by element
	if (MyRank == 0 && verify == 1) {
		int errors = 0;
		CPUResultVector = (int *)malloc(RowsNo * ColsNo2 * sizeof(int));
		for (int i = 0; i < RowsNo; i++) {
			for (int j = 0; j < ColsNo2; j++) {
				int sum = 0;
				for (int k = 0; k < RowsNo2; k++)
					sum = sum + MatrixA[i * ColsNo + k] * MatrixB[k * ColsNo2 + j];

				CPUResultVector[i * ColsNo2 + j] = sum;
			}
		}

		for (Index = 0; Index < ColsNo2 * RowsNo; Index++)
		{
			if (ResultVector[Index] != CPUResultVector[Index])
			{
				errors++;
				printf("Error in computation at index %d: values are %d and %d\n", Index, ResultVector[Index], CPUResultVector[Index]);
			}
		}
		printf("\n\n\nVerification done, %d mismatches\n\n", errors);
		free(CPUResultVector);
	}


	//Root process prints the resulting matrix if -p was specified
	if (MyRank == Root && print == 1)
	{
		printMatrix(ResultVector, RowsNo, ColsNo2);
	}

	if (MyRank == 0)
		printf("\n\n Computation Done .....\n Exiting \n\n");

	//Freeing the host memory (MatrixA and ResultVector exist only on the root)
	if (MyRank == Root)
	{
		free(MatrixA);
		free(ResultVector);
	}
	free(MyMatrixA);
	free(MatrixB);
	free(MyResultMatrix);

	/*//Freeing the device memory
	CUDA_SAFE_CALL( cudaFree( DeviceMyMatrixA ) );
	CUDA_SAFE_CALL( cudaFree( DeviceMatrixB ) );
	CUDA_SAFE_CALL( cudaFree( DeviceMyResultVector ) );*/

	MPI_Finalize();

	return(0);
}
//End of Main function
//---------------------------------------------------------------------------------------------------------------------------------------

int CheckDevice(int MyRank)
{
	int DeviceCount, Device;
	struct cudaDeviceProp Properties;

	cudaGetDeviceCount(&DeviceCount);
	if (DeviceCount >= 1)
	{
		cudaGetDevice(&Device);
		cudaGetDeviceProperties(&Properties, Device);
		printf("Processor with  rank %d has the Device by name %s and computation is done on this device \n", MyRank, Properties.name);
	}

	return(DeviceCount);

}//End of function
//--------------------------------------------------------------------------------------------------------------
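
The listing above calls cudaSetDevice(MyRank) directly, which fails on any node where a rank number is not smaller than the number of visible GPUs. Below is a minimal sketch of a more defensive rank-to-device mapping, assuming a simple round-robin over the devices on the node is acceptable (the helper name pickDeviceForRank is made up for illustration):

```c
#include <stdio.h>
#include <cuda_runtime.h>

//Hypothetical helper: map an MPI rank to a CUDA device by round-robin.
//Returns the chosen device id, or -1 if no usable device is available.
int pickDeviceForRank(int rank)
{
	int count = 0;
	if (cudaGetDeviceCount(&count) != cudaSuccess || count < 1)
		return -1;                      //caller falls back to the CPU path

	int device = rank % count;          //several ranks may share one GPU
	if (cudaSetDevice(device) != cudaSuccess)
		return -1;

	printf("Rank %d mapped to CUDA device %d of %d\n", rank, device, count);
	return device;
}
```

A rank that gets -1 back can take the same CPU fallback path that CheckDevice() already guards.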






Below is an example of an MPI-based matrix-vector multiplication:

```c
#include <stdio.h>
#include <mpi.h>

#define N 4

int main(int argc, char** argv) {
    int rank, size;
    int A[N][N], x[N], y[N], local_y[N];
    int i, j;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    // Initialize A and x
    if (rank == 0) {
        for (i = 0; i < N; i++) {
            for (j = 0; j < N; j++) {
                A[i][j] = i * N + j;
            }
            x[i] = i;
        }
    }

    // Distribute x to all processes
    MPI_Bcast(x, N, MPI_INT, 0, MPI_COMM_WORLD);

    // Scatter the rows of A to all processes
    int local_n = N / size;
    int local_A[local_n][N];          // local_n rows of length N (row-major)
    MPI_Scatter(A, N * local_n, MPI_INT, local_A, N * local_n, MPI_INT, 0, MPI_COMM_WORLD);

    // Compute local_y
    for (i = 0; i < local_n; i++) {
        local_y[i] = 0;
        for (j = 0; j < N; j++) {
            local_y[i] += local_A[i][j] * x[j];
        }
    }

    // Gather local_y into y
    MPI_Gather(local_y, local_n, MPI_INT, y, local_n, MPI_INT, 0, MPI_COMM_WORLD);

    // Print y on rank 0
    if (rank == 0) {
        printf("y = [");
        for (i = 0; i < N; i++) {
            printf(" %d", y[i]);
        }
        printf(" ]\n");
    }

    MPI_Finalize();
    return 0;
}
```

In this example, MPI_Bcast, MPI_Scatter and MPI_Gather are used to broadcast the vector x, distribute the rows of matrix A across the processes and collect the per-process results, respectively. Each process computes its local part of the product and stores it in the local_y array; process 0 then gathers all partial results and prints y. Note that the example assumes the dimension of A is divisible by the number of processes; if it is not, the distribution needs to be adjusted, for example as sketched below.
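For the case where N is not divisible by the number of processes, one common adjustment (not shown in the original example) is to switch to MPI_Scatterv/MPI_Gatherv with per-rank counts and displacements. A minimal sketch under that assumption, reusing the names N, A, x and y from the example above and assuming the number of processes does not exceed N:

```c
#include <stdio.h>
#include <mpi.h>

#define N 5   // deliberately not divisible by a typical process count

int main(int argc, char** argv) {
    int rank, size;
    int A[N][N], x[N], y[N];

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);   // assumed: size <= N

    if (rank == 0) {
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < N; j++) A[i][j] = i * N + j;
            x[i] = i;
        }
    }
    MPI_Bcast(x, N, MPI_INT, 0, MPI_COMM_WORLD);

    // Per-rank row counts and element displacements:
    // the first N % size ranks receive one extra row.
    int rows[size], counts[size], displs[size], offset = 0;
    for (int r = 0; r < size; r++) {
        rows[r]   = N / size + (r < N % size ? 1 : 0);
        counts[r] = rows[r] * N;       // whole rows, counted in elements
        displs[r] = offset;
        offset   += counts[r];
    }

    int local_A[rows[rank]][N], local_y[rows[rank]];
    MPI_Scatterv(A, counts, displs, MPI_INT,
                 local_A, counts[rank], MPI_INT, 0, MPI_COMM_WORLD);

    for (int i = 0; i < rows[rank]; i++) {
        local_y[i] = 0;
        for (int j = 0; j < N; j++) local_y[i] += local_A[i][j] * x[j];
    }

    // Gather counts/displacements are expressed in rows of y.
    int ycounts[size], ydispls[size];
    for (int r = 0; r < size; r++) { ycounts[r] = rows[r]; ydispls[r] = displs[r] / N; }
    MPI_Gatherv(local_y, rows[rank], MPI_INT,
                y, ycounts, ydispls, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("y = [");
        for (int i = 0; i < N; i++) printf(" %d", y[i]);
        printf(" ]\n");
    }

    MPI_Finalize();
    return 0;
}
```

The same counts/displacements idea carries over to the CUDA + MPI program above if the requirement that RowsNo be divisible by the number of processes is to be dropped.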
