Makefile
CC = /usr/bin/gcc
NVCC = /usr/local/cuda-10.2/bin/nvcc
CFLAGS = -fopenmp -g -w -O4 -I..
MPI = -I /usr/local/mpich/include
LIB = -L /usr/local/mpich/lib -lmpich -lopa -lmpl -lpthread
all:
${NVCC} ${MPI} ${LIB} matvec.cu -o newfloatmatvec
run:
./newfloatmatvec 4 4 4 4 1 -v -p
clean:
-rm newfloat*
Matrix.cu
#include <stdio.h>
#include <cuda.h>
#include "mpi.h"
//------------------------------------------------------------------------------------------------------------------------------------------
#define BLOCKSIZE 2
//--------------------------------------------------------------------------------------------------------------------------------------------
int CheckDevice(int);
//--------------------------------------------------------------------------------------------------------------------------------------------
//Pragma routine to report the detail of cuda error
#define CUDA_SAFE_CALL(call) \
do{ \
cudaError_t err = call; \
if(err != cudaSuccess) \
{ \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",\
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(1); \
} \
} while (0) \
//----------------------------------------------------------------------------------------------------------------------------------------
void initial(int *matrix, int size)
{
for (int i = 0; i < size; i++)
{
matrix[i] = (int)(rand() % 10 + 1);
}
}
void printMatrix(int *matrix, int row, int col)
{
int *p = matrix;
for (int y = 0; y < row; y++)
{
for (int x = 0; x < col; x++)
{
printf("%5d", p[x]);
}
p = p + col;
printf("\n");
}
}
//Kernel that performs Matrix Vector Multiplication
__global__ void MatrixVectorMultiplication(int *Matrix, int *Vector, int *Solution, int A_row, int A_col, int B_row, int B_col, int VectorLength, int ScatterSize, int ThreadDim, int MyRank, int NumberofProcessors)
{
/*
int tidx = threadIdx.x;
for (int i = 0; i < RowsNo / NumberofProcessors; i++) {
for (tidx = 0; tidx < ColsNo2; tidx++) {
int sum = 0.0;
for (int k = 0; k < RowsNo2; k++)
sum = sum + Matrix[i * ColsNo + k] * Vector[k * ColsNo2 + tidx];
Solution[i * ColsNo2 + tidx] = sum;
}
}
*/
int ix = threadIdx.x + blockDim.x*blockIdx.x;//col number
int iy = threadIdx.y + blockDim.y*blockIdx.y;//row number
if (ix < A_col && iy < A_row)
{
double sum = 0;
for (int k = 0; k < B_row; k++)
{
sum = sum + Matrix[iy * A_col + k] * Vector[k * B_col + ix];
}
Solution[iy * B_col + ix] = sum;
}
__syncthreads();
}//End of Matrix Vector Multiplication Device Function
//---------------------------------------------------------------------------------------------------------------------------------------
int main(int argc, char **argv)
{
int MyRank, NumberOfProcessors;
int Root = 0, Index;
volatile int Status = 1;
int *MatrixA=0, *VectorB, *ResultVector=0, *MatrixB=0;
int *MyMatrixA, *MyResultMatrix;
int *DeviceMyMatrixA, *DeviceMyResultVector, *DeviceMatrixB, *CPUResultVector;
int RowsNo, ColsNo, RowsNo2, ColsNo2, ScatterSize, IndexCol, IndexValue, DeviceStatus;
volatile int resultsize,matrixbsize, pinned;
int print = 0;
int verify = 0;
//MPI Intialization
MPI_Init(&argc, &argv);
MPI_Comm_rank(MPI_COMM_WORLD, &MyRank);
MPI_Comm_size(MPI_COMM_WORLD, &NumberOfProcessors);
//Checking if valid number of arguements have been passed
if (argc < 6)
{
if (MyRank == Root)
printf("Usage:< mpirun >< -n >< Number of processors >< ./Program Name >< Number of Rows of Matri x>< Number of Columns of Matrix >< Rows of Matrix 2 > <Coloumns of Matrix 1> <1 for pinned memory, 2 for unpinnned> <-v if verification is required> <-p if print is required>\n");
MPI_Finalize();
exit(-1);
}
if ((argc >= 7 && strcmp(argv[6], "-v") == 0)) {
verify = 1;
}
if ((argc == 8 && strcmp(argv[7], "-p") == 0) || (argc == 7 && strcmp(argv[6], "-p") == 0)) {
print = 1;
}
//Assigning values to RowsNo, ColsNo, VectorSize from the arguements passed
RowsNo = atoi(argv[1]);
ColsNo = atoi(argv[2]);
RowsNo2 = atoi(argv[3]);
ColsNo2 = atoi(argv[4]);
pinned = atoi(argv[5]);
resultsize = RowsNo * ColsNo2;
matrixbsize = RowsNo2 * ColsNo2;
if (MyRank == 0)
printf("Resultant Matrix Number of Elements is %d\n\n\n", matrixbsize);
int elements;
//Checking if columns is equal to vector size
if (ColsNo != RowsNo2)
{
if (MyRank == Root)
printf("Entered wrong input, Number of columns of matrix should be equal to number of rows \n");
MPI_Finalize();
exit(-1);
}
if (RowsNo < NumberOfProcessors)
{
if (MyRank == Root)
printf("Given number of Rows of the matrix should be more than number of processors \n");
MPI_Finalize();
exit(-1);
}
//Checking if Matrix can be distributed evenly to all the nodes
if (RowsNo % NumberOfProcessors != 0)
{
if (MyRank == Root)
printf("The Rows of the matrix can not be distributed evenly among processors \n");
MPI_Finalize();
exit(-1);
}
//Root node intializes the Matrix, Vector and Result Vector
if (MyRank == Root)
{
MatrixA = (int *)malloc(RowsNo * ColsNo * sizeof(int));
MatrixB = (int *)malloc(RowsNo2 * ColsNo2 * sizeof(int));
ResultVector = (int *)malloc(RowsNo * ColsNo2* sizeof(int));
initial(MatrixA,RowsNo * ColsNo);
initial(MatrixB,RowsNo2 * ColsNo2);
printf("Matrix A initialized\n");
printMatrix(MatrixA,RowsNo,ColsNo);
printf("Matrix B initilized\n");
printMatrix(MatrixB,RowsNo2,ColsNo2);
}
MPI_Barrier(MPI_COMM_WORLD);
// MPI_Bcast(Status, 1, MPI_INT, Root, MPI_COMM_WORLD);
//Allocating memory for the Vector by all nodes expect root node
if (MyRank != Root)
MatrixB = (int *)malloc(matrixbsize * sizeof(int));
//Broad casting the Vector to all the nodes from root node
MPI_Bcast(MatrixB, matrixbsize, MPI_INT, Root, MPI_COMM_WORLD);
//Calculating the Scatter size of the Matrix
ScatterSize = RowsNo / NumberOfProcessors;
elements = (RowsNo*ColsNo2) / NumberOfProcessors;
//Allocating the memory on the host for the MyMatrixA and MyResultVector by all nodes
MyMatrixA = (int *)malloc(ScatterSize * ColsNo * sizeof(int));
if (MyMatrixA == NULL)
Status = 0;
MyResultMatrix = (int *)malloc(elements * sizeof(int));
if (MyResultMatrix == NULL)
Status = 0;
//Distributing the Matrix among to all the nodes
MPI_Scatter(MatrixA, ScatterSize * ColsNo, MPI_INT, MyMatrixA, ScatterSize * ColsNo, MPI_INT, Root, MPI_COMM_WORLD);
DeviceStatus = CheckDevice(MyRank);
if (DeviceStatus == 0)
{
printf("Processor with rank %d doing partial product of two vectors on CPU \n", MyRank);
for (Index = 0; Index < ScatterSize; Index++)
{
MyResultMatrix[Index] = 0;
IndexValue = Index * ColsNo;
for (IndexCol = 0; IndexCol < ColsNo; IndexCol++)
MyResultMatrix[Index] += (MyMatrixA[IndexValue++] * VectorB[IndexCol]);
}
}else{
if (pinned == 1) {
cudaSetDevice(MyRank);
printf("Pinned mode\n");
//Allocating the Memory on the device memory
CUDA_SAFE_CALL(cudaHostAlloc((void **)&DeviceMyMatrixA, ScatterSize * ColsNo * sizeof(int), cudaHostAllocDefault));
CUDA_SAFE_CALL(cudaHostAlloc((void **)&DeviceMatrixB, matrixbsize * sizeof(int), cudaHostAllocDefault));
CUDA_SAFE_CALL(cudaHostAlloc((void **)&DeviceMyResultVector, elements * sizeof(int), cudaHostAllocDefault));
//Copying the data from host to device
cudaMemcpyAsync((void *)DeviceMyMatrixA, (void *)MyMatrixA, ScatterSize * ColsNo * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpyAsync((void *)DeviceMatrixB, (void *)MatrixB, matrixbsize * sizeof(int), cudaMemcpyHostToDevice);
}
else {
cudaSetDevice(MyRank);
printf("Unpinned mode\n");
//Allocating the Memory on the device memory
CUDA_SAFE_CALL(cudaMalloc((void **)&DeviceMyMatrixA, ScatterSize * ColsNo * sizeof(int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&DeviceMatrixB, matrixbsize * sizeof(int)));
CUDA_SAFE_CALL(cudaMalloc((void **)&DeviceMyResultVector, elements * sizeof(int)));
//Copying the data from host to device
cudaMemcpy((void *)DeviceMyMatrixA, (void *)MyMatrixA, ScatterSize * ColsNo * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy((void *)DeviceMatrixB, (void *)MatrixB, matrixbsize * sizeof(int), cudaMemcpyHostToDevice);
}
cudaSetDevice(MyRank);
//Calling the kernel which performs Matrix Vector Product
int dimx = ScatterSize;
int dimy = ScatterSize;
dim3 block(dimx, dimy);
dim3 grid((ColsNo2 + block.x - 1) / block.x, (ScatterSize + block.y - 1) / block.y);
// dim3 grid((ScatterSize + block.x - 1) / block.x, (ColsNo2 + block.y - 1) / block.y);
MatrixVectorMultiplication << <grid, block >> > (DeviceMyMatrixA, DeviceMatrixB, DeviceMyResultVector, RowsNo, ColsNo, RowsNo2, ColsNo2, ColsNo, ScatterSize, BLOCKSIZE, MyRank, NumberOfProcessors);
printf("Matrix ResultMatrix:(%d×%d) <<<(%d,%d),(%d,%d)>>> \n",RowsNo,ColsNo2,grid.x,grid.y,block.x,block.y);
// MatrixVectorMultiplication << <1, 256 >> > (DeviceMyMatrixA, DeviceMatrixB, DeviceMyResultVector, RowsNo, ColsNo, RowsNo2, ColsNo2, ColsNo, ScatterSize, BLOCKSIZE, MyRank, NumberOfProcessors);
//Copying the value of patial result vector from device to host
cudaMemcpy((void *)MyResultMatrix, (void *)DeviceMyResultVector, elements * sizeof(int), cudaMemcpyDeviceToHost);
}
MPI_Barrier(MPI_COMM_WORLD);
//Root processor gathering from all nodes to get the final result vector
MPI_Gather(MyResultMatrix, elements, MPI_INT, ResultVector, elements, MPI_INT, Root, MPI_COMM_WORLD);
//To verify:
//Compute on CPU
if (MyRank == 0 && verify == 1) {
printf("\n\n\nVerification Done\n\n");
CPUResultVector = (int *)malloc(RowsNo * ColsNo2 * sizeof(int));
for (int i = 0; i < RowsNo; i++) {
for (int j = 0; j < ColsNo2; j++) {
int sum = 0.0;
for (int k = 0; k < RowsNo2; k++)
sum = sum + MatrixA[i * ColsNo + k] * MatrixB[k * ColsNo2 + j];
CPUResultVector[i * ColsNo2 + j] = sum;
}
}
for (Index = 0; Index < ColsNo2 * RowsNo; Index++)
{
int a = ResultVector[Index];
int b = CPUResultVector[Index];
if (a != b)
{
printf("Error in computation and values are %f and %f", ResultVector[Index], CPUResultVector[Index]);
}
}
}
//Root processor printing the resultant vector if print specified
if (MyRank == Root && print == 1)
{
printMatrix(ResultVector,RowsNo,ColsNo2);
//freeing the Vectors allocated by the root node
free(MatrixA);
free(MatrixB);
free(ResultVector);
}
if (MyRank == 0)
printf("\n\n Computation Done .....\n Exiting \n\n");
//Freeing the host memory
free(MyMatrixA);
free(MatrixB);
free(MyResultMatrix);
/*//Freeing the device memory
CUDA_SAFE_CALL( cudaFree( DeviceMyMatrixA ) );
CUDA_SAFE_CALL( cudaFree( DeviceMatrixB ) );
CUDA_SAFE_CALL( cudaFree( DeviceMyResultVector ) );*/
MPI_Finalize();
return(0);
}
//End of Main function
//---------------------------------------------------------------------------------------------------------------------------------------
int CheckDevice(int MyRank)
{
int DeviceCount, Device;
struct cudaDeviceProp Properties;
cudaGetDeviceCount(&DeviceCount);
if (DeviceCount >= 1)
{
cudaGetDevice(&Device);
cudaGetDeviceProperties(&Properties, Device);
printf("Processor with rank %d has the Device by name %s and computation is done on this device \n", MyRank, Properties.name);
}
return(DeviceCount);
}//End of function
//--------------------------------------------------------------------------------------------------------------