MP2的目标:Implement a basic dense matrix multiplication routine.
主要会考察以下几个方面,在代码"//@@"下面添加你自己的代码。
allocate device memory
copy host memory to device
initialize thread block and kernel grid dimensions
invoke CUDA kernel
copy results from device to host
deallocate device memory
具体的算法讲解可以看该课的lecture 3 video.
源码如下:
// MP 2: Due Sunday, Dec 16, 2012 at 11:59 p.m. PST
#include <wb.h>
// Error-checking wrapper for CUDA runtime calls: on failure, logs the
// failing statement together with the runtime's human-readable error
// string, then aborts main() with -1.
#define wbCheck(stmt) do { \
    cudaError_t err = stmt; \
    if (err != cudaSuccess) { \
        wbLog(ERROR, "Failed to run stmt ", #stmt, \
              " : ", cudaGetErrorString(err)); \
        return -1; \
    } \
} while(0)
// Compute C = A * B
// Compute C = A * B for row-major dense matrices, one thread per output
// element. Expected launch geometry: a 2D grid of 2D blocks where the x
// dimension indexes C's columns and the y dimension indexes C's rows.
// A is numARows x numAColumns, B is numBRows x numBColumns, and C must be
// numCRows x numCColumns with numCRows == numARows and
// numCColumns == numBColumns.
__global__ void matrixMultiply(const float * __restrict__ A,
                               const float * __restrict__ B,
                               float * __restrict__ C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard against threads past the matrix edge (the grid is rounded up
    // to whole blocks) and against inconsistent input dimensions.
    if (numAColumns != numBRows || Row >= numCRows || Col >= numCColumns)
        return;
    // Dot product of A's Row-th row with B's Col-th column.
    float Pvalue = 0.0f;
    for (int k = 0; k < numAColumns; ++k)
        Pvalue += A[Row * numAColumns + k] * B[k * numBColumns + Col];
    C[Row * numCColumns + Col] = Pvalue;
}
// Host driver: imports A and B, allocates device buffers, runs the
// matrixMultiply kernel, copies C back, and hands it to the grader.
// Every CUDA runtime call is checked through wbCheck so failures are
// reported instead of silently producing a wrong C.
int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows;    // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows;    // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows;    // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)
    args = wbArg_read(argc, argv);
    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns
    // C = A * B, so C inherits A's row count and B's column count.
    numCRows = numARows;
    numCColumns = numBColumns;
    //@@ Allocate the hostC matrix
    hostC = (float *) malloc(numCRows * numCColumns * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");
    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    wbCheck(cudaMalloc((void **)&deviceA, numARows * numAColumns * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceB, numBRows * numBColumns * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceC, numCRows * numCColumns * sizeof(float)));
    wbTime_stop(GPU, "Allocating GPU memory.");
    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    wbCheck(cudaMemcpy(deviceA, hostA, numARows * numAColumns * sizeof(float), cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceB, hostB, numBRows * numBColumns * sizeof(float), cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");
    //@@ Initialize the grid and block dimensions here
    // Ceiling division: the matrices need not be multiples of TILE_WIDTH,
    // so round the grid up and let the kernel's bounds check trim the edge.
    const int TILE_WIDTH = 8;
    dim3 dimGrid((numCColumns + TILE_WIDTH - 1) / TILE_WIDTH,
                 (numCRows    + TILE_WIDTH - 1) / TILE_WIDTH, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    matrixMultiply<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                          numARows, numAColumns,
                                          numBRows, numBColumns,
                                          numCRows, numCColumns);
    // Launch-configuration errors only surface via cudaGetLastError;
    // execution errors surface at the synchronization point.
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize()); // cudaThreadSynchronize is deprecated
    wbTime_stop(Compute, "Performing CUDA computation");
    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    wbCheck(cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(float), cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");
    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    wbCheck(cudaFree(deviceA));
    wbCheck(cudaFree(deviceB));
    wbCheck(cudaFree(deviceC));
    wbTime_stop(GPU, "Freeing GPU Memory");
    wbSolution(args, hostC, numCRows, numCColumns);
    free(hostA);
    free(hostB);
    free(hostC);
    return 0;
}
之前的博文中《Hetergeneous Parallel Programming编程作业之MP0: Initial Lab Tour with Device Query》,所需的源文件:https://github.com/ashwin/coursera-heterogeneous. 用src/GenDataMP2.cpp生成可执行文件程序GenDataMP2,然后用它生成矩阵A, B, C. 这里 C = A x B. 如输入矩阵A为10 x 20,B为20 x 5
执行:
$ ./GenDataMP2 10 20 5
就会产生A, B, C,其中C为输出矩阵C = A x B.
之后我们执行MP2的程序:
$ optirun ./MP2 matA.txt matB.txt matC.txt
运行结果:
[Generic] 0.0003601920 Importing data and creating memory on host
Trace main::55 The dimensions of A are 10 x 20
Trace main::56 The dimensions of B are 20 x 5
[GPU ] 0.0803773440 Allocating GPU memory.
[GPU ] 0.0000258560 Copying input memory to the GPU.
[Compute] 0.0000407040 Performing CUDA computation
[Copy ] 0.0000179200 Copying output memory to the CPU
[GPU ] 0.0001228800 Freeing GPU Memory
All tests passed!
本地测试完成后,可以将代码cp到在线提交系统进行测试,一共有6个数据集,测试通过,完成。
可能遇到的问题:
这个程序我反复测试了N回,才把所有6个数据集过了,有几个难点:
1. Block, Grid的设定, 因为矩阵维度可能不是正好被TILE_WIDTH整除的,所以网格大小要向上取整,详见代码Line 75.
2. 就是矩阵的长宽定义,因为测试数据的矩阵并不一定是正方形的,也可能是长方形的。
3. 此外,就是Row, Col的概念要清晰。