MP2的目标:Implement a basic dense matrix multiplication routine.
allocate device memory
copy host memory to device
initialize thread block and kernel grid dimensions
invoke CUDA kernel
copy results from device to host
deallocate device memory
具体的算法讲解可以看该课的lecture 3 video.
// MP 2: Due Sunday, Dec 16, 2012 at 11:59 p.m. PST
#include <wb.h>
// wbCheck(stmt): evaluate a CUDA runtime call exactly once and, if it did not
// return cudaSuccess, log the failing statement plus the runtime's error
// description and `return -1` from the enclosing function.
//
// Only usable inside functions that return an int-compatible type.
// The temporary uses a macro-private name so that a `stmt` which itself
// mentions a variable called `err` is not shadowed by the macro's own local.
#define wbCheck(stmt) do {                                                    \
    cudaError_t wbCheckErr_ = (stmt);                                         \
    if (wbCheckErr_ != cudaSuccess) {                                         \
        wbLog(ERROR, "Failed to run stmt ", #stmt);                           \
        wbLog(ERROR, "Got CUDA error ...  ", cudaGetErrorString(wbCheckErr_)); \
        return -1;                                                            \
    }                                                                         \
} while(0)
// Compute C = A * B for dense matrices stored in row-major order.
//
// Launch layout: 2D grid of 2D blocks. The x dimension of the launch indexes
// columns of C and the y dimension indexes rows of C; each thread computes at
// most one element of C. The grid may overshoot the matrix (ceil-div launch),
// surplus threads exit without touching memory. No shared memory is used.
//
// Parameters:
//   A, B        input matrices (numARows x numAColumns, numBRows x numBColumns)
//   C           output matrix  (numCRows x numCColumns)
//   num*Rows / num*Columns   dimensions of the respective matrices
//
// If the inner dimensions disagree (numAColumns != numBRows) the kernel writes
// nothing, so callers should validate the shapes on the host beforehand.
__global__ void matrixMultiply(float * A, float * B, float * C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Incompatible shapes: the product is undefined, leave C untouched.
    if (numAColumns != numBRows) return;

    // Threads that fall outside C have nothing to compute.
    if (row >= numCRows || col >= numCColumns) return;

    // Also stay inside A and B in case the caller passed C dimensions larger
    // than numARows x numBColumns; those elements have no defined value.
    if (row >= numARows || col >= numBColumns) return;

    // Dot product of row `row` of A with column `col` of B.
    float acc = 0.0f;
    for (int k = 0; k < numAColumns; ++k) {
        acc += A[row * numAColumns + k] * B[k * numBColumns + col];
    }
    C[row * numCColumns + col] = acc;
}
// Host driver for MP2: reads matrices A and B from the input files named on
// the command line, computes C = A * B on the GPU with matrixMultiply, and
// hands the result to wbSolution for checking.
//
// Returns 0 on success and -1 if the shapes are incompatible, host allocation
// fails, or any CUDA call fails (via wbCheck). On the error paths the process
// exits immediately, so outstanding buffers are left for the OS/driver to
// reclaim.
int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows; // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows; // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows; // number of rows in the matrix C
    int numCColumns; // number of columns in the matrix C

    args = wbArg_read(argc, argv);

    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);

    // C takes its row count from A and its column count from B.
    numCRows = numARows;
    numCColumns = numBColumns;

    // Widen to size_t before multiplying so large matrices cannot overflow
    // the int product.
    const size_t bytesA = (size_t)numARows * (size_t)numAColumns * sizeof(float);
    const size_t bytesB = (size_t)numBRows * (size_t)numBColumns * sizeof(float);
    const size_t bytesC = (size_t)numCRows * (size_t)numCColumns * sizeof(float);

    hostC = (float *) malloc(bytesC);
    wbTime_stop(Generic, "Importing data and creating memory on host");

    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);

    // The kernel silently writes nothing when the inner dimensions differ,
    // which would leave hostC uninitialized; reject that case up front.
    if (numAColumns != numBRows) {
        wbLog(ERROR, "Inner dimensions of A and B do not match");
        return -1;
    }
    // malloc(0) may legitimately return NULL, so only treat NULL as a failure
    // when a non-empty buffer was requested.
    if (bytesC > 0 && hostC == NULL) {
        wbLog(ERROR, "Failed to allocate host memory for C");
        return -1;
    }

    wbTime_start(GPU, "Allocating GPU memory.");
    wbCheck(cudaMalloc((void **)&deviceA, bytesA));
    wbCheck(cudaMalloc((void **)&deviceB, bytesB));
    wbCheck(cudaMalloc((void **)&deviceC, bytesC));
    wbTime_stop(GPU, "Allocating GPU memory.");

    wbTime_start(GPU, "Copying input memory to the GPU.");
    wbCheck(cudaMemcpy(deviceA, hostA, bytesA, cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceB, hostB, bytesB, cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");

    // One thread per element of C; ceil-divide so a partial tile at the right
    // and bottom edges is still covered. The kernel bounds-checks the excess.
    const int TILE_WIDTH = 8;
    dim3 dimGrid((numCColumns + TILE_WIDTH - 1) / TILE_WIDTH,
                 (numCRows + TILE_WIDTH - 1) / TILE_WIDTH,
                 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);

    wbTime_start(Compute, "Performing CUDA computation");
    // A grid with a zero dimension is an invalid launch configuration, so an
    // empty result matrix simply skips the launch.
    if (numCRows > 0 && numCColumns > 0) {
        matrixMultiply<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                              numARows, numAColumns,
                                              numBRows, numBColumns,
                                              numCRows, numCColumns);
        // Launch-configuration errors are only reported via cudaGetLastError.
        wbCheck(cudaGetLastError());
    }
    // Kernel launches are asynchronous: wait here so the timer covers the
    // actual computation and in-kernel faults surface at this point.
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Performing CUDA computation");

    wbTime_start(Copy, "Copying output memory to the CPU");
    wbCheck(cudaMemcpy(hostC, deviceC, bytesC, cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");

    wbTime_start(GPU, "Freeing GPU Memory");
    wbCheck(cudaFree(deviceA));
    wbCheck(cudaFree(deviceB));
    wbCheck(cudaFree(deviceC));
    wbTime_stop(GPU, "Freeing GPU Memory");

    wbSolution(args, hostC, numCRows, numCColumns);

    free(hostC);
    // NOTE(review): hostA and hostB are allocated inside wbImport; release
    // them with whichever deallocator wb.h pairs with it (not visible here).

    return 0;
}
所需的源文件见之前的博文《Heterogeneous Parallel Programming编程作业之MP0: Initial Lab Tour with Device Query》。先用 src/GenDataMP2.cpp 编译出可执行程序 GenDataMP2,再用它生成矩阵 A、B、C,其中 C = A x B。例如,当输入矩阵 A 为 10 x 20、B 为 20 x 5 时:
$ ./GenDataMP2 10 20 5
就会产生A, B, C,其中C为输出矩阵C = A x B.
$ optirun ./MP2 matA.txt matB.txt matC.txt
[Generic] 0.0003601920 Importing data and creating memory on host
Trace main::55 The dimensions of A are 10 x 20
Trace main::56 The dimensions of B are 20 x 5
[GPU ] 0.0803773440 Allocating GPU memory.
[GPU ] 0.0000258560 Copying input memory to the GPU.
[Compute] 0.0000407040 Performing CUDA computation
[Copy ] 0.0000179200 Copying output memory to the CPU
[GPU ] 0.0001228800 Freeing GPU Memory
All tests passed!
1. Block 和 Grid 的设定:矩阵的行数、列数不一定能被 TILE_WIDTH 整除,所以 Grid 的维度要向上取整,详见代码中 dimGrid 的计算。
2. 矩阵的行数和列数要区分清楚:测试数据中的矩阵不一定是方阵,也可能是长方形矩阵。
3. 此外,就是Row, Col的概念要清晰。