MP2的目标:Implement a basic dense matrix multiplication routine.
主要会考察以下几个方面,在代码"//@@"下面添加你自己的代码。
allocate device memory
copy host memory to device
initialize thread block and kernel grid dimensions
invoke CUDA kernel
copy results from device to host
deallocate device memory
具体的算法讲解可以看该课的lecture 3 video.
源码如下:
// MP 2: Due Sunday, Dec 16, 2012 at 11:59 p.m. PST
#include <wb.h>
// Error-checking wrapper for CUDA runtime calls: on failure, logs the
// failing statement together with the runtime's human-readable error
// string, then aborts main() with -1.
#define wbCheck(stmt) do { \
    cudaError_t err = stmt; \
    if (err != cudaSuccess) { \
        wbLog(ERROR, "Failed to run stmt ", #stmt, \
              " : ", cudaGetErrorString(err)); \
        return -1; \
    } \
} while(0)
// Compute C = A * B
// Compute C = A * B for row-major dense matrices, one thread per output
// element. Expected launch geometry: a 2D grid of 2D blocks where the x
// dimension indexes C's columns and the y dimension indexes C's rows.
// A is numARows x numAColumns, B is numBRows x numBColumns, and C must be
// numCRows x numCColumns with numCRows == numARows and
// numCColumns == numBColumns.
__global__ void matrixMultiply(const float * __restrict__ A,
                               const float * __restrict__ B,
                               float * __restrict__ C,
                               int numARows, int numAColumns,
                               int numBRows, int numBColumns,
                               int numCRows, int numCColumns) {
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    // Guard against threads past the matrix edge (the grid is rounded up
    // to whole blocks) and against inconsistent input dimensions.
    if (numAColumns != numBRows || Row >= numCRows || Col >= numCColumns)
        return;
    // Dot product of A's Row-th row with B's Col-th column.
    float Pvalue = 0.0f;
    for (int k = 0; k < numAColumns; ++k)
        Pvalue += A[Row * numAColumns + k] * B[k * numBColumns + Col];
    C[Row * numCColumns + Col] = Pvalue;
}
// Host driver: imports A and B, allocates device buffers, runs the
// matrixMultiply kernel, copies C back, and hands it to the grader.
// Every CUDA runtime call is checked through wbCheck so failures are
// reported instead of silently producing a wrong C.
int main(int argc, char ** argv) {
    wbArg_t args;
    float * hostA; // The A matrix
    float * hostB; // The B matrix
    float * hostC; // The output C matrix
    float * deviceA;
    float * deviceB;
    float * deviceC;
    int numARows;    // number of rows in the matrix A
    int numAColumns; // number of columns in the matrix A
    int numBRows;    // number of rows in the matrix B
    int numBColumns; // number of columns in the matrix B
    int numCRows;    // number of rows in the matrix C (you have to set this)
    int numCColumns; // number of columns in the matrix C (you have to set this)
    args = wbArg_read(argc, argv);
    wbTime_start(Generic, "Importing data and creating memory on host");
    hostA = (float *) wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
    hostB = (float *) wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
    //@@ Set numCRows and numCColumns
    // C = A * B, so C inherits A's row count and B's column count.
    numCRows = numARows;
    numCColumns = numBColumns;
    //@@ Allocate the hostC matrix
    hostC = (float *) malloc(numCRows * numCColumns * sizeof(float));
    wbTime_stop(Generic, "Importing data and creating memory on host");
    wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
    wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
    wbTime_start(GPU, "Allocating GPU memory.");
    //@@ Allocate GPU memory here
    wbCheck(cudaMalloc((void **)&deviceA, numARows * numAColumns * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceB, numBRows * numBColumns * sizeof(float)));
    wbCheck(cudaMalloc((void **)&deviceC, numCRows * numCColumns * sizeof(float)));
    wbTime_stop(GPU, "Allocating GPU memory.");
    wbTime_start(GPU, "Copying input memory to the GPU.");
    //@@ Copy memory to the GPU here
    wbCheck(cudaMemcpy(deviceA, hostA, numARows * numAColumns * sizeof(float), cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceB, hostB, numBRows * numBColumns * sizeof(float), cudaMemcpyHostToDevice));
    wbTime_stop(GPU, "Copying input memory to the GPU.");
    //@@ Initialize the grid and block dimensions here
    // Ceiling division: the matrices need not be multiples of TILE_WIDTH,
    // so round the grid up and let the kernel's bounds check trim the edge.
    const int TILE_WIDTH = 8;
    dim3 dimGrid((numCColumns + TILE_WIDTH - 1) / TILE_WIDTH,
                 (numCRows    + TILE_WIDTH - 1) / TILE_WIDTH, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    wbTime_start(Compute, "Performing CUDA computation");
    //@@ Launch the GPU Kernel here
    matrixMultiply<<<dimGrid, dimBlock>>>(deviceA, deviceB, deviceC,
                                          numARows, numAColumns,
                                          numBRows, numBColumns,
                                          numCRows, numCColumns);
    // Launch-configuration errors only surface via cudaGetLastError;
    // execution errors surface at the synchronization point.
    wbCheck(cudaGetLastError());
    wbCheck(cudaDeviceSynchronize()); // cudaThreadSynchronize is deprecated
    wbTime_stop(Compute, "Performing CUDA computation");
    wbTime_start(Copy, "Copying output memory to the CPU");
    //@@ Copy the GPU memory back to the CPU here
    wbCheck(cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(float), cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying output memory to the CPU");
    wbTime_start(GPU, "Freeing GPU Memory");
    //@@ Free the GPU memory here
    wbCheck(cudaFree(deviceA));
    wbCheck(cudaFree(deviceB));
    wbCheck(cudaFree(deviceC));
    wbTime_stop(GPU, "Freeing GPU Memory");
    wbSolution(args, hostC, numCRows, numCColumns);
    free(hostA);
    free(hostB);
    free(hostC);
    return 0;
}
之前的博文中《Hetergeneous Parallel Programming编程作业之MP0: Initial Lab Tour with Device Query》,所需的源文件:https://github.com/ashwin/coursera-heterogeneous. 用src/GenDataMP2.cpp生成可执行文件程序GenDataMP2,然后用它生成矩阵A, B, C. 这里 C = A x B. 如输入矩阵A为10 x 20,B为20 x 5
执行:
$ ./GenDataMP2 10 20 5
就会产生A, B, C,其中C为输出矩阵C = A x B.
之后我们执行MP2的程序:
$ optirun ./MP2 matA.txt matB.txt matC.txt
运行结果:
[Generic] 0.0003601920 Importing data and creating memory on host
Trace main::55 The dimensions of A are 10 x 20
Trace main::56 The dimensions of B are 20 x 5
[GPU ] 0.0803773440 Allocating GPU memory.
[GPU ] 0.0000258560 Copying input memory to the GPU.
[Compute] 0.0000407040 Performing CUDA computation
[Copy ] 0.0000179200 Copying output memory to the CPU
[GPU ] 0.0001228800 Freeing GPU Memory
All tests passed!
本地测试完成后,可以将代码cp到在线提交系统进行测试,一共有6个数据集,测试通过,完成。
可能遇到的问题:
这个程序我反复测试了N回,才把所有6个数据集过了,有几个难点:
1. Block, Grid的设定, 因为矩阵维度可能不是正好被TILE_WIDTH整除的,所以网格大小要向上取整,详见代码Line 75.
2. 就是矩阵的长宽定义,因为测试数据的矩阵并不一定是正方形的,也可能是长方形的。
3. 此外,就是Row, Col的概念要清晰。