Computing Matrix Multiplication with CUDA and a Tiled Algorithm

Many tiled matrix-multiplication implementations shared online have a subtle bug in the kernel: they produce correct results only when the matrix dimensions are exact multiples of the tile size. For matrices that are not, they fail to zero-fill the unused remainder of the shared-memory tiles (and to bounds-check the loads), so the partial tiles at the edges contain stale or out-of-range data and the results are wrong. The kernel below, together with its comments, shows how to handle this correctly.
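For contrast, the problematic tile load found in many of those implementations looks like the following (a minimal sketch; the variable names match the kernel below):

// Buggy: no bounds check and no zero fill. For the partial tiles at the
// right/bottom edges of a matrix whose size is not a multiple of T, the
// out-of-range threads load garbage into As/Bs, which then corrupts every
// dot product computed from those tiles.
As[ty][tx] = A[row * wA + i * T + tx];
Bs[ty][tx] = B[(i * T + ty) * wB + col];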

#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>

template <int T> __global__ void MatrixMulCUDA(float *C, float *A,float *B,int wA,int hA,int wB,int hB) {
    int tx = threadIdx.x;
    int ty = threadIdx.y;
	// Element of C this thread is responsible for
	int row = blockIdx.y * T + ty;
	int col = blockIdx.x * T + tx;

    float c = 0;
	__shared__ float As[T][T];
	__shared__ float Bs[T][T];

	// Each block walks across one row of tiles of A and one column of tiles of B.
	// Each thread accumulates one dot product, and in each iteration also loads
	// one element of the current tile pair into shared memory.
	for (int i = 0; i < (wA + T - 1) / T; i++) {

		// Every thread loads one element of each tile; after the barrier below,
		// As and Bs are fully populated for the whole block
		if (row < hA && i * T + tx < wA)
			As[ty][tx] = A[row*wA+i*T+tx];
		else
			As[ty][tx] = 0;	// Zero-fill the remainder so partial edge tiles compute correctly

		if (col < wB && i * T + ty < hB)
			Bs[ty][tx] = B[(i * T + ty) * wB + col];
		else
			Bs[ty][tx] = 0;

		// Synchronize the threads of the block so that both shared-memory tiles
		// are fully loaded before any thread starts reading them
		__syncthreads();

		// The outer loop walks over tiles; this inner loop accumulates the
		// partial dot product of one row of As with one column of Bs
#pragma unroll
		for (int j = 0; j < T; ++j)
			c += As[ty][j] * Bs[j][tx];
		__syncthreads();	// Don't load the next tiles until all threads are done with these
	}
	// Guard the store: threads outside C (in the edge blocks) must not write
	if (row < hA && col < wB)
		C[row*wB+col] = c;

}

// Fills the buffer with random values in [0, 100)
void ConstantInit(float *data, int size) {
    for (int i = 0; i < size; ++i) {
        data[i] = rand() % 100;
    }
}


int MatrixMultiply(int block_size, const dim3 &dimsA, const dim3 &dimsB) {
    unsigned int size_A = dimsA.x * dimsA.y;
    unsigned int mem_size_A = sizeof(float) * size_A;
    float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
    unsigned int size_B = dimsB.x * dimsB.y;
    unsigned int mem_size_B = sizeof(float) * size_B;
    float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));

	dim3 dimsC(dimsB.x, dimsA.y, 1);
	unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
	float* h_C = reinterpret_cast<float*>(malloc(mem_size_C));
    // Check all three host allocations before touching the buffers
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "Failed to allocate host matrices\n");
        exit(EXIT_FAILURE);
    }
	ConstantInit(h_A, size_A);
	ConstantInit(h_B, size_B);

    float *d_A, *d_B, *d_C;

	// Allocate device memory
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
    checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));

	// Create a stream object; the copies and kernel launches below are ordered and synchronized on it
	cudaStream_t stream;
	checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));

    checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
    checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));

	// Launch the computation
    dim3 threads(block_size, block_size);
    dim3 grid((dimsB.x + threads.x - 1) / threads.x,
              (dimsA.y + threads.y - 1) / threads.y);

    // The kernel declares its shared memory statically, so the dynamic
    // shared-memory size in the launch configuration is 0
    if (block_size == 16)
        MatrixMulCUDA<16> <<<grid, threads, 0, stream>>>
			(d_C, d_A, d_B, dimsA.x, dimsA.y, dimsB.x, dimsB.y);
    else
        MatrixMulCUDA<32> <<<grid, threads, 0, stream>>>
			(d_C, d_A, d_B, dimsA.x, dimsA.y, dimsB.x, dimsB.y);
    
    checkCudaErrors(cudaStreamSynchronize(stream));	// Wait for the copies and the kernel to finish

	// Copy the result back to the host
	checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
	checkCudaErrors(cudaStreamSynchronize(stream));

	for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++)
		printf("%.0f,", h_C[i]);

	// Performance measurement
	cudaEvent_t start, stop;	// Events used to time the kernel launches
	int nIter = 300;
	checkCudaErrors(cudaEventCreate(&start));
	checkCudaErrors(cudaEventCreate(&stop));
	checkCudaErrors(cudaEventRecord(start, stream));
    for (int j = 0; j < nIter; j++) {
        if (block_size == 16) {
            MatrixMulCUDA<16> <<<grid, threads, 0, stream>>>(d_C, d_A, d_B,
				dimsA.x, dimsA.y, dimsB.x, dimsB.y);
        } else {
            MatrixMulCUDA<32> <<<grid, threads, 0, stream>>>(d_C, d_A, d_B,
				dimsA.x, dimsA.y, dimsB.x, dimsB.y);
        }
    }
    checkCudaErrors(cudaEventRecord(stop, stream));
    checkCudaErrors(cudaEventSynchronize(stop));

    float msecTotal = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));

    float msecPerMatrixMul = msecTotal / nIter;
    double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x)*static_cast<double>(dimsA.y)*static_cast<double>(dimsB.x);
    double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
    printf("\n浮点计算性能:%.2fGFlop/s, 单趟耗时: %.3fms\n",gigaFlops,msecPerMatrixMul);

	// Cleanup
    free(h_A);
    free(h_B);
    free(h_C);
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
	checkCudaErrors(cudaStreamDestroy(stream));

    return EXIT_SUCCESS;
}


int main(int argc, char **argv) {
    // This will pick the best possible CUDA capable device, otherwise
    // override the device ID based on input provided at the command line
    int dev = findCudaDevice(argc, (const char **)argv);

    int block_size = 16;

    // Dimensions are deliberately NOT multiples of the tile size (16),
    // to exercise the edge-tile handling in the kernel
    dim3 dimsA(50 * block_size + 4, 3 * block_size - 1, 1);
    dim3 dimsB(3 * block_size + 5, 50 * block_size + 4, 1);

    printf("A(%u*%u), B(%u*%u)\n", dimsA.y, dimsA.x, dimsB.y, dimsB.x);
    MatrixMultiply(block_size, dimsA, dimsB);

	return 0;
}
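The sample prints C but never verifies it. One way to check the kernel, including its edge-tile handling, is to compare against a naive host-side reference. Below is a minimal sketch; VerifyOnHost is a hypothetical helper (not part of the original code) that could be called from MatrixMultiply after h_C has been copied back, and it needs <math.h> for fabs:

// Naive O(hA * wA * wB) host-side reference check (hypothetical helper,
// shown only to illustrate validation; requires <math.h>)
bool VerifyOnHost(const float *A, const float *B, const float *C,
                  int wA, int hA, int wB) {
    for (int row = 0; row < hA; ++row) {
        for (int col = 0; col < wB; ++col) {
            double ref = 0.0;
            for (int k = 0; k < wA; ++k)
                ref += static_cast<double>(A[row * wA + k]) * B[k * wB + col];
            // Loose tolerance: float accumulation on the GPU need not match
            // double accumulation on the CPU bit-for-bit
            if (fabs(C[row * wB + col] - ref) > 1e-3 * fabs(ref) + 1e-3)
                return false;
        }
    }
    return true;
}

With the integer-valued inputs produced by ConstantInit (0..99) and wA around 800, every partial sum stays below 2^24, so the float results here happen to be exact; the tolerance simply keeps the check robust if the initialization changes.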