有很多网友分享的瓦片算法实现中,线程函数的代码有些问题(例如只对整倍块的矩阵能计算出正确结果,而对非整倍块的矩阵没有考虑到为残余缓存清零,导致计算结果错误),请看本例中的线程函数及相应注释
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
#include <helper_functions.h>
#include <helper_cuda.h>
template <int T> __global__ void MatrixMulCUDA(float *C, float *A,float *B,int wA,int hA,int wB,int hB) {
int tx = threadIdx.x;
int ty = threadIdx.y;
//当前线程对应的矩阵C的元素位置
int row = blockIdx.y * T + ty;
int col = blockIdx.x * T + tx;
float c = 0;
__shared__ float As[T][T];
__shared__ float Bs[T][T];
//每个Block都将遍历A的一整行块和B的一整列块
//每个线程主要负责一行和一列的内积,另外还负责为当前循环中所计算的块填充一个元素到共享内存中
for (int i = 0; i < ceil(1.0*wA/T);i++) {
if (row < hA && i * T + tx < wA)
As[ty][tx] = A[row*wA+i*T+tx];//所有计算单元同时加载,所以下面的for循环中As和Bs都已配置完成
else
As[ty][tx] = 0;
if (col < wB && i * T + ty < hB)
Bs[ty][tx] = B[(i * T + ty) * wB + col];
else
Bs[ty][tx] = 0;
//让同一块中的不同线程指令流同步,保证共享内存中矩阵块的元素全部加载
__syncthreads();//各线程执行到此函数时等待,直到全部线程同步
#pragma unroll
//虽然外层循环是面向Block的,但这里内层循环只计算了两块中某行和某列的
for (int j = 0; j < T; ++j)
c += As[ty][j] * Bs[j][tx];
__syncthreads();
}
if(row < hA && col < wB)
C[row*wB+col] = c;
}
void ConstantInit(float *data, int size) {
for (int i = 0; i < size; ++i) {
data[i] = rand() % 100;
//printf("%.0f,", data[i]);
}
//printf("\n");
}
int MatrixMultiply(int block_size, const dim3 &dimsA,const dim3 &dimsB) {
unsigned int size_A = dimsA.x * dimsA.y;
unsigned int mem_size_A = sizeof(float) * size_A;
float *h_A = reinterpret_cast<float *>(malloc(mem_size_A));
unsigned int size_B = dimsB.x * dimsB.y;
unsigned int mem_size_B = sizeof(float) * size_B;
float *h_B = reinterpret_cast<float *>(malloc(mem_size_B));
ConstantInit(h_A, size_A);
ConstantInit(h_B, size_B);
dim3 dimsC(dimsB.x, dimsA.y, 1);
unsigned int mem_size_C = dimsC.x * dimsC.y * sizeof(float);
float* h_C = reinterpret_cast<float*>(malloc(mem_size_C));
if (h_C == NULL) {
fprintf(stderr, "Failed to allocate host matrix C\n");
exit(EXIT_FAILURE);
}
float *d_A, *d_B, *d_C;
//在显存中分配存储
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_A), mem_size_A));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_B), mem_size_B));
checkCudaErrors(cudaMalloc(reinterpret_cast<void **>(&d_C), mem_size_C));
//创建流对象,用于任务级(Grid)同步
cudaStream_t stream;
checkCudaErrors(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
checkCudaErrors(cudaMemcpyAsync(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice, stream));
checkCudaErrors(cudaMemcpyAsync(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice, stream));
//启动计算
dim3 threads(block_size, block_size);
dim3 grid(ceil(1.0*dimsB.x / threads.x), ceil(1.0 * dimsA.y / threads.y));
if (block_size == 16)
MatrixMulCUDA<16> <<< grid, threads,16*16, stream>>>
(d_C, d_A, d_B,dimsA.x, dimsA.y, dimsB.x, dimsB.y);
else
MatrixMulCUDA<32> <<< grid, threads,32*32, stream>>>
(d_C, d_A, d_B,dimsA.x, dimsA.y, dimsB.x, dimsB.y);
checkCudaErrors(cudaStreamSynchronize(stream));//同步stream上的线程
//获取计算结果
checkCudaErrors(cudaMemcpyAsync(h_C, d_C, mem_size_C, cudaMemcpyDeviceToHost, stream));
checkCudaErrors(cudaStreamSynchronize(stream));
for (int i = 0; i < static_cast<int>(dimsC.x * dimsC.y); i++)
printf("%.0f,", h_C[i]);
//计算性能测试
cudaEvent_t start, stop;//创建事件对象,用于记录计算时间
int nIter = 300;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
checkCudaErrors(cudaEventRecord(start, stream));
for (int j = 0; j < nIter; j++) {
if (block_size == 16) {
MatrixMulCUDA<16> <<<grid, threads, 16 * 16, stream>>>(d_C, d_A, d_B,
dimsA.x, dimsA.y, dimsB.x, dimsB.y);
} else {
MatrixMulCUDA<32> <<<grid, threads, 32 * 32, stream>>>(d_C, d_A, d_B,
dimsA.x, dimsA.y, dimsB.x, dimsB.y);
}
}
checkCudaErrors(cudaEventRecord(stop, stream));
checkCudaErrors(cudaEventSynchronize(stop));
float msecTotal = 0.0f;
checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop));
float msecPerMatrixMul = msecTotal / nIter;
double flopsPerMatrixMul = 2.0 * static_cast<double>(dimsA.x)*static_cast<double>(dimsA.y)*static_cast<double>(dimsB.x);
double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f);
printf("\n浮点计算性能:%.2fGFlop/s, 单趟耗时: %.3fms\n",gigaFlops,msecPerMatrixMul);
//清理
free(h_A);
free(h_B);
free(h_C);
checkCudaErrors(cudaFree(d_A));
checkCudaErrors(cudaFree(d_B));
checkCudaErrors(cudaFree(d_C));
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
checkCudaErrors(cudaStreamDestroy(stream));
return EXIT_SUCCESS;
}
int main(int argc, char **argv) {
// This will pick the best possible CUDA capable device, otherwise
// override the device ID based on input provided at the command line
int dev = findCudaDevice(argc, (const char **)argv);
int block_size = 16;
dim3 dimsA(50 * block_size+4, 3 * block_size-1, 1);
dim3 dimsB(3 * block_size+5, 50 * block_size+4, 1);
printf("A(%d*%d), B(%d*%d)\n", dimsA.y, dimsA.x,dimsB.y, dimsB.x);
MatrixMultiply(block_size, dimsA, dimsB);
return 0;
}