CUDA任意维度的矩阵相乘

最新推荐文章于 2021-06-08 14:32:46 发布

Augusdi

最新推荐文章于 2021-06-08 14:32:46 发布

阅读量4.6k

点赞数 1

分类专栏： CUDA

CUDA 专栏收录该内容

107 篇文章 56 订阅

订阅专栏

/*
 * Tiled matrix multiplication C = A * B for matrices of arbitrary size.
 *
 * Layout (row-major, as implied by the indexing below):
 *   A : hA x wA,   B : wA x wB,   C : hA x wB
 *
 * Launch requirements:
 *   - blockDim must be (BLOCK_DIM, BLOCK_DIM): the shared tiles are indexed
 *     directly with threadIdx, and rows/columns are derived using BLOCK_DIM.
 *   - gridDim.x must cover ceil(wB / BLOCK_DIM) tiles and gridDim.y must
 *     cover ceil(hA / BLOCK_DIM) tiles.
 *   - Uses two static BLOCK_DIM x BLOCK_DIM float tiles of shared memory.
 *
 * The sizes do not have to be multiples of BLOCK_DIM: tile entries that fall
 * outside A or B are zero-filled so they contribute nothing to the dot
 * product, and threads that map outside C skip the final store.
 *
 * Parameters:
 *   A, B   input matrices in device memory (only read here)
 *   C      output matrix in device memory
 *   hA     number of rows of A (and of C)
 *   wA     number of columns of A (and number of rows of B)
 *   wB     number of columns of B (and of C)
 */
__global__ void matrixMul( float* A, float* B, float* C, int hA,int wA, int wB)
{
    // Per-block staging tiles for the current BLOCK_DIM-wide slice of A and B.
    __shared__ float AS[BLOCK_DIM][BLOCK_DIM];
    __shared__ float BS[BLOCK_DIM][BLOCK_DIM];

    const int tx = threadIdx.x;
    const int ty = threadIdx.y;

    // Element of C this thread accumulates.
    const int row = (int)blockIdx.y * BLOCK_DIM + ty;
    const int col = (int)blockIdx.x * BLOCK_DIM + tx;

    float Csub = 0.0f;

    // Walk along the shared dimension (columns of A / rows of B) one tile at a time.
    for (int j = 0; j < wA; j += BLOCK_DIM)
    {
        const int aCol = j + tx;   // column of A loaded by this thread
        const int bRow = j + ty;   // row of B loaded by this thread

        // Linear offsets are computed in 64 bits so that matrices with more
        // than 2^31 elements do not overflow the index arithmetic.
        AS[ty][tx] = (row < hA && aCol < wA) ? A[(long long)row * wA + aCol] : 0.0f;
        BS[ty][tx] = (bRow < wA && col < wB) ? B[(long long)bRow * wB + col] : 0.0f;

        // Both tiles must be completely written before any thread reads them.
        __syncthreads();

        // Plain running sum of this tile's partial dot product
        // (no compensation term, i.e. this is NOT Kahan summation).
        #pragma unroll
        for (int k = 0; k < BLOCK_DIM; ++k)
        {
            Csub += AS[ty][k] * BS[k][tx];
        }

        // Every thread must be done with the tiles before the next iteration
        // overwrites them.
        __syncthreads();
    }

    // Threads that belong to padding beyond the edge of C have nothing to store.
    if (row < hA && col < wB)
    {
        C[(long long)row * wB + col] = Csub;
    }
}




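// ---- Host-side launch (fragment: src, pmatrix, t and the dev_* buffers are defined outside this excerpt) ----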



// Shapes for C = A * B as passed to matrixMul below:
//   A = dev_matrix       (hA x wA)
//   B = dev_pyra[t]      (wA x wB)
//   C = dev_pyrapro[t]   (hA x wB)
int wA = src.feat[t].size[2];                          // width of A == height of B
int wB = src.feat[t].size[0] * src.feat[t].size[1];    // width of B and of C
int hA = pmatrix.size[1];                              // height of A and of C

// One BLOCK_DIM x BLOCK_DIM thread block per output tile. The tile counts are
// rounded up so partial tiles on the right/bottom edges of C are still covered.
dim3 myblock(BLOCK_DIM, BLOCK_DIM);
dim3 mygrid((wB + BLOCK_DIM - 1) / BLOCK_DIM,
            (hA + BLOCK_DIM - 1) / BLOCK_DIM);

// NOTE(review): no cudaGetLastError() is visible after this launch; unless it
// is checked further down, a bad launch configuration would go unnoticed.
matrixMul<<<mygrid, myblock>>>(dev_matrix, dev_pyra[t], dev_pyrapro[t], hA, wA, wB);