最近在做计算机视觉方面的问题,其中涉及矩阵乘法,包括 A*B、A'*B 和 A*B' 三种情况。我们对代码稍微做了些改动,不知道如何用 GPU 加速,希望大神指导一下,谢谢啦!
/*
 * Grouped single-precision matrix multiply-accumulate, no transposes:
 *
 *     C_g(i, j) += ALPHA * sum_k A_g(i, k) * B_g(k, j)
 *
 * for i in [0, M), j in [0, N), k in [0, K) and g in [0, numGroup).
 *
 * All three operands are indexed column-major with an explicit leading
 * dimension: A_g(i, k) = A[aOff + k*lda + i], B_g(k, j) = B[bOff + j*ldb + k],
 * C_g(i, j) = C[cOff + j*ldc + i].
 *
 * Per-group element offsets:
 *     A: M * filtersVolume_step * g
 *     B: K * N * g
 *     C: M * N * g
 *
 * C is accumulated into, never cleared, so the caller must initialise it.
 * Nothing is touched when any of M, N, K or numGroup is <= 0.
 *
 * Loop counters are ptrdiff_t, matching the dimension parameters, so that
 * dimensions larger than INT_MAX cannot overflow the counter.
 */
void gemm_nn(ptrdiff_t M, ptrdiff_t N, ptrdiff_t K,
             float ALPHA,
             float const *A, ptrdiff_t lda,
             float const *B, ptrdiff_t ldb,
             float *C, ptrdiff_t ldc,
             ptrdiff_t numGroup, ptrdiff_t filtersVolume_step)
{
    for (ptrdiff_t g = 0; g < numGroup; ++g) {
        const ptrdiff_t aOff = M * filtersVolume_step * g;
        const ptrdiff_t bOff = K * N * g;
        const ptrdiff_t cOff = M * N * g;

        for (ptrdiff_t j = 0; j < N; ++j) {
            for (ptrdiff_t k = 0; k < K; ++k) {
                /* B_g(k, j) is invariant over i: scale it once per (j, k). */
                const float scaledB = ALPHA * B[bOff + j * ldb + k];

                /* Innermost loop walks column k of A and column j of C. */
                for (ptrdiff_t i = 0; i < M; ++i) {
                    C[cOff + j * ldc + i] += scaledB * A[aOff + k * lda + i];
                }
            }
        }
    }
}
/*
 * Grouped single-precision matrix multiply-accumulate with A transposed:
 *
 *     C_g(i, j) += ALPHA * sum_k A_g(k, i) * B_g(k, j)
 *
 * for i in [0, M), j in [0, N), k in [0, K) and g in [0, numGroup).
 *
 * All three operands are indexed column-major with an explicit leading
 * dimension: A_g(k, i) = A[aOff + i*lda + k], B_g(k, j) = B[bOff + j*ldb + k],
 * C_g(i, j) = C[cOff + j*ldc + i].
 *
 * Per-group element offsets:
 *     A: K * filtersVolume_step * g
 *     B: K * N * g
 *     C: M * N * g
 *
 * C is accumulated into, never cleared, so the caller must initialise it.
 * When K <= 0 (and M, N, numGroup > 0) each C element has ALPHA * 0 added.
 *
 * Loop counters are ptrdiff_t, matching the dimension parameters, so that
 * dimensions larger than INT_MAX cannot overflow the counter.
 */
void gemm_tn(ptrdiff_t M, ptrdiff_t N, ptrdiff_t K,
             float ALPHA,
             float const *A, ptrdiff_t lda,
             float const *B, ptrdiff_t ldb,
             float *C, ptrdiff_t ldc,
             ptrdiff_t numGroup, ptrdiff_t filtersVolume_step)
{
    for (ptrdiff_t g = 0; g < numGroup; ++g) {
        const ptrdiff_t aOff = K * filtersVolume_step * g;
        const ptrdiff_t bOff = K * N * g;
        const ptrdiff_t cOff = M * N * g;

        for (ptrdiff_t j = 0; j < N; ++j) {
            for (ptrdiff_t i = 0; i < M; ++i) {
                /* Dot product of column i of A with column j of B. */
                float dot = 0;
                for (ptrdiff_t k = 0; k < K; ++k) {
                    dot += A[aOff + i * lda + k] * B[bOff + j * ldb + k];
                }
                /* ALPHA is applied once to the finished sum. */
                C[cOff + j * ldc + i] += ALPHA * dot;
            }
        }
    }
}
/*
 * Grouped single-precision matrix multiply-accumulate with B transposed:
 *
 *     C_g(i, j) += ALPHA * sum_k A_g(i, k) * B_g(j, k)
 *
 * for i in [0, M), j in [0, N), k in [0, K) and g in [0, numGroup).
 *
 * All three operands are indexed column-major with an explicit leading
 * dimension: A_g(i, k) = A[aOff + k*lda + i], B_g(j, k) = B[bOff + k*ldb + j],
 * C_g(i, j) = C[cOff + j*ldc + i].
 *
 * Per-group element offsets (note that here filtersVolume_step applies to
 * the output C, not to A):
 *     A: M * K * g
 *     B: N * K * g
 *     C: M * filtersVolume_step * g
 *
 * C is accumulated into, never cleared, so the caller must initialise it.
 * Nothing is touched when any of M, N, K or numGroup is <= 0.
 *
 * Loop counters are ptrdiff_t, matching the dimension parameters, so that
 * dimensions larger than INT_MAX cannot overflow the counter.
 */
void gemm_nt(ptrdiff_t M, ptrdiff_t N, ptrdiff_t K,
             float ALPHA,
             float const *A, ptrdiff_t lda,
             float const *B, ptrdiff_t ldb,
             float *C, ptrdiff_t ldc,
             ptrdiff_t numGroup, ptrdiff_t filtersVolume_step)
{
    for (ptrdiff_t g = 0; g < numGroup; ++g) {
        const ptrdiff_t aOff = M * K * g;
        const ptrdiff_t bOff = N * K * g;
        const ptrdiff_t cOff = M * filtersVolume_step * g;

        for (ptrdiff_t j = 0; j < N; ++j) {
            for (ptrdiff_t k = 0; k < K; ++k) {
                /* B_g(j, k) is invariant over i: scale it once per (j, k). */
                const float scaledB = ALPHA * B[bOff + k * ldb + j];

                /* Innermost loop walks column k of A and column j of C. */
                for (ptrdiff_t i = 0; i < M; ++i) {
                    C[cOff + j * ldc + i] += scaledB * A[aOff + k * lda + i];
                }
            }
        }
    }
}