这几天研究了
一下CUDA,发现其并行的思想和普通的CPU多线程思想不太一致,但还是挺不错。主要是将任务划分成一个个block,然后每个block里面再划分成细的线程。然后每个线程做自己做的
事情。这种并行思想很适用于像矩阵运算这些元素与元素之间的运算并不耦合得很厉害,但整体数据很大的情况,这只是我对CUDA的初步感觉。
矩阵相乘的CPU程序如下:
矩阵相乘的CPU程序如下:
//
C = A*B
void MatrixMulCPU( float * _C, const float * _A, const float * _B, int _wa, int _ha, int _wb)
{
float sum = 0 ;
for ( int i = 0 ; i < _ha; ++ i)
{
for ( int j = 0 ; j < _wb; ++ j)
{
sum = 0 ;
for ( int k = 0 ; k < _wa; ++ k)
{
sum += ( float )_A[i * _wa + k] * ( float )_B[k * _wb + j];
}
_C[i * _wb + j] = ( float )sum;
}
}
}
void MatrixMulCPU( float * _C, const float * _A, const float * _B, int _wa, int _ha, int _wb)
{
float sum = 0 ;
for ( int i = 0 ; i < _ha; ++ i)
{
for ( int j = 0 ; j < _wb; ++ j)
{
sum = 0 ;
for ( int k = 0 ; k < _wa; ++ k)
{
sum += ( float )_A[i * _wa + k] * ( float )_B[k * _wb + j];
}
_C[i * _wb + j] = ( float )sum;
}
}
}
从上面可以看出,C(i,j) = sum { A(i,k)*B(k,j) } 0<=k < _wa;耦合程度很小,所以我们可以通过划分区域的方法,让每个线程负责一个区域。
怎么划分呢?首先最初的想法是让每一个线程计算一个C(i,j),那么估算一下,应该需要height_c*width_c,也就是ha*wb个线程。进一步,我们将矩阵按一个大方格Grid划分,如果一个
方格Grid大小是16*16,那么矩阵80*48的可以表示为5(*16) * 3(*16),即16*16个大格子(block),每一个格子内,自然就是(height_c/16) *(width_c/16)个线程了。
好了,划分完后,内核代码如下:
计算版本0:
怎么划分呢?首先最初的想法是让每一个线程计算一个C(i,j),那么估算一下,应该需要height_c*width_c,也就是ha*wb个线程。进一步,我们将矩阵按一个大方格Grid划分,如果一个
方格Grid大小是16*16,那么矩阵80*48的可以表示为5(*16) * 3(*16),即16*16个大格子(block),每一个格子内,自然就是(height_c/16) *(width_c/16)个线程了。
好了,划分完后,内核代码如下:
__global__
void
matrix_kernel_0(
float
*
_C,
const
float
*
_A,
const
float
*
_B,
int
_wa,
int
_wb)
{
float sum = 0 ;
// 找出该线程所在的行列
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// 线程Thread(row,col)负责计算C(row,col)
for ( int i = 0 ; i < _wa; ++ i)
{
sum += _A[row * _wa + i] * _B[i * _wb + col];
}
_C[row * _wb + col] = sum;
}
{
float sum = 0 ;
// 找出该线程所在的行列
int row = blockIdx.y * blockDim.y + threadIdx.y;
int col = blockIdx.x * blockDim.x + threadIdx.x;
// 线程Thread(row,col)负责计算C(row,col)
for ( int i = 0 ; i < _wa; ++ i)
{
sum += _A[row * _wa + i] * _B[i * _wb + col];
}
_C[row * _wb + col] = sum;
}
另外一种思路,我们不让每一个线程完整计算一个C(i,j),通过C(i,j) = sum { A(i,k)*B(k,j) }发现,我们还可以再细度划分:
Csub(i,j) = sum{A(i,ksub+offsetA)*B(ksub+offsetB,j)} 0<=ksub < blockSize
C(i,j) = sum{Csub(i,j)}
就是把矩阵分成n*n个大的子块,然后每一个block负责计算子块i 和 子块j的子乘积,计算完毕后加起来则可。这里主要使用了共享显存作优化。
Csub(i,j) = sum{A(i,ksub+offsetA)*B(ksub+offsetB,j)} 0<=ksub < blockSize
C(i,j) = sum{Csub(i,j)}
就是把矩阵分成n*n个大的子块,然后每一个block负责计算子块i 和 子块j的子乘积,计算完毕后加起来则可。这里主要使用了共享显存作优化。
计算版本1:
__global__
void
matrix_kernel_1(
float
*
_C,
const
float
*
_A,
const
float
*
_B,
int
_wa,
int
_wb)
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
// 该block要处理的A
int aBegin = _wa * (by * BLOCK_SIZE); // A(0,by)
int aEnd = aBegin + _wa - 1 ;
int aStep = BLOCK_SIZE; // offsetA
int bBegin = BLOCK_SIZE * bx; // B(bx,0)
int bStep = BLOCK_SIZE * _wb; // offsetB
float cSub = 0 ;
for ( int a = aBegin,b = bBegin; a <= aEnd; a += aStep,b += bStep)
{
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// 每个线程负责一个元素拷贝
As[ty][tx] = _A[a + _wa * ty + tx];
Bs[ty][tx] = _B[b + _wb * ty + tx];
__syncthreads();
// 每个线程负责计算一个子块i 和 子块j的子乘积
for ( int k = 0 ; k < BLOCK_SIZE; ++ k)
{
cSub += As[ty][k] * Bs[k][tx];
}
__syncthreads();
}
// 全局地址,向全局寄存器写回去
// 一个线程负责一个元素,一个block负责一个子块
int cIndex = (by * BLOCK_SIZE + ty) * _wb + (bx * BLOCK_SIZE + tx);
_C[cIndex] = cSub;
}
{
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
// 该block要处理的A
int aBegin = _wa * (by * BLOCK_SIZE); // A(0,by)
int aEnd = aBegin + _wa - 1 ;
int aStep = BLOCK_SIZE; // offsetA
int bBegin = BLOCK_SIZE * bx; // B(bx,0)
int bStep = BLOCK_SIZE * _wb; // offsetB
float cSub = 0 ;
for ( int a = aBegin,b = bBegin; a <= aEnd; a += aStep,b += bStep)
{
__shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
__shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];
// 每个线程负责一个元素拷贝
As[ty][tx] = _A[a + _wa * ty + tx];
Bs[ty][tx] = _B[b + _wb * ty + tx];
__syncthreads();
// 每个线程负责计算一个子块i 和 子块j的子乘积
for ( int k = 0 ; k < BLOCK_SIZE; ++ k)
{
cSub += As[ty][k] * Bs[k][tx];
}
__syncthreads();
}
// 全局地址,向全局寄存器写回去
// 一个线程负责一个元素,一个block负责一个子块
int cIndex = (by * BLOCK_SIZE + ty) * _wb + (bx * BLOCK_SIZE + tx);
_C[cIndex] = cSub;
}
最后写一个面向Host的接口函数:
void
matrixMulGPU(
float
*
_C,
const
float
*
_A,
const
float
*
_B,
int
_wa,
int
_ha,
int
_wb)
{
float * d_a = myNewOnGPU < float > (_wa * _ha);
float * d_b = myNewOnGPU < float > (_wb * _wa);
float * d_c = myNewOnGPU < float > (_wb * _ha);
copyFromCPUToGPU(_A,d_a,_wa * _ha);
copyFromCPUToGPU(_B,d_b,_wb * _wa);
dim3 threads(BLOCK_SIZE,BLOCK_SIZE);
dim3 blocks(WC / BLOCK_SIZE,HC / BLOCK_SIZE);
matrix_kernel_0 <<< blocks,threads >>> (d_c,d_a,d_b,_wa,_wb);
cudaThreadSynchronize();
copyFromGPUToCPU(d_c,_C,_wb * _ha);
myDeleteOnGPU(d_a);
myDeleteOnGPU(d_b);
myDeleteOnGPU(d_c);
}
{
float * d_a = myNewOnGPU < float > (_wa * _ha);
float * d_b = myNewOnGPU < float > (_wb * _wa);
float * d_c = myNewOnGPU < float > (_wb * _ha);
copyFromCPUToGPU(_A,d_a,_wa * _ha);
copyFromCPUToGPU(_B,d_b,_wb * _wa);
dim3 threads(BLOCK_SIZE,BLOCK_SIZE);
dim3 blocks(WC / BLOCK_SIZE,HC / BLOCK_SIZE);
matrix_kernel_0 <<< blocks,threads >>> (d_c,d_a,d_b,_wa,_wb);
cudaThreadSynchronize();
copyFromGPUToCPU(d_c,_C,_wb * _ha);
myDeleteOnGPU(d_a);
myDeleteOnGPU(d_b);
myDeleteOnGPU(d_c);
}
调用的主函数如下:
#include
<
stdio.h
>
#include < cuda_runtime.h >
#include < cutil.h >
#include < cutil_inline.h >
#include < stdlib.h >
#include < time.h >
#include < math.h >
#include < string .h >
#include < Windows.h >
#include " CUDACommon.h "
#include " MatrixMulCPU.h "
#include " MatrixMulGPU.h "
void randomInit( float * _data, int _size)
{
for ( int i = 0 ; i < _size; ++ i)
{
_data[i] = rand() / ( float )RAND_MAX;
}
}
bool checkError( const float * _A, const float * _B, int _size)
{
for ( int i = 0 ; i < _size; ++ i)
{
if (fabs(_A[i] - _B[i]) > 1.0e-3 )
{
printf( " %f \t %f\n " ,_A[i],_B[i]);
return false ;
}
}
return true ;
}
int main( int argc, char * argv[])
{
srand( 13 );
if ( ! InitCUDA()) {
return 0 ;
}
float * A = myNewOnCPU < float > (WA * HA);
float * B = myNewOnCPU < float > (WB * HB);
randomInit(A,WA * HA);
randomInit(B,WB * HB);
float * C = myNewOnCPU < float > (WC * HC);
memset(C, 0 , sizeof ( float ) * WC * HC);
float * C2 = myNewOnCPU < float > (WC * HC);
memset(C2, 0 , sizeof ( float ) * WC * HC);
unsigned int tick1 = GetTickCount();
MatrixMulCPU(C2,A,B,WA,HA,WB);
printf( " CPU use Time : %dms\n " ,GetTickCount() - tick1);
unsigned int timer = 0 ;
cutilCheckError(cutCreateTimer( & timer));
cutilCheckError(cutStartTimer(timer));
{
matrixMulGPU(C,A,B,WA,HA,WB);
}
cutilCheckError(cutStopTimer(timer));
printf( " GPU use time: %f (ms) \n " , cutGetTimerValue(timer));
cutilCheckError(cutDeleteTimer(timer));
if (checkError(C,C2,WC * HC))
{
printf( " Accept\n " );
}
else
{
printf( " Worng Answer\n " );
}
myDeleteOnCPU(A);
myDeleteOnCPU(B);
myDeleteOnCPU(C);
myDeleteOnCPU(C2);
return 0 ;
}
#include < cuda_runtime.h >
#include < cutil.h >
#include < cutil_inline.h >
#include < stdlib.h >
#include < time.h >
#include < math.h >
#include < string .h >
#include < Windows.h >
#include " CUDACommon.h "
#include " MatrixMulCPU.h "
#include " MatrixMulGPU.h "
void randomInit( float * _data, int _size)
{
for ( int i = 0 ; i < _size; ++ i)
{
_data[i] = rand() / ( float )RAND_MAX;
}
}
bool checkError( const float * _A, const float * _B, int _size)
{
for ( int i = 0 ; i < _size; ++ i)
{
if (fabs(_A[i] - _B[i]) > 1.0e-3 )
{
printf( " %f \t %f\n " ,_A[i],_B[i]);
return false ;
}
}
return true ;
}
int main( int argc, char * argv[])
{
srand( 13 );
if ( ! InitCUDA()) {
return 0 ;
}
float * A = myNewOnCPU < float > (WA * HA);
float * B = myNewOnCPU < float > (WB * HB);
randomInit(A,WA * HA);
randomInit(B,WB * HB);
float * C = myNewOnCPU < float > (WC * HC);
memset(C, 0 , sizeof ( float ) * WC * HC);
float * C2 = myNewOnCPU < float > (WC * HC);
memset(C2, 0 , sizeof ( float ) * WC * HC);
unsigned int tick1 = GetTickCount();
MatrixMulCPU(C2,A,B,WA,HA,WB);
printf( " CPU use Time : %dms\n " ,GetTickCount() - tick1);
unsigned int timer = 0 ;
cutilCheckError(cutCreateTimer( & timer));
cutilCheckError(cutStartTimer(timer));
{
matrixMulGPU(C,A,B,WA,HA,WB);
}
cutilCheckError(cutStopTimer(timer));
printf( " GPU use time: %f (ms) \n " , cutGetTimerValue(timer));
cutilCheckError(cutDeleteTimer(timer));
if (checkError(C,C2,WC * HC))
{
printf( " Accept\n " );
}
else
{
printf( " Worng Answer\n " );
}
myDeleteOnCPU(A);
myDeleteOnCPU(B);
myDeleteOnCPU(C);
myDeleteOnCPU(C2);
return 0 ;
}
运算结果如下:
版本0:
版本1:
可以看出,GPU并行性能比CPU好很多,而且版本1优于版本0