// Learning GPU acceleration: GPU matrix routines written with reference to CUDA samples and online blog posts.
// Matrix transpose
// Tiled matrix-transpose kernel (pattern of the classic CUDA SDK "transpose" sample).
// Reads a width x height row-major matrix from indata and writes its transpose
// (height x width, row-major) to outdata.
// Launch expectations: blockDim = (BLOCK_DIM, BLOCK_DIM),
// gridDim = (ceil(width/BLOCK_DIM), ceil(height/BLOCK_DIM)).
__global__ void transposeGPU(double *outdata, double *indata, int width, int height)
{
// Shared-memory tile. The "+ 1" pads each row by one element so that the
// column-wise (transposed) reads below hit different banks — avoids
// shared-memory bank conflicts.
__shared__ double block[BLOCK_DIM][BLOCK_DIM + 1];
// Global coordinates of the input element this thread loads.
unsigned int xIndex = blockIdx.x*BLOCK_DIM + threadIdx.x;
unsigned int yIndex = blockIdx.y*BLOCK_DIM + threadIdx.y;
// Guard partial tiles when width/height are not multiples of BLOCK_DIM.
if ((xIndex < width) && (yIndex < height))
{
unsigned int index_in = yIndex*width + xIndex;
// threadIdx.x varies fastest along the row -> coalesced global load.
block[threadIdx.y][threadIdx.x] = indata[index_in];
}
// Every thread must finish writing its tile element before any thread reads
// the tile transposed. Barrier is outside the guard so all threads reach it.
__syncthreads();
// Swap the BLOCK indices (not the thread indices): this block writes the
// mirror tile of the output, again with threadIdx.x contiguous so the global
// store stays coalesced.
xIndex = blockIdx.y*BLOCK_DIM + threadIdx.x;
yIndex = blockIdx.x*BLOCK_DIM + threadIdx.y;
// Output matrix is height x width, so the bounds are swapped here.
if ((xIndex < height) && (yIndex < width))
{
unsigned int index_out = yIndex*height + xIndex;
// Transposed read from the tile; the +1 padding above keeps this conflict-free.
outdata[index_out] = block[threadIdx.x][threadIdx.y];
}
}
// Host wrapper: transposes a width x height row-major matrix on the GPU.
// Allocates device buffers, copies the input up, launches transposeGPU with a
// BLOCK_DIM x BLOCK_DIM block layout, and copies the height x width result
// back into outdata. Synchronous: the blocking cudaMemcpy at the end waits for
// the kernel before returning.
void transpose(double *outdata, double *indata, int width, int height)
{
    double *d_a, *d_b;
    size_t bytes = sizeof(double) * width * height;
    cudaMalloc((void**)&d_a, bytes);
    cudaMalloc((void**)&d_b, bytes);
    cudaMemcpy((void*)d_a, (void*)indata, bytes, cudaMemcpyHostToDevice);
    dim3 Threads(BLOCK_DIM, BLOCK_DIM);
    // Integer ceiling division: covers edge tiles when width/height are not
    // multiples of BLOCK_DIM, without the float round-trip of ceil().
    int b_x = (width + BLOCK_DIM - 1) / BLOCK_DIM;
    int b_y = (height + BLOCK_DIM - 1) / BLOCK_DIM;
    dim3 Blocks(b_x, b_y);
    transposeGPU<<<Blocks, Threads>>>(d_b, d_a, width, height);
    // Blocking device-to-host copy; implicitly synchronizes with the kernel.
    cudaMemcpy((void*)outdata, (void*)d_b, bytes, cudaMemcpyDeviceToHost);
    // Fix: the original leaked both device buffers on every call.
    cudaFree(d_a);
    cudaFree(d_b);
}
// Matrix multiplication
template<int BLOCK_SIZE> __global__ void MatrixMulGPU(double *c, const double *a, const double *b, unsigned int WA, unsigned int WB)
{
// Block index
in