// CUDA Programming Guide 3.2.3 - Program 1
// Content: matrix multiplication implemented without shared memory
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
//Host code
/* Matrix descriptor.
 * Elements are addressed as elements[row * width + col] throughout this file,
 * i.e. the storage is row-major with `width` columns and `height` rows.
 * The same struct is used for host matrices and for their device copies; only
 * the memory that `elements` points to differs. */
typedef struct{
int width;        /* number of columns */
int height;       /* number of rows */
float *elements;  /* width*height floats, row-major */
}Matrix;
//Thread block size: edge length, in threads, of the square thread block used
//for the kernel launch (the host code builds a BLOCK_SIZE x BLOCK_SIZE dim3).
#define BLOCK_SIZE 16
//Forward declaration of the matrix multiplication kernel.
//Arguments are (A, B, C); the kernel writes the product of A and B into C.
//The __global__ qualifier is required for a function launched with <<<...>>>.
__global__ void MatMulKernel(const Matrix,const Matrix,Matrix);
//Aborts the program with a diagnostic if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

//Returns the storage size of a matrix in bytes, computed in size_t so that
//width*height cannot overflow int before being widened.
static size_t matrixBytes(const Matrix m)
{
    return (size_t)m.width * (size_t)m.height * sizeof(float);
}

//Matrix multiplication - Host code
//
//Computes C = A * B on the device and prints C to stdout.
//  A, B : host input matrices (row-major); A.width must equal B.height.
//  C    : host output matrix; must be A.height x B.width with elements allocated.
//Matrix dimensions do NOT have to be multiples of BLOCK_SIZE: the grid is
//rounded up to cover C, which requires the kernel to ignore threads that fall
//outside C.
//
//Ownership: this function calls free() on A.elements, B.elements and
//C.elements before returning, so the caller must have allocated them with
//malloc and must not use or free them afterwards.
//
//Any CUDA failure or shape mismatch is reported on stderr and terminates the
//process.
void MatMul(const Matrix A,const Matrix B, Matrix C)
{
    //The product is only defined for matching inner dimensions, and the kernel
    //indexes C with the result shape; reject anything else up front.
    if (A.width != B.height || C.height != A.height || C.width != B.width)
    {
        fprintf(stderr, "MatMul: incompatible shapes (A %dx%d, B %dx%d, C %dx%d)\n",
                A.height, A.width, B.height, B.width, C.height, C.width);
        exit(EXIT_FAILURE);
    }

    //Load A to device memory
    Matrix d_A;
    d_A.width = A.width; d_A.height = A.height;
    size_t size = matrixBytes(A);
    checkCuda(cudaMalloc((void**)&d_A.elements, size), "cudaMalloc(d_A)");
    checkCuda(cudaMemcpy(d_A.elements, A.elements, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(A -> d_A)");

    //Load B to device memory
    Matrix d_B;
    d_B.width = B.width; d_B.height = B.height;
    size = matrixBytes(B);
    checkCuda(cudaMalloc((void**)&d_B.elements, size), "cudaMalloc(d_B)");
    checkCuda(cudaMemcpy(d_B.elements, B.elements, size, cudaMemcpyHostToDevice),
              "cudaMemcpy(B -> d_B)");

    //Allocate C in device memory
    Matrix d_C;
    d_C.width = C.width; d_C.height = C.height;
    size = matrixBytes(C);
    checkCuda(cudaMalloc((void**)&d_C.elements, size), "cudaMalloc(d_C)");

    //Invoke kernel. A zero-sized grid is an invalid launch configuration, so
    //skip the launch entirely when C has no elements.
    if (C.width > 0 && C.height > 0)
    {
        dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
        //One thread per element of C: x spans C's columns, y spans C's rows,
        //each rounded up to a whole number of blocks.
        dim3 dimGrid((C.width + dimBlock.x - 1) / dimBlock.x,
                     (C.height + dimBlock.y - 1) / dimBlock.y);
        MatMulKernel<<<dimGrid,dimBlock>>>(d_A, d_B, d_C);
        //A kernel launch returns no status itself; launch errors are fetched here.
        checkCuda(cudaGetLastError(), "MatMulKernel launch");
    }

    //Read C from device memory (blocking copy; also surfaces kernel execution errors)
    checkCuda(cudaMemcpy(C.elements, d_C.elements, size, cudaMemcpyDeviceToHost),
              "cudaMemcpy(d_C -> C)");

    //Output results
    printf("Output results\n");
    for (int i = 0; i < C.height; i++)
    {
        for (int j = 0; j < C.width; j++)
            printf("%f ", C.elements[i*C.width+j]);
        printf("\n");
    }

    //Free device memory
    checkCuda(cudaFree(d_A.elements), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B.elements), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C.elements), "cudaFree(d_C)");

    //Free host memory (see the ownership note in the header comment)
    free(A.elements);
    free(B.elements);
    free(C.elements);
}
//Allocates a host matrix with the given number of columns (width) and rows
//(height), sets element (i,j) to i*j+j, and prints it row by row followed by
//a blank line. Terminates the process if the allocation fails.
static Matrix makeDemoMatrix(int width, int height)
{
    Matrix m;
    m.width = width;
    m.height = height;
    size_t bytes = (size_t)width * (size_t)height * sizeof(float);
    m.elements = (float*)malloc(bytes);
    if (m.elements == NULL && bytes > 0)
    {
        fprintf(stderr, "Failed to allocate a %d x %d host matrix\n", height, width);
        exit(EXIT_FAILURE);
    }
    //Loop bounds come from the matrix itself rather than repeated literals,
    //so the fill can never disagree with the allocation.
    for (int i = 0; i < m.height; i++)
    {
        for (int j = 0; j < m.width; j++)
        {
            m.elements[i*m.width+j] = (float)(i*j+j);
            printf("%f ", m.elements[i*m.width+j]);
        }
        printf("\n");
    }
    printf("\n");
    return m;
}

//Builds a 3x5 matrix A and a 5x2 matrix B, prints both, then multiplies them
//on the GPU via MatMul, which prints the 3x2 result.
int main()
{
    //Build and print matrix A (5 columns, 3 rows)
    Matrix A = makeDemoMatrix(5, 3);

    //Build and print matrix B (2 columns, 5 rows)
    Matrix B = makeDemoMatrix(2, 5);

    //Allocate host storage for the result C (A.height rows, B.width columns)
    Matrix C;
    C.height = A.height; C.width = B.width;
    size_t bytes = (size_t)C.width * (size_t)C.height * sizeof(float);
    C.elements = (float*)malloc(bytes);
    if (C.elements == NULL && bytes > 0)
    {
        fprintf(stderr, "Failed to allocate a %d x %d host matrix\n", C.height, C.width);
        exit(EXIT_FAILURE);
    }

    //Run the multiplication. MatMul frees A.elements, B.elements and
    //C.elements itself, so they must not be freed or read after this call.
    MatMul(A,B,C);
    return 0;
}
//Matrix multiplication kernel called by MatMul()
//
//Each thread computes one element of C = A * B by accumulating the dot product
//of one row of A with one column of B, reading both directly through the
//element pointers (no shared-memory tiling).
//
//Launch layout: 2D grid of 2D blocks; the x dimension indexes columns of C and
//the y dimension indexes rows of C. The grid may be larger than C: threads
//whose (row, col) lies outside C return without touching memory.
//Preconditions: A.width == B.height, C.height == A.height, C.width == B.width,
//and all three `elements` pointers are valid in device code, row-major.
__global__ void MatMulKernel(Matrix A,Matrix B,Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y; //row of C handled by this thread
    int col = blockIdx.x * blockDim.x + threadIdx.x; //column of C handled by this thread

    //The block shape rarely divides C evenly; without this guard the surplus
    //threads would read and write past the end of the matrices.
    if (row >= C.height || col >= C.width)
        return;

    float Cvalue = 0.0f;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row*A.width+e] * B.elements[e*B.width+col];

    //No barrier is needed: every thread reads only the inputs and writes only
    //its own element of C, so there is no inter-thread data dependency.
    C.elements[row*C.width+col] = Cvalue;
}