// CUDA Programming Guide 3.2.3, program 1
// Matrix multiplication without shared memory
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
//Host code
/*
 * Matrix descriptor.
 * Elements are addressed in row-major order:
 *   M(row, col) = elements[row * width + col]
 */
typedef struct {
    int width;        /* number of columns */
    int height;       /* number of rows */
    float *elements;  /* width * height values, row-major */
} Matrix;
// Thread block size: kernels are launched with BLOCK_SIZE x BLOCK_SIZE threads per block.
#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel.
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Aborts the program with a diagnostic if a CUDA runtime call did not succeed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Matrix multiplication - host code.
//
// Computes C = A * B on the device, copies the result back into C.elements,
// prints it to stdout and releases all buffers.
//
// Preconditions: all dimensions are positive, A.width == B.height,
// C.height == A.height and C.width == B.width; the program exits otherwise.
// Dimensions do not have to be multiples of BLOCK_SIZE: the grid is rounded
// up, so the kernel is expected to ignore threads that fall outside C.
//
// NOTE: this function also free()s the host buffers A.elements, B.elements
// and C.elements, so callers must not use or free them afterwards.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    if (A.width <= 0 || A.height <= 0 || B.width <= 0 || B.height <= 0 ||
        A.width != B.height || C.height != A.height || C.width != B.width) {
        fprintf(stderr,
                "MatMul: incompatible dimensions (A %dx%d, B %dx%d, C %dx%d)\n",
                A.height, A.width, B.height, B.width, C.height, C.width);
        exit(EXIT_FAILURE);
    }

    const size_t bytesA = (size_t)A.width * A.height * sizeof(float);
    const size_t bytesB = (size_t)B.width * B.height * sizeof(float);
    const size_t bytesC = (size_t)C.width * C.height * sizeof(float);

    // Load A and B to device memory.
    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    checkCuda(cudaMalloc((void **)&d_A.elements, bytesA), "cudaMalloc(d_A)");
    checkCuda(cudaMemcpy(d_A.elements, A.elements, bytesA, cudaMemcpyHostToDevice),
              "cudaMemcpy(A -> d_A)");

    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    checkCuda(cudaMalloc((void **)&d_B.elements, bytesB), "cudaMalloc(d_B)");
    checkCuda(cudaMemcpy(d_B.elements, B.elements, bytesB, cudaMemcpyHostToDevice),
              "cudaMemcpy(B -> d_B)");

    // Allocate C in device memory.
    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    checkCuda(cudaMalloc((void **)&d_C.elements, bytesC), "cudaMalloc(d_C)");

    // Invoke kernel: one thread per element of C. The grid's x dimension spans
    // the columns of C and its y dimension the rows, rounded up to whole blocks.
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((C.width + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (C.height + BLOCK_SIZE - 1) / BLOCK_SIZE);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    // A launch does not return a status itself; query it explicitly.
    checkCuda(cudaGetLastError(), "MatMulKernel launch");

    // Read C from device memory. The blocking copy also waits for the kernel,
    // so errors raised during kernel execution are reported here.
    checkCuda(cudaMemcpy(C.elements, d_C.elements, bytesC, cudaMemcpyDeviceToHost),
              "cudaMemcpy(d_C -> C)");

    // Output results.
    printf("Output results\n");
    for (int i = 0; i < C.height; i++) {
        for (int j = 0; j < C.width; j++)
            printf("%f ", C.elements[i * C.width + j]);
        printf("\n");
    }

    // Free device memory.
    checkCuda(cudaFree(d_A.elements), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B.elements), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C.elements), "cudaFree(d_C)");

    // Free host memory (ownership of the caller's buffers ends here).
    free(A.elements);
    free(B.elements);
    free(C.elements);
}

// Allocates storage for a width x height float matrix on the host.
// Exits the program if the allocation fails.
static float *allocElements(int width, int height)
{
    float *buf = (float *)malloc((size_t)width * height * sizeof(float));
    if (buf == NULL) {
        fprintf(stderr, "Failed to allocate a %d x %d host matrix\n", height, width);
        exit(EXIT_FAILURE);
    }
    return buf;
}

// Sets element (i, j) of m to i*j + j and prints the matrix row by row,
// followed by one empty line.
static void fillAndPrint(Matrix m)
{
    for (int i = 0; i < m.height; i++) {
        for (int j = 0; j < m.width; j++) {
            m.elements[i * m.width + j] = (float)(i * j + j);
            printf("%f ", m.elements[i * m.width + j]);
        }
        printf("\n");
    }
    printf("\n");
}

// Builds a 3x5 matrix A and a 5x2 matrix B, prints both, and multiplies them
// into a 3x2 matrix C via MatMul(), which also prints the product.
int main()
{
    // Build matrix A (3 rows, 5 columns).
    Matrix A;
    A.width = 5;
    A.height = 3;
    A.elements = allocElements(A.width, A.height);
    fillAndPrint(A);

    // Build matrix B (5 rows, 2 columns).
    Matrix B;
    B.width = 2;
    B.height = 5;
    B.elements = allocElements(B.width, B.height);
    fillAndPrint(B);

    // Allocate the result matrix C (rows of A x columns of B).
    Matrix C;
    C.height = A.height;
    C.width = B.width;
    C.elements = allocElements(C.width, C.height);

    // Multiply. MatMul releases the host buffers of A, B and C itself,
    // so they are not freed again here.
    MatMul(A, B, C);
    return 0;
}

// Matrix multiplication kernel called by MatMul().
//
// Each thread computes one element C(row, col) as the dot product of row
// `row` of A and column `col` of B, reading A and B directly through their
// element pointers.
//
// Launch layout: 2D blocks in a 2D grid; the x dimension indexes columns of C
// and the y dimension indexes rows. The grid may be larger than C, so threads
// whose (row, col) lies outside C do nothing.
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;  // row index into C
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // column index into C

    // Guard the tail of the grid: without it, surplus threads would read
    // past the ends of A and B and write past the end of C.
    if (row >= C.height || col >= C.width)
        return;

    // Accumulate the dot product into Cvalue.
    float Cvalue = 0.0f;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];

    // No barrier is needed before the store: threads share no data here
    // (the kernel declares no shared memory) and each writes its own element.
    C.elements[row * C.width + col] = Cvalue;
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值