# CUDA Programming Guide 3.2.3 程序一

最新推荐文章于 2021-01-16 19:17:57 发布

qq_33182722

最新推荐文章于 2021-01-16 19:17:57 发布

阅读量110

点赞数

分类专栏： Cuda编程

本文链接：https://blog.csdn.net/qq_33182722/article/details/82787050

版权

Cuda编程专栏收录该内容

3 篇文章 0 订阅

订阅专栏

# CUDA Programming Guide 3.2.3 程序一
内容：不使用共享内存（仅使用全局内存）实现矩阵相乘
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
//Host code
/* Matrix descriptor shared by host and device code.
 * Storage is row-major: element (row, col) lives at
 * elements[row * width + col]. */
typedef struct {
    int width;        // number of columns
    int height;       // number of rows
    float *elements;  // width * height values, row after row
} Matrix;
// Thread block size: kernels are launched with BLOCK_SIZE x BLOCK_SIZE
// threads per block.
#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel.
// Arguments are (A, B, C); the kernel writes the product A * B into C.
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Abort with a diagnostic when a CUDA runtime call does not return
// cudaSuccess; an ignored failure would otherwise only show up later as
// garbage output.
#define MATMUL_CUDA_CHECK(call)                                              \
    do {                                                                     \
        cudaError_t err_ = (call);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
                    cudaGetErrorString(err_));                               \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)

// Matrix multiplication - host code.
//
// Computes C = A * B on the device, copies the result into C.elements,
// prints it, and then releases the device buffers AND the host buffers of
// A, B and C (the caller must not use or free them afterwards).
//
// Parameters:
//   A, B - host input matrices; A.width must equal B.height.
//   C    - host output matrix with C.height == A.height and
//          C.width == B.width; C.elements must already be allocated.
//
// The grid is rounded up so that dimensions that are not multiples of
// BLOCK_SIZE are still covered; the kernel is responsible for ignoring
// threads that fall outside C.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Mismatched shapes would make the kernel index past the buffers.
    if (A.width != B.height || C.height != A.height || C.width != B.width) {
        fprintf(stderr,
                "MatMul: incompatible dimensions (A %dx%d, B %dx%d, C %dx%d)\n",
                A.height, A.width, B.height, B.width, C.height, C.width);
        exit(EXIT_FAILURE);
    }

    // Load A into device memory.
    Matrix d_A;
    d_A.width = A.width;
    d_A.height = A.height;
    size_t size = (size_t)A.width * A.height * sizeof(float);
    MATMUL_CUDA_CHECK(cudaMalloc((void**)&d_A.elements, size));
    MATMUL_CUDA_CHECK(cudaMemcpy(d_A.elements, A.elements, size,
                                 cudaMemcpyHostToDevice));

    // Load B into device memory.
    Matrix d_B;
    d_B.width = B.width;
    d_B.height = B.height;
    size = (size_t)B.width * B.height * sizeof(float);
    MATMUL_CUDA_CHECK(cudaMalloc((void**)&d_B.elements, size));
    MATMUL_CUDA_CHECK(cudaMemcpy(d_B.elements, B.elements, size,
                                 cudaMemcpyHostToDevice));

    // Allocate C in device memory.
    Matrix d_C;
    d_C.width = C.width;
    d_C.height = C.height;
    size = (size_t)C.width * C.height * sizeof(float);
    MATMUL_CUDA_CHECK(cudaMalloc((void**)&d_C.elements, size));

    // Invoke the kernel with one thread per element of C: x spans the
    // columns of C (B.width), y spans the rows of C (A.height).
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid((B.width + BLOCK_SIZE - 1) / BLOCK_SIZE,
                 (A.height + BLOCK_SIZE - 1) / BLOCK_SIZE);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
    // A launch does not return a status; query it explicitly.
    MATMUL_CUDA_CHECK(cudaGetLastError());

    // Read C back from device memory (size still holds C's byte count).
    MATMUL_CUDA_CHECK(cudaMemcpy(C.elements, d_C.elements, size,
                                 cudaMemcpyDeviceToHost));

    // Output results
    printf("Output results\n");
    for (int i = 0; i < C.height; i++)
    {
        for (int j = 0; j < C.width; j++)
            printf("%f ", C.elements[i * C.width + j]);
        printf("\n");
    }

    // Free device memory
    MATMUL_CUDA_CHECK(cudaFree(d_A.elements));
    MATMUL_CUDA_CHECK(cudaFree(d_B.elements));
    MATMUL_CUDA_CHECK(cudaFree(d_C.elements));

    // Free host memory (this function takes ownership of the host buffers).
    free(A.elements);
    free(B.elements);
    free(C.elements);
}

// Allocates a host matrix with the given width (columns) and height (rows),
// sets element (i, j) to i*j + j, and prints it row by row followed by a
// blank line. Exits the program if the allocation fails.
static void InitAndPrintMatrix(Matrix *m, int width, int height)
{
    m->width = width;
    m->height = height;
    m->elements = (float*)malloc((size_t)width * height * sizeof(float));
    if (m->elements == NULL) {
        fprintf(stderr, "malloc failed for a %dx%d matrix\n", height, width);
        exit(EXIT_FAILURE);
    }

    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            m->elements[i * width + j] = (float)(i * j + j);
            printf("%f ", m->elements[i * width + j]);
        }
        printf("\n");
    }
    printf("\n");
}

// Builds a 3x5 matrix A and a 5x2 matrix B, allocates the 3x2 result C and
// hands all three to MatMul.
int main()
{
    // Build matrix A (3 rows, 5 columns).
    Matrix A;
    InitAndPrintMatrix(&A, 5, 3);

    // Build matrix B (5 rows, 2 columns).
    Matrix B;
    InitAndPrintMatrix(&B, 2, 5);

    // Allocate host memory for the result matrix C.
    Matrix C;
    C.height = A.height;
    C.width = B.width;
    C.elements = (float*)malloc((size_t)C.width * C.height * sizeof(float));
    if (C.elements == NULL) {
        fprintf(stderr, "malloc failed for a %dx%d matrix\n", C.height, C.width);
        exit(EXIT_FAILURE);
    }

    // Call the matrix multiplication function. MatMul prints the product and
    // frees the host buffers of A, B and C, so nothing is freed here.
    MatMul(A, B, C);
    return 0;
}

// Matrix multiplication kernel called by MatMul().
//
// Launch layout: a 2D grid of 2D blocks in which the x index selects a
// column of C and the y index selects a row of C. Each in-range thread
// computes one element of C; no shared memory is used.
//
// A, B and C hold row-major device buffers; A.width is used as the length
// of the dot product (assumed equal to B.height -- the caller must ensure
// this).
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    int row = blockIdx.y * blockDim.y + threadIdx.y;  // row index in C
    int col = blockIdx.x * blockDim.x + threadIdx.x;  // column index in C

    // The grid may be larger than C (sizes are rounded up to whole blocks),
    // so threads outside the matrix must not touch memory.
    if (row >= C.height || col >= C.width)
        return;

    // Accumulate the dot product of row `row` of A and column `col` of B.
    float Cvalue = 0.0f;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e] * B.elements[e * B.width + col];

    // No barrier is needed: every thread writes only its own element and
    // no data is exchanged between threads.
    C.elements[row * C.width + col] = Cvalue;
}

qq_33182722

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
# CUDA Programming Guide 3.2.3 程序一

# CUDA Programming Guide 3.2.3 程序一 内容：利用非共享内存实现矩阵相乘 #include #include <stdlib.h> #include <stdio.h> #include <malloc.h> //Host code /*矩阵结构体*/ typedef struct{int width;int height;fl...
复制链接

扫一扫