cuda编程(一)矩阵加法


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <chrono>
/*
*1_check_dimension
*/
#include <cuda_runtime.h>
#include <stdio.h>
#include <iostream>

using namespace std;


//定义CHECK的目的是为了方便检查返回是否正确
// Error-checking wrapper for CUDA runtime calls: prints file/line plus the
// decoded error string and aborts on any failure.
// Wrapped in do { } while (0) so the macro expands to a single statement and
// stays safe inside unbraced if/else bodies (a bare { } block followed by the
// caller's ';' would otherwise break `if (...) CHECK(x); else ...`).
#define CHECK(call)\
do{\
  const cudaError_t error=call;\
  if(error!=cudaSuccess)\
  {\
      printf("ERROR: %s:%d,",__FILE__,__LINE__);\
      printf("code:%d,reason:%s\n",error,cudaGetErrorString(error));\
      exit(1);\
  }\
}while(0)

// Select and report the CUDA device with the given ordinal.
// Queries the device properties first (so a bad ordinal fails loudly with a
// decoded error), prints the device name, then makes it the current device.
void initDevice(int devNum)
{
	cudaDeviceProp prop;
	CHECK(cudaGetDeviceProperties(&prop, devNum));
	printf("Using device %d: %s\n", devNum, prop.name);
	CHECK(cudaSetDevice(devNum));
}

// Fill ip[0..size-1] with pseudo-random floats in [0, 65.535].
//
// Bug fix: the original called srand((unsigned)time(...)) on EVERY invocation.
// Two calls within the same second (exactly how main() initializes matrices A
// and B back-to-back) re-seeded the generator with the same value and produced
// two identical "random" matrices. Seed exactly once per process instead, so
// successive calls continue the same random stream and yield distinct data.
void initialData(float* ip, int size)
{
	static bool seeded = false;
	if (!seeded)
	{
		time_t t;
		srand((unsigned)time(&t));
		seeded = true;
	}
	for (int i = 0; i < size; i++)
	{
		// rand() & 0xffff keeps 16 bits -> [0, 65535]; scaled to [0, 65.535].
		ip[i] = (float)(rand() & 0xffff) / 1000.0f;
	}
}

// Compare the CPU reference result against the GPU result element by element.
// Prints the first mismatch (value and index) and returns early, or prints a
// success message when all N elements agree within epsilon.
//
// Bug fix: the original used abs() on a float difference. Depending on which
// headers are transitively included, unqualified abs can resolve to the C
// integer abs(int), truncating every sub-1.0 difference to 0 and silently
// passing wrong results. fabs() is the floating-point absolute value and has
// no such ambiguity.
void checkResult(float* hostRef, float* gpuRef, const int N)
{
	double epsilon = 1.0E-8;
	for (int i = 0; i < N; i++)
	{
		if (fabs(hostRef[i] - gpuRef[i]) > epsilon)
		{
			printf("Results don\'t match!\n");
			printf("%f(hostRef[%d] )!= %f(gpuRef[%d])\n", hostRef[i], i, gpuRef[i], i);
			return;
		}
	}
	printf("Check result success!\n");
}


//CPU对照组,用于对比加速比
void sumMatrix2DonCPU(float* MatA, float* MatB, float* MatC, int nx, int ny)
{
	float* a = MatA;
	float* b = MatB;
	float* c = MatC;
	for (int j = 0; j < ny; j++)
	{
		for (int i = 0; i < nx; i++)
		{
			c[i] = a[i] + b[i];
		}
		c += nx;
		b += nx;
		a += nx;
	}
}




// Element-wise matrix addition kernel: C = A + B.
// Expects a 2D grid of 2D blocks covering an nx (columns) x ny (rows)
// row-major matrix; one thread per element, with a bounds guard for the
// partial blocks at the grid edge.
//
// Bug fix: the flat index was computed as ix + iy * ny. For a row-major
// matrix with nx columns the row stride is nx, not ny. The original only
// worked because main() happens to use a square matrix (nx == ny); with a
// rectangular matrix it would read/write the wrong elements and run out of
// bounds.
__global__ void sumMatrix(float* MatA, float* MatB, float* MatC, int nx, int ny)
{
	int ix = threadIdx.x + blockDim.x * blockIdx.x;
	int iy = threadIdx.y + blockDim.y * blockIdx.y;
	int idx = ix + iy * nx;
	if (ix < nx && iy < ny)
	{
		MatC[idx] = MatA[idx] + MatB[idx];
	}
}
// Driver: allocate a 4096x4096 float matrix on host and device, add them on
// the GPU and on the CPU, compare wall-clock times, and verify the results
// match.
int main(int argc, char** argv)
{
	// Device initialization.
	printf("Starting...\n");
	initDevice(0);

	// Input: 4096 x 4096 single-precision matrices (row-major).
	int nx = 1 << 12;
	int ny = 1 << 12;
	size_t nBytes = (size_t)nx * ny * sizeof(float);

	// Host allocations.
	float* A_host = (float*)malloc(nBytes);
	float* B_host = (float*)malloc(nBytes);
	float* C_host = (float*)malloc(nBytes);
	float* C_from_gpu = (float*)malloc(nBytes);
	if (A_host == NULL || B_host == NULL || C_host == NULL || C_from_gpu == NULL)
	{
		printf("ERROR: host allocation failed\n");
		exit(1);
	}
	initialData(A_host, nx * ny);
	initialData(B_host, nx * ny);

	// Device allocations.
	float* A_dev = NULL;
	float* B_dev = NULL;
	float* C_dev = NULL;
	CHECK(cudaMalloc((void**)&A_dev, nBytes));
	CHECK(cudaMalloc((void**)&B_dev, nBytes));
	CHECK(cudaMalloc((void**)&C_dev, nBytes));

	// Copy inputs host -> device.
	CHECK(cudaMemcpy(A_dev, A_host, nBytes, cudaMemcpyHostToDevice));
	CHECK(cudaMemcpy(B_dev, B_host, nBytes, cudaMemcpyHostToDevice));

	dim3 threadsPerBlock(32, 32);
	cout << "threadsPerBlock.x = " << threadsPerBlock.x << endl;
	cout << "threadsPerBlock.y = " << threadsPerBlock.y << endl;

	// Ceil-divide so partial edge tiles are still covered (kernel has a guard).
	dim3 numBlocks((nx - 1) / threadsPerBlock.x + 1, (ny - 1) / threadsPerBlock.y + 1);
	cout << "numBlocks.x = " << numBlocks.x << "   numBlocks.y=" << numBlocks.y << endl;

	// Time the GPU run. Kernel launches are asynchronous, so the device must
	// be synchronized BEFORE the stop timestamp is taken — otherwise only the
	// launch overhead (microseconds) gets measured, not the kernel itself.
	auto beforeTime = std::chrono::steady_clock::now();
	sumMatrix << <numBlocks, threadsPerBlock >> > (A_dev, B_dev, C_dev, nx, ny);
	CHECK(cudaGetLastError());          // catch launch-configuration errors
	CHECK(cudaDeviceSynchronize());     // wait for the kernel to finish
	auto afterTime = std::chrono::steady_clock::now();
	double duration_millsecond = std::chrono::duration<double, std::milli>(afterTime - beforeTime).count();

	printf("GPU Execution Time: %f ms\n", duration_millsecond);

	// Time the same computation on the CPU.
	beforeTime = std::chrono::steady_clock::now();
	sumMatrix2DonCPU(A_host, B_host, C_host, nx, ny);
	afterTime = std::chrono::steady_clock::now();
	duration_millsecond = std::chrono::duration<double, std::milli>(afterTime - beforeTime).count();

	printf("CPU Execution Time: %f ms\n", duration_millsecond);

	// Copy the GPU result back once (the original did this twice, the first
	// time unchecked) and verify it against the CPU reference.
	CHECK(cudaMemcpy(C_from_gpu, C_dev, nBytes, cudaMemcpyDeviceToHost));
	checkResult(C_host, C_from_gpu, nx * ny);

	// Cleanup.
	cudaFree(A_dev);
	cudaFree(B_dev);
	cudaFree(C_dev);
	free(A_host);
	free(B_host);
	free(C_host);
	free(C_from_gpu);
	cudaDeviceReset();

	return 0;
}

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值