Some notes on CUDA C++ (2)

These are just personal notes.

Reference: CUDA编程(三)评估CUDA程序的表现_MingChao_Sun-CSDN博客
Incidentally, that blogger's CUDA series is written very clearly. For environment setup I also recommend following those posts; keep it simple and direct.

If you run into problems along the way, I still recommend consulting the official API documentation:
CUDA Runtime API :: CUDA Toolkit Documentation

#include <iostream>
#include <stdlib.h>
#include <time.h> //for clock_t (timing)
#include <cuda_runtime.h>

using namespace std;

#define DATA_SIZE 1048576
int data[DATA_SIZE];

//fill the array with random numbers in [0, 10)
void generateNum(int *data, int size){
	for (int i = 0; i < size; ++i)	data[i] = rand() % 10;
}

void printDeviceProp(const cudaDeviceProp &prop){
	cout << "Device Name: " << prop.name << endl;
	cout << "totalGlobalMem: " << prop.totalGlobalMem << endl;
	cout << "sharedMemPerBlock: " << prop.sharedMemPerBlock << endl;
	cout << "regsPerBlock: " << prop.regsPerBlock << endl;
	cout << "warpSize: " << prop.warpSize << endl;
	cout << "memPitch: " << prop.memPitch << endl;
	cout << "maxThreadsPerBlock:" << prop.maxThreadsPerBlock << endl;
	cout << "maxThreadsDim[0 - 2]: " << prop.maxThreadsDim[0] << " " << prop.maxThreadsDim[1] << " " << prop.maxThreadsDim[2] << endl;
	cout << "maxGridSize[0 - 2]: " << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " " << prop.maxGridSize[2] << endl;
	cout << "totalConstMem:" << prop.totalConstMem << endl;
	cout << "major.minor:" << prop.major << " " << prop.minor << endl;
	cout << "clockRate:" << prop.clockRate << endl;
	cout << "textureAlignment:" << prop.textureAlignment << endl;
	cout << "deviceOverlap:" << prop.deviceOverlap << endl;
	cout << "multiProcessorCount:" << prop.multiProcessorCount << endl;
}

//CUDA initialization
bool InitCuda(){
	int count;
	cudaGetDeviceCount(&count);//number of usable GPUs; device indices start at 0
	if (count == 0)	return false;//no CUDA-capable GPU
	int device = 0;
	for (; device < count; ++device){
		cudaDeviceProp prop;
		if (cudaGetDeviceProperties(&prop, device) == cudaSuccess){
			printDeviceProp(prop);
			break;//stop at the first GPU whose properties can be queried
		}
	}
	if (device == count)	return false;//no usable GPU was found
	cudaSetDevice(device);//use the GPU numbered `device`
	return true;
}

//__global__ function (executed on the GPU): computes the sum of cubes, despite the "Squares" in the name
__global__ void sum_Squares(int *num, int *result, clock_t *time){
	clock_t start = clock();//device-side clock(): counts raw GPU clock ticks
	int sum = 0;
	for (int i = 0; i < DATA_SIZE; ++i)	sum += num[i] * num[i] * num[i];
	*result = sum;
	*time = clock() - start;//elapsed ticks; converted to seconds via clockRate later
}

int main(){
	if (!InitCuda())	return 0;
	//generate the random input
	generateNum(data, DATA_SIZE);

	int *gpudata, *result;
	clock_t *time;
	//allocate GPU memory for the input array and the results
	cudaMalloc((void **)&gpudata, sizeof(int)*DATA_SIZE);//first argument is a pointer to the pointer
	cudaMalloc((void **)&result, sizeof(int));
	cudaMalloc((void **)&time, sizeof(clock_t));

	//copy the input from the CPU to the GPU
	cudaMemcpy(gpudata, data, sizeof(int)*DATA_SIZE, cudaMemcpyHostToDevice);

	sum_Squares<<<1, 1, 0>>>(gpudata, result, time);//launch the kernel: 1 block, 1 thread, 0 bytes of shared memory

	//copy the results back from the GPU to the CPU
	int sum;
	clock_t time_cost;
	cudaMemcpy(&sum, result, sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(&time_cost, time, sizeof(clock_t), cudaMemcpyDeviceToHost);

	//free the memory allocated on the GPU
	cudaFree(gpudata);
	cudaFree(result);
	cudaFree(time);

	cout << "GPU_sum: " << sum << " time cost: " << time_cost << endl;

	sum = 0;//also compute on the CPU to verify the result
	for (int i = 0; i < DATA_SIZE; ++i)	sum += data[i] * data[i] * data[i];
	cout << "CPU_sum: " << sum << endl;

	return 0;
}
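
To build and run the program, something like the following should work (the file name main.cu is my assumption; any .cu name will do):

nvcc main.cu -o main
./main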

Output:
[screenshot of the program output; the kernel reported a time cost of 1971223805 GPU clock ticks]

Elapsed time:

The kernel returns its duration in raw GPU clock ticks, and the device's clockRate is reported in kHz (1620000 here, i.e. 1.62 GHz), so:

1971223805 / (1620000 * 1000) ≈ 1.216 s
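
As a minimal sketch, the same conversion in code (ticksToSeconds is a hypothetical helper, not part of the program above; it relies on prop.clockRate being reported in kHz):

#include <time.h>
#include <cuda_runtime.h>

//convert raw device clock ticks to seconds
//prop.clockRate is in kHz, hence the extra * 1000 to get Hz
double ticksToSeconds(clock_t ticks, int device){
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, device);
	return (double)ticks / ((double)prop.clockRate * 1000.0);
}

For the run above, ticksToSeconds(1971223805, 0) gives roughly 1.216.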

Memory bandwidth:

DATA_SIZE is 1048576 = 1024 * 1024, i.e. 1M. 1M int values (32 bits each) amount to 1M * (32/8) bytes = 4 MB, so the effective memory bandwidth is roughly:

4 MB / 1.216 s ≈ 3.287 MB/s
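
The same arithmetic as a small sketch in code (effectiveBandwidthMBs is a hypothetical helper, not from the original program):

#include <cstddef>

//effective bandwidth in MB/s: bytes moved divided by elapsed seconds
//the kernel reads DATA_SIZE ints, i.e. sizeof(int) * 1048576 = 4 MB here
double effectiveBandwidthMBs(size_t bytes, double seconds){
	return (double)bytes / (1024.0 * 1024.0) / seconds;
}

For example, effectiveBandwidthMBs(sizeof(int) * 1048576, 1.216) comes out to about 3.29 MB/s.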

Absurdly slow: a 1050 Ti's memory bandwidth is around 112 GB/s. The gap is because there is no parallelism at all (a single thread in a single block does everything); we will optimize this step by step in later posts.
