#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <time.h>

#include <cuda_runtime.h>
using namespace std;
// Element-wise addition on the device: res[i] = a[i] + b[i] for 0 <= i < size.
//
// Launch layout: 1D grid of 1D blocks (only the .x components are used).
// Any grid/block size is valid; each thread starts at its global index and
// advances by the total number of launched threads, so launches that do not
// cover `size` one-to-one still process every element.
// Uses no shared memory. A non-positive `size` makes the kernel a no-op.
//
// a, b : input arrays, at least `size` elements each (read only)
// res  : output array, at least `size` elements
// size : number of elements to add
__global__ void sumArrays(const int *a, const int *b, int *res, const int size)
{
    // 64-bit arithmetic so neither the start index nor the stride can wrap
    // when the launch is large.
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long i = first; i < size; i += stride) {
        res[i] = a[i] + b[i];
    }
}
// Fills the first `size` entries of `a` with successive rand() values.
// Nothing is written when `size` is zero or negative.
void random_ints(int* a, const int size)
{
    int* cursor = a;
    for (int remaining = size; remaining > 0; --remaining, ++cursor) {
        *cursor = rand();
    }
}
// Terminates the program with a diagnostic if a CUDA runtime call failed.
// `what` names the failing step in the message.
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess) {
        std::fprintf(stderr, "CUDA failure in %s: %s\n", what, cudaGetErrorString(status));
        std::exit(EXIT_FAILURE);
    }
}

// Adds two 2048*2048-element int arrays on the GPU, prints every result
// element, and reports two timings for the whole run:
//   - the time between two CUDA events (cudaEventElapsedTime reports
//     milliseconds), and
//   - the processor time from clock(), converted to seconds.
// Both timers bracket allocation, host data generation, transfers, the
// kernel, and the result printing.
int main()
{
    const int dev = 0;
    checkCuda(cudaSetDevice(dev), "cudaSetDevice");

    // CUDA-event timer: create both events, then mark the start.
    cudaEvent_t cuda_start_time, cuda_stop_time;
    checkCuda(cudaEventCreate(&cuda_start_time), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&cuda_stop_time), "cudaEventCreate(stop)");
    checkCuda(cudaEventRecord(cuda_start_time), "cudaEventRecord(start)");
    // Query the start event right after recording it. Per the original note
    // this call may be omitted under the TCC driver model but must be kept
    // under WDDM. The result is deliberately not checked: an event that has
    // not completed yet reports cudaErrorNotReady, which is not a failure,
    // so any such status is discarded here.
    cudaEventQuery(cuda_start_time);
    (void)cudaGetLastError();

    // Host-side timer based on clock().
    const clock_t c_start_time = clock();

    const int n = 2048 * 2048;
    const int threads_per_block = 1024;
    const int blocks = (n + threads_per_block - 1) / threads_per_block;  // ceil-div
    const size_t size = static_cast<size_t>(n) * sizeof(int);            // bytes per array

    // Device buffers.
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    checkCuda(cudaMalloc((void**)&d_a, size), "cudaMalloc(d_a)");
    checkCuda(cudaMalloc((void**)&d_b, size), "cudaMalloc(d_b)");
    checkCuda(cudaMalloc((void**)&d_c, size), "cudaMalloc(d_c)");

    // Host buffers: a and b are inputs, c receives the result.
    int* a = (int*)malloc(size);
    int* b = (int*)malloc(size);
    int* c = (int*)malloc(size);
    if (a == NULL || b == NULL || c == NULL) {
        std::fprintf(stderr, "Host allocation of %zu bytes failed\n", size);
        std::exit(EXIT_FAILURE);
    }
    random_ints(a, n);
    random_ints(b, n);

    // Host -> device.
    checkCuda(cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_a)");
    checkCuda(cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice), "cudaMemcpy(d_b)");

    sumArrays<<<blocks, threads_per_block>>>(d_a, d_b, d_c, n);
    // A kernel launch returns nothing; a bad launch configuration only shows
    // up through cudaGetLastError(), so read it immediately.
    checkCuda(cudaGetLastError(), "sumArrays launch");

    // Device -> host. This blocking copy also surfaces errors raised while
    // the kernel was executing.
    checkCuda(cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");

    checkCuda(cudaFree(d_a), "cudaFree(d_a)");
    checkCuda(cudaFree(d_b), "cudaFree(d_b)");
    checkCuda(cudaFree(d_c), "cudaFree(d_c)");

    for (int i = 0; i < n; ++i) {
        std::cout << c[i] << "\t";
    }
    std::cout << std::endl;
    // Kept so the program's output is unchanged: prints the runtime's
    // last-error string at this point.
    std::cout << cudaGetErrorString(cudaGetLastError()) << std::endl;

    free(a);
    free(b);
    free(c);

    // Stop the CUDA-event timer and wait until the stop event has been
    // recorded before reading the elapsed time.
    checkCuda(cudaDeviceSynchronize(), "cudaDeviceSynchronize");
    checkCuda(cudaEventRecord(cuda_stop_time), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(cuda_stop_time), "cudaEventSynchronize(stop)");
    float cuda_run_time = 0.0f;  // milliseconds
    checkCuda(cudaEventElapsedTime(&cuda_run_time, cuda_start_time, cuda_stop_time),
              "cudaEventElapsedTime");
    printf("\n(Cuda) The Running time is %f\n", cuda_run_time);
    checkCuda(cudaEventDestroy(cuda_start_time), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(cuda_stop_time), "cudaEventDestroy(stop)");

    // clock() counts in units of 1/CLOCKS_PER_SEC seconds, so dividing the
    // tick difference by CLOCKS_PER_SEC yields seconds.
    const clock_t c_end_time = clock();
    const double c_run_time = ((double)(c_end_time - c_start_time)) / CLOCKS_PER_SEC;
    printf("\n(C) The Running time is %f\n", c_run_time);
    return 0;
}
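/*
A CUDA program runs through the following steps:
1. Prepare the data to be processed.
2. Allocate storage on the device.
3. Copy the data from the host to the device.
4. Run the computation on the device.
5. Copy the result from the device back to the host.
6. Free the device memory.
7. Free the host memory.
*/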
// CUDA C++ implementation of matrix addition, used to test GPU performance.
// First published 2023-08-27 21:05:57.