HELLO CUDA（VECTOR SUM内核编程）

最新推荐文章于 2023-10-22 20:06:17 发布

Junruiqwertyuiop

最新推荐文章于 2023-10-22 20:06:17 发布

阅读量832

点赞数 4

分类专栏： CUDA 文章标签： cuda 内核

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/junruitian/article/details/63251942

版权

CUDA 专栏收录该内容

3 篇文章 1 订阅

订阅专栏

设备查询与内核出错处理

实验1：设备查询与内核出错处理







1.1 查询你机器上GPU设备的参数
- 新建.cu文件
- 调用cudaGetDeviceCount()得到GPU设备的数量
- 调用cudaGetDeviceProperties()函数得到GPU设备的属性结构体
- 解释关键属性的含义，至少包括设备名称、计算能力、设备可用全局内存容量、每线程块最大线程数、每线程块可用共享内存容量、每线程块可用寄存器数量、每个处理器簇最大驻留线程数、设备中的处理器簇数量等
- 可参考WILT 3.2节
1.2 首先自己从头编写并运行VectorSum内核，然后尝试多种查看错误的方式
- 在内核函数内printf信息
- 组合调用cudaGetLastError()和cudaGetErrorString()，返回出错字符串
- 宏（将上述错误检查封装为宏）
- 在内核启动时，使用非法参数，检验是否成功处理
- 可参考WILT 附录A.3节
1.3 本实验 3学时完成
实验内容及结果：（写出完整操作过程）
1.1查询数据

#include <stdio.h>
#include "cuda_runtime.h"

/*
 * Prints the number of CUDA devices and, for each one, its name, compute
 * capability, maximum threads per block, shared memory per block and total
 * global memory.
 *
 * Returns 0 on success, 1 if the device count cannot be queried.
 */
int main()
{
    int numDevices = 0;
    cudaError_t status = cudaGetDeviceCount(&numDevices);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(status));
        return 1;
    }
    printf("Number of Devices:%d\n", numDevices);

    for (int i = 0; i < numDevices; ++i) {
        printf("----------------------\n");
        cudaDeviceProp cdp;
        status = cudaGetDeviceProperties(&cdp, i);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i, cudaGetErrorString(status));
            continue;
        }
        printf("Device Number: %d\n", i);
        printf("Device Name: %s\n", cdp.name);
        printf("Compute capability: %d.%d\n", cdp.major, cdp.minor);
        printf("Maximum threads/block: %d\n", cdp.maxThreadsPerBlock);
        /* sharedMemPerBlock and totalGlobalMem are size_t, so use %zu. */
        printf("Shared memory /block: %zu bytes \n", cdp.sharedMemPerBlock);
        printf("Total global memory : %zu bytes\n", cdp.totalGlobalMem);
    }
    return 0;
}

1.2VectorSum的完成

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <cuda.h>

/*
 * Element-wise vector sum: c[i] = a[i] + b[i].
 *
 * Indexes only by threadIdx.x and has no bounds check, so it must be launched
 * with a single block whose thread count equals the number of elements.
 * a, b and c must be device pointers to at least blockDim.x ints.
 */
__global__ void addKernel(int *c, const int *a, const int *b)
{
    int i = threadIdx.x;
    c[i] = a[i] + b[i];
}

/*
 * Reports a failed CUDA runtime call on stderr.
 * Returns true when status is an error, false when it is cudaSuccess.
 */
static bool cudaFailed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return false;
    }
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
    return true;
}

/*
 * Prints the device properties, then adds two 5-element vectors on the GPU
 * and prints the result.
 *
 * Returns 0 on success, 1 if any CUDA call fails.
 */
int main()
{
    int numDevices = 0;
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };
    const size_t bytes = arraySize * sizeof(int);
    int exitCode = 1;
    /* Start as NULL so the cleanup path can free them unconditionally. */
    int *dev_a = NULL;
    int *dev_b = NULL;
    int *dev_c = NULL;

    if (cudaFailed(cudaGetDeviceCount(&numDevices), "cudaGetDeviceCount")) goto Error;
    printf("Number of Devices:%d\n", numDevices);
    for (int i = 0; i < numDevices; ++i) {
        printf("----------------------\n");
        cudaDeviceProp cdp;
        if (cudaFailed(cudaGetDeviceProperties(&cdp, i), "cudaGetDeviceProperties")) continue;
        printf("Device Number: %d\n", i);
        printf("Device Name: %s\n", cdp.name);
        printf("Compute capability: %d.%d\n", cdp.major, cdp.minor);
        printf("Maximum threads/block: %d\n", cdp.maxThreadsPerBlock);
        printf("Shared memory /block: %zu bytes \n", cdp.sharedMemPerBlock);
        printf("Total global memory : %zu bytes\n", cdp.totalGlobalMem);
    }

    // Allocate GPU buffers for three vectors (two inputs, one output).
    if (cudaFailed(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)")) goto Error;
    if (cudaFailed(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)")) goto Error;
    if (cudaFailed(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)")) goto Error;

    // Copy input vectors from host memory to GPU buffers.
    if (cudaFailed(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)")) goto Error;
    if (cudaFailed(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)")) goto Error;

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, arraySize>>>(dev_c, dev_a, dev_b);

    // A launch returns no status itself; launch errors are read back here.
    if (cudaFailed(cudaGetLastError(), "addvector")) goto Error;

    // Errors raised while the kernel runs are reported by the synchronize call.
    if (cudaFailed(cudaDeviceSynchronize(), "cudaDeviceSynchronize")) goto Error;

    // Copy output vector from GPU buffer to host memory.
    if (cudaFailed(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)")) goto Error;

    printf("{ 1, 2, 3, 4, 5}\n + \n{10,20,30,40,50} \n = \n{%d,%d,%d,%d,%d}\n", c[0], c[1], c[2], c[3], c[4]);
    exitCode = 0;

Error:
    // Reached on success and on failure; cudaFree(NULL) is a no-op.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    // Wait for a key press so the console output stays visible.
    getchar();
    return exitCode;
}
实验过程中遇到的问题如何解决的？本次实验的体会（详细）？
1. cudaMemcpy函数的参数较多，第一次使用时少写了一个参数，导致出错。
2. 调试的时候出现大量的warning，调用内核函数时的<<<blocks,threads>>>语法经常被编辑器标出警示。
3. 代码完成后程序可以正常运行，但输出的c[i]数组有问题，全部为0，说明数组的值并没有被内核改变。

Junruiqwertyuiop

关注

4
点赞
踩
9

收藏

觉得还不错? 一键收藏
0
评论
HELLO CUDA（VECTOR SUM内核编程）

设备查询与内核出错处理实验1：设备查询与内核出错处理 1.1查询你机器上GPU设备的参数l 新建.cu文件l 调用cu
复制链接

扫一扫

专栏目录

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。