设备查询与内核出错处理
实验1:设备查询与内核出错处理
| ||||||||||
|
|
| ||||||||
| ||||||||||
| ||||||||||
1.1查询你机器上GPU设备的参数 l 新建.cu文件 l 调用cudaGetDeviceCount()得到GPU设备的数量 l 调用cudaGetDeviceProperties()函数得到GPU设备的属性结构体 l 解释关键属性的含义,至少包括设备名称、计算能力为多少、设备可用全局内存、每线程块最大线程数、设备可用全局内存容量、每线程块可用共享内存容量、每线程块可用寄存器数量、每线程块最大线程数、每个处理器簇最大驻留线程数、设备中的处理器簇数量等 l 可参考WILT 3.2节
1.2首先自己从头编写并运行VectorSum内核,然后尝试多种查看错误的方式 l 在内核函数内printf信息 l 组合调用cudaGetLastError()和cudaGetErrorString(),返回出错字符串 l 宏 l 在内核启动时,使用非法参数,检验是否成功处理 l 可参考WILT 附录A.3节
1.3 本实验 3学时完成
| ||||||||||
实验内容及结果:(写出完整操作过程) | ||||||||||
1.1 查询数据
// 1.1 Device query: enumerate every CUDA device and print the key properties
// the assignment asks for (name, compute capability, global memory, threads
// per block, shared memory per block, registers per block, max resident
// threads per SM, and SM count). Every runtime call is checked: original
// version ignored all return codes and was missing `return 0;`.
int main() {
    int numDevices = 0;
    cudaError_t err = cudaGetDeviceCount(&numDevices);
    if (err != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    printf("Number of Devices:%d\n", numDevices);
    for (int i = 0; i < numDevices; ++i) {
        printf("----------------------\n");
        cudaDeviceProp cdp;
        err = cudaGetDeviceProperties(&cdp, i);
        if (err != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n",
                    i, cudaGetErrorString(err));
            continue;  // still report the remaining devices
        }
        printf("Device Number: %d\n", i);
        printf("Device Name: %s\n", cdp.name);
        printf("Compute capability: %d.%d\n", cdp.major, cdp.minor);
        // %zu is the portable format for size_t fields (totalGlobalMem, sharedMemPerBlock)
        printf("Total global memory : %zu bytes\n", cdp.totalGlobalMem);
        printf("Maximum threads/block: %d\n", cdp.maxThreadsPerBlock);
        printf("Shared memory /block: %zu bytes\n", cdp.sharedMemPerBlock);
        printf("Registers /block: %d\n", cdp.regsPerBlock);
        printf("Max resident threads/SM: %d\n", cdp.maxThreadsPerMultiProcessor);
        printf("Multiprocessor (SM) count: %d\n", cdp.multiProcessorCount);
    }
    return 0;
}
1.2 VectorSum的完成
#include "cuda_runtime.h" #include "device_launch_parameters.h"
#include <stdio.h> #include<cuda.h>
// Element-wise vector addition: c[idx] = a[idx] + b[idx].
// Intended launch shape is a single block with one thread per element
// (<<<1, arraySize>>>); there is deliberately no bounds guard, so the
// launch configuration must not exceed the length of the arrays.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    const int idx = threadIdx.x;  // one element per thread, block 0 only
    c[idx] = a[idx] + b[idx];
}
// 1.2 VectorSum: prints device properties, then adds two 5-element vectors on
// the GPU, demonstrating cudaGetLastError()/cudaGetErrorString() error handling.
//
// Fixes versus the original:
//  * `CUresult` (driver-API status type) replaced with `cudaError_t` — the
//    runtime API returns cudaError_t, and cudaSuccess belongs to that enum.
//  * The `goto Error;` statements had no matching label; an `Error:` cleanup
//    label now exists, so device buffers are freed on every path.
//  * cudaMalloc/cudaMemcpy return codes are checked instead of ignored.
int main() {
    int numDevices = 0;
    cudaGetDeviceCount(&numDevices);
    printf("Number of Devices:%d\n", numDevices);
    for (int i = 0; i < numDevices; ++i) {
        printf("----------------------\n");
        cudaDeviceProp cdp;
        cudaGetDeviceProperties(&cdp, i);
        printf("Device Number: %d\n", i);
        printf("Device Name: %s\n", cdp.name);
        printf("Compute capability: %d.%d\n", cdp.major, cdp.minor);
        printf("Maximum threads/block: %d\n", cdp.maxThreadsPerBlock);
        printf("Shared memory /block: %zu bytes\n", cdp.sharedMemPerBlock);
        printf("Total global memory : %zu bytes\n", cdp.totalGlobalMem);
    }

    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };

    // Runtime-API status type (the original mistakenly declared CUresult,
    // which is the driver-API type and is not comparable to cudaSuccess).
    cudaError_t cudaStatus = cudaSuccess;
    int exitCode = 0;
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    // Allocate GPU buffers for three vectors (two inputs, one output),
    // checking each allocation instead of discarding the return code.
    cudaStatus = cudaMalloc((void**)&dev_c, arraySize * sizeof(int));
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc dev_c failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; }
    cudaStatus = cudaMalloc((void**)&dev_a, arraySize * sizeof(int));
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc dev_a failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; }
    cudaStatus = cudaMalloc((void**)&dev_b, arraySize * sizeof(int));
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMalloc dev_b failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; }

    // Copy input vectors from host memory to the GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, arraySize * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy a failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; }
    cudaStatus = cudaMemcpy(dev_b, b, arraySize * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy b failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; }

    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<1, arraySize>>>(dev_c, dev_a, dev_b);

    // cudaGetLastError catches launch-configuration errors (e.g. illegal
    // block dimensions); execution errors only surface at the next sync.
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addvector failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    // cudaDeviceSynchronize waits for the kernel and reports any error
    // encountered while it was running on the device.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned %d\n", cudaStatus);
        fprintf(stderr, "%s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Copy the output vector from the GPU buffer back to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) { fprintf(stderr, "cudaMemcpy c failed: %s\n", cudaGetErrorString(cudaStatus)); goto Error; }

    printf("{ 1, 2, 3, 4, 5}\n + \n{10,20,30,40,50} \n = \n{%d,%d,%d,%d,%d}\n",
           c[0], c[1], c[2], c[3], c[4]);
    goto Cleanup;

Error:
    // Reached only on failure; fall through to free whatever was allocated
    // (cudaFree(NULL) is a harmless no-op).
    exitCode = 1;

Cleanup:
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);

    getchar();  // keep the console window open
    return exitCode;
}
| ||||||||||
实验过程中遇到的问题如何解决的?本次实验的体会(详细)? | ||||||||||
1. cudaMemcpy 函数的参数较多,第一次使用时少写了一个参数(拷贝方向),导致编译报错,补全参数后解决。
2.调试的时候出现大量的warning,函数调用的时候<<<blocks,threads>>>经常出现警示。
3. 代码完成后发现程序能正常运行并输出,但 c[i] 数组的结果全部为 0,说明内核并没有真正修改数组的值;经检查是内核启动或数据拷贝出错,通过 cudaGetLastError() 返回的错误信息定位并修复。
| ||||||||||
| ||||||||||
|