环境:Windows 7、CUDA 9.0、GeForce GT 630M、Visual Studio 2015
同样的代码,使用 CUDA 8.0 可以检测到 GPU,
而使用 CUDA 9.0 却检测不到(cudaGetDeviceCount 返回的设备数为 0)。本来以为是版本冲突,直到看到下面这篇文章:
https://blog.csdn.net/zhouyiqi_c/article/details/79121469
原来是计算能力的问题:CUDA 9.0 要求 GPU 的计算能力至少为 3.0,而我的 GT 630M(Fermi 架构,计算能力 2.1)达不到这一要求。
测试代码如下:
// Entry point: enumerates the CUDA devices visible to the runtime, prints a
// summary of each device's properties, then runs the addWithCuda vector-add
// sample on two fixed 5-element arrays and prints the result.
//
// Returns 0 on success, 1 if addWithCuda or cudaDeviceReset reports an error.
// A failure while querying devices is reported on stderr but does not abort
// the program, so the remaining diagnostics are still shown.
int main()
{
    const int arraySize = 5;
    const int a[arraySize] = { 1, 2, 3, 4, 5 };
    const int b[arraySize] = { 10, 20, 30, 40, 50 };
    int c[arraySize] = { 0 };
    cudaDeviceProp prop;
    //cudaSetDevice(1);

    // Query the number of devices. Initialise the counter and check the
    // status so that a failure is reported instead of printing (and looping
    // over) an indeterminate value.
    int count = 0;
    cudaError_t cudaStatus = cudaGetDeviceCount(&count);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n", cudaGetErrorString(cudaStatus));
        count = 0;
    }
    printf("GPU_count = %d\n\n", count);

    for (int i = 0; i < count; i++)
    {
        // Fetch the properties of device i; skip the device if the query
        // fails rather than printing an uninitialised structure.
        cudaStatus = cudaGetDeviceProperties(&prop, i);
        if (cudaStatus != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties failed for device %d: %s\n", i, cudaGetErrorString(cudaStatus));
            continue;
        }

        printf(" --- General Information for device %d ---\n", i);
        // Device name
        printf("Name:\t%s\n", prop.name);
        // Compute capability (major.minor)
        printf("Compute capability:\t%d.%d\n", prop.major, prop.minor);
        // Clock rate
        printf("Clock rate:\t%d\n", prop.clockRate);
        // Whether the device can overlap a copy with kernel execution
        printf("Device copy overlap:\t%s\n", prop.deviceOverlap ? "Enable" : "Disable");
        // Whether a run-time limit applies to kernels on this device
        printf("Kernel execition timeout:\t%s\n", prop.kernelExecTimeoutEnabled ? "Enable" : "Disable");

        printf(" --- Memory Information for device %d ---\n", i);
        // The fields below are size_t, so they are printed with %zu; %ld
        // truncates them where long is narrower than size_t (64-bit Windows).
        // Total global memory on the device, in bytes
        printf("Total global Mem:\t%zu\n", prop.totalGlobalMem);
        // Total constant memory on the device, in bytes
        printf("Total constant Mem:\t%zu\n", prop.totalConstMem);
        // Maximum pitch, in bytes, allowed for memory copies
        printf("Max men pitch:\t%zu\n", prop.memPitch);
        // Texture alignment requirement
        printf("Texture Alignment:\t%zu\n", prop.textureAlignment);

        printf(" ---MP Information for device %d ---\n", i);
        // Number of multiprocessors on the device
        printf("Multiprocessor count:\t%d\n", prop.multiProcessorCount);
        // Shared memory available per block, in bytes (size_t)
        printf("Shared mem per mp:\t%zu\n", prop.sharedMemPerBlock);
        // 32-bit registers available per block
        printf("Registers per mp:\t%d\n", prop.regsPerBlock);
        // Number of threads in a warp
        printf("Threads in warp:\t%d\n", prop.warpSize);
        // Maximum number of threads per block
        printf("Max threads per block:\t%d\n", prop.maxThreadsPerBlock);
        // Maximum size of each block dimension; the arrays have three
        // elements, so the z component is index 2.
        printf("Max threads dimensions:\t(%d, %d, %d)\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        // Maximum size of each grid dimension (x, y, z)
        printf("Max grid dimensions:\t(%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
        printf("\n\n");
    }
    system("pause");

    // Add vectors in parallel.
    cudaStatus = addWithCuda(c, a, b, arraySize);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",
        c[0], c[1], c[2], c[3], c[4]);

    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    system("pause");
    return 0;
}