windows判断GPU及是否支持CUDA
在安装有英伟达显卡的机器上安装NVIDIA GPU Computing Toolkit
,版本11+
。在安装了NVToolkit电脑里找到nvcuda.dll
(通常下该文件位于C:\windows\SysWOW64
目录下)、cuda.lib
(位于NVToolkit\CUDA\{Version}\lib
目录下)和Cuda开发头文件。
以下代码用于列举系统里的Cuda
设备。没有设备或驱动未初始化时cuDeviceGetCount()
会返回相应的错误码:CUDA_ERROR_NO_DEVICE
表示没有可用设备,CUDA_ERROR_NOT_INITIALIZED
表示驱动未初始化(未成功调用cuInit)
。
#include <iostream>
#include <cuda.h>
#pragma comment(lib, "cuda.lib")
int main()
{
uint32_t flag = 0;
auto ret = cuInit(flag); // 没有装驱动的机器会抛异常
int count = 0;
ret = cuDeviceGetCount(&count);
if (ret != CUDA_SUCCESS) {
std::cout << "No GPU device, ret:" << ret << std::endl;
}
else {
std::cout << "device count:" << count << ", ret=" << ret << std::endl;
}
std::cout << "Hello World!\n";
}
获取GPU Cuda Core核心数量
#ifdef _WIN32
#include <windows.h>
static int getCpuCount(int& processorCoreCount, int& logicalProcessorCount)
{
typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
// Helper function to count set bits in the processor mask.
auto CountSetBits = [](ULONG_PTR bitMask) -> DWORD
{
DWORD LSHIFT = sizeof(ULONG_PTR) * 8 - 1;
DWORD bitSetCount = 0;
ULONG_PTR bitTest = (ULONG_PTR)1 << LSHIFT;
DWORD i;
for (i = 0; i <= LSHIFT; ++i)
{
bitSetCount += ((bitMask & bitTest) ? 1 : 0);
bitTest /= 2;
}
return bitSetCount;
};
LPFN_GLPI glpi;
BOOL done = FALSE;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL;
DWORD returnLength = 0;
DWORD byteOffset = 0;
auto handle = GetModuleHandle(TEXT("kernel32"));
if (handle == nullptr) {
printf("get kernel32 handle failed.\n");
return -1;
}
glpi = (LPFN_GLPI)GetProcAddress(handle, "GetLogicalProcessorInformation");
if (NULL == glpi)
{
printf("GetLogicalProcessorInformation is not supported.\n");
return -2;
}
DWORD rc = glpi(nullptr, &returnLength);
buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(returnLength);
rc = glpi(buffer, &returnLength);
if (buffer == nullptr) {
printf("GetLogicalProcessorInformation() failed\n");
return -3;
}
ptr = buffer;
while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength)
{
switch (ptr->Relationship)
{
case RelationProcessorCore:
processorCoreCount++;
// A hyperthreaded core supplies more than one logical processor.
logicalProcessorCount += CountSetBits(ptr->ProcessorMask);
break;
default:
break;
}
byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
ptr++;
}
free(buffer);
return 0;
}
#else
#include <thread>
// Non-Windows fallback: report the logical CPU count for both out-params
// (the C++ standard library offers no portable physical-core query).
// Returns 0 on success.
int getCpuCount(int& processorCoreCount, int& logicalProcessorCount)
{
    // fix: hardware_concurrency() may legally return 0 when the value is not
    // computable; clamp to 1 so callers never see a zero core count.
    unsigned int n = std::thread::hardware_concurrency();
    if (n == 0) {
        n = 1;
    }
    processorCoreCount = static_cast<int>(n);
    logicalProcessorCount = static_cast<int>(n);
    return 0;
}
#endif
// Returns physical memory size in MB (0 on non-Windows or on failure).
// isConstPhys == true  -> total physical memory (used at download time)
// isConstPhys == false -> available physical memory (used at run time)
static ULONG GetTotalMemorySize(bool isConstPhys)
{
#ifdef _WIN32
    MEMORYSTATUSEX statex;
    statex.dwLength = sizeof(statex); // required before the call
    // fix: the original ignored the return value and would read an
    // uninitialized struct when GlobalMemoryStatusEx failed.
    if (!GlobalMemoryStatusEx(&statex)) {
        return 0;
    }
    return (ULONG)((isConstPhys ? statex.ullTotalPhys : statex.ullAvailPhys) / (1024 * 1024));
#else
    return 0;
#endif
}
#include <cuda_runtime.h>
#ifdef _WIN32
#pragma comment(lib, "cudart_static.lib")
#endif
// 函数来自官方:https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h
// Beginning of GPU Architecture definitions
// Maps an SM version, encoded as 0xMm (M = SM major, m = SM minor), to the
// number of CUDA cores per streaming multiprocessor for that architecture.
// Unknown versions fall back to the newest (last) table entry, matching
// NVIDIA's helper_cuda.h sample behaviour.
static int _ConvertSMVer2Cores(int major, int minor) {
    struct SMToCores {
        int sm;     // 0xMm (hex): SM major/minor version
        int cores;  // CUDA cores per SM
    };
    static const SMToCores kTable[] = {
        {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192},
        {0x50, 128}, {0x52, 128}, {0x53, 128},
        {0x60, 64},  {0x61, 128}, {0x62, 128},
        {0x70, 64},  {0x72, 64},  {0x75, 64},
        {0x80, 64},  {0x86, 128}, {0x87, 128}, {0x89, 128},
        {0x90, 128},
        {-1, -1},   // sentinel
    };
    const int wanted = (major << 4) + minor;
    int lastKnown = 0;
    for (int i = 0; kTable[i].sm != -1; ++i) {
        if (kTable[i].sm == wanted) {
            return kTable[i].cores;
        }
        lastKnown = i;
    }
    // Architecture newer than the table: assume the most recent known entry.
    return kTable[lastKnown].cores;
}
#include <string>
// Renders a byte count as a human-readable string using decimal units
// (1 KB = 1000 bytes), truncating to a whole number of the largest unit
// that the value strictly exceeds.
std::string format_size(size_t size)
{
    static const size_t kKilo = 1000;
    static const size_t kMega = kKilo * 1000;
    static const size_t kGiga = kMega * 1000;
    if (size > kGiga) {
        return std::to_string(size / kGiga) + " GB";
    }
    if (size > kMega) {
        return std::to_string(size / kMega) + " MB";
    }
    if (size > kKilo) {
        return std::to_string(size / kKilo) + " KB";
    }
    return std::to_string(size) + " bytes";
}
// Decides whether this machine can run the workload.
//  1) Prefer a CUDA GPU: total device memory >= 3 GB (and, when running
//     rather than downloading, free device memory >= 3 GB), picking the
//     device with the most CUDA cores; additionally require >= 5 GB RAM.
//  2) Otherwise fall back to the CPU: >= 4 physical cores and >= 2 GB RAM.
// isDownload selects download-time (total memory) vs run-time (available
// memory) checks. Returns true when either set of conditions is met.
bool checkHardwareCond(bool isDownload = true)
{
    int selDevIndex = -1;
    try {
        int cuCount = 0;
        cudaError_t curet = cudaGetDeviceCount(&cuCount);
        if (curet != cudaSuccess) {
            printf("No GPU device, ret:%s\n", cudaGetErrorString(curet));
        }
        else {
            printf("gpu device count:%d\n", cuCount);
            size_t minMemSize = ((size_t)3 * 1000 * 1000 * 1000); // 3 GB (decimal)
            std::string selName;
            int selDevCoresNumber = 0;
            for (int devIndex = 0; devIndex < cuCount; devIndex++) {
                // fix: the original assigned but never checked these results;
                // on failure deviceProp would be uninitialized garbage that
                // then drove device selection.
                curet = cudaSetDevice(devIndex);
                if (curet != cudaSuccess) {
                    printf("cudaSetDevice(%d) failed, ret:%s\n", devIndex, cudaGetErrorString(curet));
                    continue;
                }
                cudaDeviceProp deviceProp;
                curet = cudaGetDeviceProperties(&deviceProp, devIndex);
                if (curet != cudaSuccess) {
                    printf("cudaGetDeviceProperties(%d) failed, ret:%s\n", devIndex, cudaGetErrorString(curet));
                    continue;
                }
                int coreNumber = deviceProp.multiProcessorCount * _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
                size_t freeMem = 0;
                size_t totMem = 0;
                // On failure freeMem stays 0, which conservatively fails the
                // free-memory requirement below.
                curet = cudaMemGetInfo(&freeMem, &totMem);
                printf("gpu device:%d, cuda cores:%d, multiProcessorCount:%d, concurrentKernels:%d"
                    ", maxThreadsPerMultiProcessor:%d, regsPerBlock:%d, warpSize:%d, clockRate:%d, major:%d"
                    ", minor:%d, asyncEngineCount:%d, maxBlocksPerMultiProcessor:%d, totalGlobalMem:%s"
                    ", totalConstMem:%s, freeMem:%s, totMem:%s, name:%s\n",
                    devIndex, coreNumber, deviceProp.multiProcessorCount, deviceProp.concurrentKernels,
                    deviceProp.maxThreadsPerMultiProcessor, deviceProp.regsPerBlock, deviceProp.warpSize,
                    deviceProp.clockRate, deviceProp.major, deviceProp.minor, deviceProp.asyncEngineCount,
                    deviceProp.maxBlocksPerMultiProcessor, format_size(deviceProp.totalGlobalMem).c_str(), format_size(deviceProp.totalConstMem).c_str(),
                    format_size(freeMem).c_str(), format_size(totMem).c_str(), deviceProp.name);
                // Candidate must beat the current pick on core count and have
                // >= 3 GB total device memory; when running (not downloading)
                // it must also have >= 3 GB free device memory.
                if (selDevCoresNumber < coreNumber && deviceProp.totalGlobalMem >= minMemSize) {
                    if (isDownload || freeMem >= minMemSize) {
                        selDevCoresNumber = coreNumber;
                        selDevIndex = devIndex;
                        selName = deviceProp.name;
                    }
                }
            }
            printf("gpu device selected: %d, name: %s\n", selDevIndex, selName.c_str());
        }
    }
    catch (...) {} // best-effort: fall through to the CPU check on any failure
    size_t memsize = GetTotalMemorySize(isDownload); // value is in MB
    // fix: the original label said "(MB)" but the value printed is a
    // human-formatted byte string, not a raw MB number.
    printf("%s physical memory: %s\n", (isDownload ? "total " : "avail "), format_size(memsize * 1000 * 1000).c_str());
    // A usable GPU was found and physical memory exceeds ~5 GB.
    if (selDevIndex >= 0) {
        if (memsize >= 5000) {
            printf("[OK]Proper use of gpu device acceleration\n");
            return true;
        }
    }
    printf("Check whether the cpu kernel information is satisfied?\n");
    int processorCoreCount = 0;
    int logicalProcessorCount = 0;
    int ret = getCpuCount(processorCoreCount, logicalProcessorCount);
    if (ret < 0) {
        printf("[Fail]get cpu count failed: %d\n", ret);
        return false;
    }
    int corecount = processorCoreCount;
    printf("physical cpu core count: %d\n", corecount);
    // CPU path: at least 4 physical cores and ~2 GB of memory.
    if (corecount >= 4) {
        if (memsize >= 2000) {
            printf("[OK]Correctly use cpu devices for calculations\n");
            return true;
        }
    }
    printf("[Fail]The hardware does not meet the conditions\n");
    return false;
}
// Entry point: run the hardware check in its default (download) mode and
// report the verdict (printed as 1/0) to stdout.
int main()
{
    const bool ok = checkHardwareCond();
    std::cout << "Is the hardware sufficient to operate? " << ok << std::endl;
    std::cout << "Hello World!\n";
}