windows判断GPU及是否支持CUDA
在安装有英伟达显卡的机器上安装NVIDIA GPU Computing Toolkit
,版本11+
。在安装了NVToolkit电脑里找到nvcuda.dll
(通常下该文件位于C:\windows\SysWOW64
目录下)、cuda.lib
(位于NVToolkit\CUDA\{Version}\lib
目录下)和Cuda开发头文件。
以下代码用于列举系统里的Cuda
设备。没有设备或驱动未初始化时cuDeviceGetCount()
会返回相应的错误码:CUDA_ERROR_NO_DEVICE
表示没有可用设备,CUDA_ERROR_NOT_INITIALIZED
表示驱动未初始化(未成功调用cuInit)
。
#include <iostream>
#include <cuda.h>
#pragma comment(lib, "cuda.lib")
int main()
{
uint32_t flag = 0;
auto ret = cuInit(flag); // 没有装驱动的机器会抛异常
int count = 0;
ret = cuDeviceGetCount(&count);
if (ret != CUDA_SUCCESS) {
std::cout << "No GPU device, ret:" << ret << std::endl;
}
else {
std::cout << "device count:" << count << ", ret=" << ret << std::endl;
}
std::cout << "Hello World!\n";
}
获取GPU Cuda Core核心数量
#ifdef _WIN32
#include <windows.h>
static int getCpuCount(int& processorCoreCount, int& logicalProcessorCount)
{
typedef BOOL(WINAPI* LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD);
// Helper function to count set bits in the processor mask.
auto CountSetBits = [](ULONG_PTR bitMask) -> DWORD
{
DWORD LSHIFT = sizeof(ULONG_PTR) * 8 - 1;
DWORD bitSetCount = 0;
ULONG_PTR bitTest = (ULONG_PTR)1 << LSHIFT;
DWORD i;
for (i = 0; i <= LSHIFT; ++i)
{
bitSetCount += ((bitMask & bitTest) ? 1 : 0);
bitTest /= 2;
}
return bitSetCount;
};
LPFN_GLPI glpi;
BOOL done = FALSE;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = NULL;
PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = NULL;
DWORD returnLength = 0;
DWORD byteOffset = 0;
auto handle = GetModuleHandle(TEXT("kernel32"));
if (handle == nullptr) {
printf("get kernel32 handle failed.\n");
return -1;
}
glpi = (LPFN_GLPI)GetProcAddress(handle, "GetLogicalProcessorInformation");
if (NULL == glpi)
{
printf("GetLogicalProcessorInformation is not supported.\n");
return -2;
}
DWORD rc = glpi(nullptr, &returnLength);
buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(returnLength);
rc = glpi(buffer, &returnLength);
if (buffer == nullptr) {
printf("GetLogicalProcessorInformation() failed\n");
return -3;
}
ptr = buffer;
while (byteOffset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= returnLength)
{
switch (ptr->Relationship)
{
case RelationProcessorCore:
processorCoreCount++;
// A hyperthreaded core supplies more than one logical processor.
logicalProcessorCount += CountSetBits(ptr->ProcessorMask);
break;
default:
break;
}
byteOffset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION);
ptr++;
}
free(buffer);
return 0;
}
#else
#include <thread>
// Non-Windows fallback: report the logical CPU count for both out-params
// (the C++ standard library offers no portable physical-core query).
// Returns 0 on success.
int getCpuCount(int& processorCoreCount, int& logicalProcessorCount)
{
    // fix: hardware_concurrency() may legally return 0 when the value is not
    // computable; clamp to 1 so callers never see a zero core count.
    unsigned int n = std::thread::hardware_concurrency();
    if (n == 0) {
        n = 1;
    }
    processorCoreCount = static_cast<int>(n);
    logicalProcessorCount = static_cast<int>(n);
    return 0;
}
#endif
// Returns physical memory size in MB (0 on non-Windows or on failure).
// isConstPhys == true  -> total physical memory (used at download time)
// isConstPhys == false -> available physical memory (used at run time)
static ULONG GetTotalMemorySize(bool isConstPhys)
{
#ifdef _WIN32
    MEMORYSTATUSEX statex;
    statex.dwLength = sizeof(statex); // required before the call
    // fix: the original ignored the return value and would read an
    // uninitialized struct when GlobalMemoryStatusEx failed.
    if (!GlobalMemoryStatusEx(&statex)) {
        return 0;
    }
    return (ULONG)((isConstPhys ? statex.ullTotalPhys : statex.ullAvailPhys) / (1024 * 1024));
#else
    return 0;
#endif
}
#include <cuda_runtime.h>
#ifdef _WIN32
#pragma comment(lib, "cudart_static.lib")
#endif
// 函数来自官方:https://github.com/NVIDIA/cuda-samples/blob/master/Common/helper_cuda.h
// Beginning of GPU Architecture definitions
// Maps an SM version, encoded as 0xMm (M = SM major, m = SM minor), to the
// number of CUDA cores per streaming multiprocessor for that architecture.
// Unknown versions fall back to the newest (last) table entry, matching
// NVIDIA's helper_cuda.h sample behaviour.
static int _ConvertSMVer2Cores(int major, int minor) {
    struct SMToCores {
        int sm;     // 0xMm (hex): SM major/minor version
        int cores;  // CUDA cores per SM
    };
    static const SMToCores kTable[] = {
        {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192},
        {0x50, 128}, {0x52, 128}, {0x53, 128},
        {0x60, 64},  {0x61, 128}, {0x62, 128},
        {0x70, 64},  {0x72, 64},  {0x75, 64},
        {0x80, 64},  {0x86, 128}, {0x87, 128}, {0x89, 128},
        {0x90, 128},
        {-1, -1},   // sentinel
    };
    const int wanted = (major << 4) + minor;
    int lastKnown = 0;
    for (int i = 0; kTable[i].sm != -1; ++i) {
        if (kTable[i].sm == wanted) {
            return kTable[i].cores;
        }
        lastKnown = i;
    }
    // Architecture newer than the table: assume the most recent known entry.
    return kTable[lastKnown].cores;
}
#include <string>
// Renders a byte count as a human-readable string using decimal units
// (1 KB = 1000 bytes), truncating to a whole number of the largest unit
// that the value strictly exceeds.
std::string format_size(size_t size)
{
    static const size_t kKilo = 1000;
    static const size_t kMega = kKilo * 1000;
    static const size_t kGiga = kMega * 1000;
    if (size > kGiga) {
        return std::to_string(size / kGiga) + " GB";
    }
    if (size > kMega) {
        return std::to_string(size / kMega) + " MB";
    }
    if (size > kKilo) {
        return std::to_string(size / kKilo) + " KB";
    }
    return std::to_string(size) + " bytes";
}
// Decides whether this machine can run the workload.
//  1) Prefer a CUDA GPU: total device memory >= 3 GB (and, when running
//     rather than downloading, free device memory >= 3 GB), picking the
//     device with the most CUDA cores; additionally require >= 5 GB RAM.
//  2) Otherwise fall back to the CPU: >= 4 physical cores and >= 2 GB RAM.
// isDownload selects download-time (total memory) vs run-time (available
// memory) checks. Returns true when either set of conditions is met.
bool checkHardwareCond(bool isDownload = true)
{
    int selDevIndex = -1;
    try {
        int cuCount = 0;
        cudaError_t curet = cudaGetDeviceCount(&cuCount);
        if (curet != cudaSuccess) {
            printf("No GPU device, ret:%s\n", cudaGetErrorString(curet));
        }
        else {
            printf("gpu device count:%d\n", cuCount);
            size_t minMemSize = ((size_t)3 * 1000 * 1000 * 1000); // 3 GB (decimal)
            std::string selName;
            int selDevCoresNumber = 0;
            for (int devIndex = 0; devIndex < cuCount; devIndex++) {
                // fix: the original assigned but never checked these results;
                // on failure deviceProp would be uninitialized garbage that
                // then drove device selection.
                curet = cudaSetDevice(devIndex);
                if (curet != cudaSuccess) {
                    printf("cudaSetDevice(%d) failed, ret:%s\n", devIndex, cudaGetErrorString(curet));
                    continue;
                }
                cudaDeviceProp deviceProp;
                curet = cudaGetDeviceProperties(&deviceProp, devIndex);
                if (curet != cudaSuccess) {
                    printf("cudaGetDeviceProperties(%d) failed, ret:%s\n", devIndex, cudaGetErrorString(curet));
                    continue;
                }
                int coreNumber = deviceProp.multiProcessorCount * _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
                size_t freeMem = 0;
                size_t totMem = 0;
                // On failure freeMem stays 0, which conservatively fails the
                // free-memory requirement below.
                curet = cudaMemGetInfo(&freeMem, &totMem);
                printf("gpu device:%d, cuda cores:%d, multiProcessorCount:%d, concurrentKernels:%d"
                    ", maxThreadsPerMultiProcessor:%d, regsPerBlock:%d, warpSize:%d, clockRate:%d, major:%d"
                    ", minor:%d, asyncEngineCount:%d, maxBlocksPerMultiProcessor:%d, totalGlobalMem:%s"
                    ", totalConstMem:%s, freeMem:%s, totMem:%s, name:%s\n",
                    devIndex, coreNumber, deviceProp.multiProcessorCount, deviceProp.concurrentKernels,
                    deviceProp.maxThreadsPerMultiProcessor, deviceProp.regsPerBlock, deviceProp.warpSize,
                    deviceProp.clockRate, deviceProp.major, deviceProp.minor, deviceProp.asyncEngineCount,
                    deviceProp.maxBlocksPerMultiProcessor, format_size(deviceProp.totalGlobalMem).c_str(), format_size(deviceProp.totalConstMem).c_str(),
                    format_size(freeMem).c_str(), format_size(totMem).c_str(), deviceProp.name);
                // Candidate must beat the current pick on core count and have
                // >= 3 GB total device memory; when running (not downloading)
                // it must also have >= 3 GB free device memory.
                if (selDevCoresNumber < coreNumber && deviceProp.totalGlobalMem >= minMemSize) {
                    if (isDownload || freeMem >= minMemSize) {
                        selDevCoresNumber = coreNumber;
                        selDevIndex = devIndex;
                        selName = deviceProp.name;
                    }
                }
            }
            printf("gpu device selected: %d, name: %s\n", selDevIndex, selName.c_str());
        }
    }
    catch (...) {} // best-effort: fall through to the CPU check on any failure
    size_t memsize = GetTotalMemorySize(isDownload); // value is in MB
    // fix: the original label said "(MB)" but the value printed is a
    // human-formatted byte string, not a raw MB number.
    printf("%s physical memory: %s\n", (isDownload ? "total " : "avail "), format_size(memsize * 1000 * 1000).c_str());
    // A usable GPU was found and physical memory exceeds ~5 GB.
    if (selDevIndex >= 0) {
        if (memsize >= 5000) {
            printf("[OK]Proper use of gpu device acceleration\n");
            return true;
        }
    }
    printf("Check whether the cpu kernel information is satisfied?\n");
    int processorCoreCount = 0;
    int logicalProcessorCount = 0;
    int ret = getCpuCount(processorCoreCount, logicalProcessorCount);
    if (ret < 0) {
        printf("[Fail]get cpu count failed: %d\n", ret);
        return false;
    }
    int corecount = processorCoreCount;
    printf("physical cpu core count: %d\n", corecount);
    // CPU path: at least 4 physical cores and ~2 GB of memory.
    if (corecount >= 4) {
        if (memsize >= 2000) {
            printf("[OK]Correctly use cpu devices for calculations\n");
            return true;
        }
    }
    printf("[Fail]The hardware does not meet the conditions\n");
    return false;
}
// Entry point: run the hardware check in its default (download) mode and
// report the verdict (printed as 1/0) to stdout.
int main()
{
    const bool ok = checkHardwareCond();
    std::cout << "Is the hardware sufficient to operate? " << ok << std::endl;
    std::cout << "Hello World!\n";
}