一.参考链接
二.小结
- NVIDIA_VISIBLE_DEVICES是nvidia-container的环境变量
- CUDA_VISIBLE_DEVICES 是CUDA SDK的环境变量
- 在宿主机上,CUDA_VISIBLE_DEVICES 不会影响 NVML 和 nvidia-smi 的行为,只会影响 CUDA API(例如 cudaGetDeviceCount)的行为
- 在 nvidia-container 里,CUDA_VISIBLE_DEVICES 只能是容器内逻辑设备列表的子集;CUDA 按 CUDA_VISIBLE_DEVICES 中列出的设备顺序进行枚举,并依次生成逻辑设备 ID
三.生成测试程序
tee cuda_api_sample.cpp<<-'EOF'
#include <iostream>
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <stdio.h>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
// CHECK_CUDA(call): evaluates the CUDA runtime expression `call` exactly once.
// If the result is not cudaSuccess, prints the source file, line, numeric
// error code and the cudaGetErrorString() text to stderr, then terminates the
// process with EXIT_FAILURE.
//
// The argument is parenthesized and the temporary uses a macro-private name so
// that expressions such as `CHECK_CUDA(foo(err))` are not captured by the
// macro's own local variable.
#define CHECK_CUDA(call) \
    do { \
        const cudaError_t check_cuda_status_ = (call); \
        if (check_cuda_status_ != cudaSuccess) { \
            std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
            std::cerr << " code=" << check_cuda_status_ << " (" << cudaGetErrorString(check_cuda_status_) << ")" << std::endl; \
            exit(EXIT_FAILURE); \
        } \
    } while (0)
int main(int argc,char *argv[])
{
int deviceCount;
CHECK_CUDA(cudaGetDeviceCount(&deviceCount));
printf("deviceCount:%d\n",deviceCount);
for(int i=0;i<deviceCount;i++)
{
int deviceid=i;
CHECK_CUDA(cudaSetDevice(deviceid));
cudaDeviceProp prop;
CHECK_CUDA(cudaGetDeviceProperties(&prop, deviceid));
char uuid_str[33];
for (int j = 0; j < 16; ++j) {
sprintf(&uuid_str[j * 2], "%02x", prop.uuid.bytes[j]);
}
uuid_str[32] = '\0';
std::cout << "Device:" <<deviceid<< " " <<prop.name <<" UUID:" << uuid_str << std::endl;
}
return 0;
}
EOF
# Build the CUDA runtime API sample with the host compiler (g++); headers and
# libraries are taken from /usr/local/cuda.
g++ -o cuda_api_sample cuda_api_sample.cpp -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda -lcudart
tee nvml_sample.cpp<<-'EOF'
#include <stdio.h>
#include <nvml.h>
int main() {
nvmlReturn_t result;
unsigned int device_count, i;
nvmlDevice_t device;
char uuid[NVML_DEVICE_UUID_BUFFER_SIZE];
// Initialize NVML
result = nvmlInit();
if (NVML_SUCCESS != result) {
printf("Failed to initialize NVML: %s\n", nvmlErrorString(result));
return 1;
}
// Get the number of devices
result = nvmlDeviceGetCount(&device_count);
if (NVML_SUCCESS != result) {
printf("Failed to get device count: %s\n", nvmlErrorString(result));
nvmlShutdown();
return 1;
}
printf("Found %u devices\n", device_count);
// Iterate through devices and get UUID
for (i = 0; i < device_count; i++) {
result = nvmlDeviceGetHandleByIndex(i, &device);
if (NVML_SUCCESS != result) {
printf("Failed to get handle for device %u: %s\n", i, nvmlErrorString(result));
continue;
}
result = nvmlDeviceGetUUID(device, uuid, sizeof(uuid));
if (NVML_SUCCESS != result) {
printf("Failed to get UUID for device %u: %s\n", i, nvmlErrorString(result));
} else {
printf("Device %u UUID: %s\n", i, uuid);
}
}
// Shutdown NVML
nvmlShutdown();
return 0;
}
EOF
# Build the NVML sample. nvml_sample.cpp only calls nvml* functions, which are
# provided by -lnvidia-ml; -lcuda and -lcudart are also passed on this line.
g++ -o nvml_sample nvml_sample.cpp -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcuda -lcudart -lnvidia-ml