CUDA抢占模式测试
- 默认为抢占模式,后台运行一个Kernel,将利用率打满
- 运行一个Kernel,Profiling smsp__warps_restored,发现有值
- nvidia-smi -c 3设置为EXCLUSIVE模式,创建多个上下文时:CUDA-capable device(s) is/are busy or unavailable!
一.参考链接
二.复现过程
tee preemption.cu<<-'EOF'
#include <cuda_runtime.h>
#include <iostream>
// Dummy workload kernel: thread `tid` atomically accumulates into its own
// slot of `data`. The single-iteration loop only ever adds tid*0 == 0, so
// the buffer is unchanged; the kernel exists purely to occupy the GPU.
// NOTE(review): no bounds check — assumes the grid exactly covers the
// allocation (true for the launcher in this article); confirm if reused.
__global__ void Kernel_v1(float *data) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (int iter = 0; iter < 1; ++iter) {
        atomicAdd(&data[tid], tid * iter);
    }
}
// Twin of Kernel_v1 under a second mangled symbol, so each host thread can
// launch a distinctly-named kernel from the same fatbin. Same no-op math:
// the single loop iteration adds tid*0 == 0 via atomicAdd.
__global__ void Kernel_v2(float *data) {
    const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    for (int iter = 0; iter < 1; ++iter) {
        atomicAdd(&data[tid], tid * iter);
    }
}
EOF
# Build pipeline: .cu -> PTX -> cubin -> fatbin, so the binary can be loaded
# at runtime with cuModuleLoad (driver API) instead of being linked in.
/usr/local/cuda/bin/nvcc -std=c++17 -dc -lineinfo -arch=sm_86 -ptx preemption.cu -o preemption.ptx
/usr/local/cuda/bin/nvcc -arch=sm_86 preemption.ptx -cubin -o preemption.cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 preemption.cubin -fatbin -o preemption.fatbin
# Inspect the generated PTX and the final SASS for the fatbin.
cat preemption.ptx
/usr/local/cuda/bin/cuobjdump --dump-sass preemption.fatbin
tee preemption_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <thread>
int run(const char *kernel_name,int loop)
{
CUresult error;
cuInit(0);
CUdevice cuDevice;
int deviceCount = 0;
error = cuDeviceGetCount(&deviceCount);
error = cuDeviceGet(&cuDevice, 0);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in get device!\n");
return -1;
}
CUcontext cuContext;
error = cuCtxCreate(&cuContext, 0, cuDevice);
if(error!=CUDA_SUCCESS)
{
char *error_str=new char[1024];
cuGetErrorString(error,(const char**)&error_str);
printf("Error happened in create context:%s!\n",error_str);
delete []error_str;
return -1;
}
int block_count=100000;int block_size=1024;
int thread_size=block_count*block_size;
int data_size=sizeof(float)*thread_size;
float *output_ptr=nullptr;
float *input_ptr=nullptr;
int cudaStatus=0;
cudaStatus = cudaMalloc((void**)&input_ptr, data_size);
void *kernelParams[]= {(void*)&input_ptr};
CUmodule module;
CUfunction function;
const char* module_file = "preemption.fatbin";
error = cuModuleLoad(&module, module_file);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in load moudle %d!\n",error);
return -1;
}
error = cuModuleGetFunction(&function, module, kernel_name);
if(error!=CUDA_SUCCESS)
{
printf("get double_function error!\n");
return -1;
}
do
{
cuLaunchKernel(function,
block_count, 1, 1,
block_size, 1, 1,
0,0,kernelParams, 0);
cuCtxSynchronize();
}while(loop);
cudaFree(input_ptr);
cuModuleUnload(module);
cuCtxDestroy(cuContext);
printf("finished\n");
return 0;
}
// Usage: preemption_main <loop>
//   loop = 0: each thread launches its kernel once (profiled run)
//   loop != 0: launch forever (background GPU-saturating run)
int main(int argc, char *argv[])
{
    // Previously argv[1] was dereferenced unconditionally -> segfault
    // when run with no arguments.
    if (argc < 2)
    {
        printf("usage: %s <loop>\n", argv[0]);
        return 1;
    }
    int loop = atoi(argv[1]);
    int value = 0;
    // Query whether this device supports compute preemption.
    cudaDeviceGetAttribute(&value, cudaDevAttrComputePreemptionSupported, 0);
    printf("cudaDevAttrComputePreemptionSupported:%d\n", value);
    // Two host threads; each creates its own context in run(). Kernel names
    // are the Itanium-mangled symbols of Kernel_v1/Kernel_v2 in the fatbin.
    std::thread t0(run, "_Z9Kernel_v1Pf", loop);
    std::thread t1(run, "_Z9Kernel_v2Pf", loop);
    t0.join();
    t1.join();
    return 0;
}
EOF
# Kill any leftover background instance from a previous run, then rebuild.
killall -9 preemption_main
g++ preemption_main.cpp -o preemption_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda -lpthread
# Background instance (loop=1) launches forever to keep the GPU busy.
./preemption_main 1 &
# Profile a single-shot run (loop=0) and collect the preemption-restore counter.
/usr/local/NVIDIA-Nsight-Compute/ncu --metrics smsp__warps_restored ./preemption_main 0
- 输出
smsp__warps_restored Counter warp # of warp preemption-restore events
Kernel_v2(float *) (100000, 1, 1)x(1024, 1, 1), Context 2, Stream 7, Device 0, CC 8.6
Section: Command line profiler metrics
------------------------ ----------- ------------
Metric Name Metric Unit Metric Value
------------------------ ----------- ------------
smsp__warps_restored.avg warp 0
smsp__warps_restored.max warp 0
smsp__warps_restored.min warp 0
smsp__warps_restored.sum warp 0
------------------------ ----------- ------------
Kernel_v1(float *) (100000, 1, 1)x(1024, 1, 1), Context 1, Stream 14, Device 0, CC 8.6
Section: Command line profiler metrics
------------------------ ----------- ------------
Metric Name Metric Unit Metric Value
------------------------ ----------- ------------
smsp__warps_restored.avg warp 0.58
smsp__warps_restored.max warp 8
smsp__warps_restored.min warp 0
smsp__warps_restored.sum warp 65
------------------------ ----------- ------------
- EXCLUSIVE模式测试
#设置模式
nvidia-smi -c 3
Set compute mode to EXCLUSIVE_PROCESS for GPU 00000000:03:00.0.
All done.
#错误信息
Error happened in create context:CUDA-capable device(s) is/are busy or unavailable!
#恢复
nvidia-smi -c 0
Compute mode is already set to DEFAULT for GPU 00000000:03:00.0.
All done.
979

被折叠的 条评论
为什么被折叠?



