CUDA Green Contexts 是 CUDA 生态系统中的一个功能,主要用于提升计算的并行性与效率,尤其是针对多线程的应用场景。它们的作用如下:
- 资源划分与上下文管理:CUDA Green Contexts(绿色上下文)是基于设备资源子集(主要是 SM)创建的轻量级上下文。通过 cuDevSmResourceSplitByCount 将 GPU 的 SM 划分成若干组,再为每组生成一个绿色上下文;多个绿色上下文可以在同一进程中共存并同时向 GPU 提交工作,从而避免普通多 context 之间时间片切换的开销。
- 资源隔离:绿色上下文提供了一定程度的资源隔离,可以让每个上下文在执行时不干扰其他上下文,这对于需要高并发的任务尤为重要。
- 提高吞吐量:通过在多个上下文之间进行并行处理,能够提高 GPU 的使用率,从而提升整体计算吞吐量。适合需要执行多个独立运算的场景,比如深度学习中使用多个模型一起训练或推理的情况。
- 优化多线程应用:对于多线程的应用程序,CUDA Green Contexts 可以有效降低因上下文切换导致的性能损失,使得多线程的 CUDA 应用能更顺畅地运行。
- 异步执行:绿色上下文能更好地支持异步执行,允许CUDA操作在后台进行,而不必等待其他上下文的完成,从而提升了整体的执行效率。
总结来说,CUDA Green Contexts 旨在通过优化上下文管理和资源使用,提高多线程 CUDA 程序的性能,特别适用于高并发和复杂的计算场景。
一.测试输出及小结
E2E:4480.83ms #一个普通的context,运行1次kernel
E2E:4508.36ms #一个普通的context,将任务拆成7份 launch kernel
E2E:4506.14ms #7个green context,将任务拆成7份 分给7个green context,同时运行
E2E:4963.89ms #7个普通的context,将任务拆成7份 分给7个普通context
小结:
- 如果计算任务能用一个kernel launch一次完成,性能是最优的
- 如果将规模拆成N份,多次launch,性能会有下降(因为每次启动 kernel 都会有一定的开销,尤其是等待 kernel 完成以及 CPU 调度的延迟)
- 如果将规模拆成N份,用N个green context,将GPU所有的SM拆成N份,同时执行,性能略好
- 如果将规模拆成N份,用N个普通 context,通过context切换,性能明显下降(因为上下文切换会带来额外的开销,可能会导致性能下降)
- 也就是说,如果任务需要执行多个Kernel才能完成,并且每个Kernel都能用满所有的SM,则green context会有性能收益
二.参考链接
三.复现过程
tee green_contexts.cu<<-'EOF'
#include <cuda_runtime.h>
#include <iostream>
// Busy-work benchmark kernel: each thread loads its element of `data`,
// accumulates sin(0) + sin(1) + ... + sin(10239) onto it, and stores the
// result back.  sin() here is the double-precision overload (the int loop
// index promotes to double), which dominates the measured runtime.
// Assumes the launch covers exactly one thread per element — no bounds check.
__global__ void Kernel_v1(float *data) {
    const int gid = blockIdx.x * blockDim.x + threadIdx.x;
    float acc = data[gid];
    for (int k = 0; k < 10240; ++k) {
        acc += sin(k);
    }
    data[gid] = acc;
}
EOF
/usr/local/cuda/bin/nvcc -std=c++17 -dc -arch=sm_86 -ptx green_contexts.cu -o green_contexts.ptx
/usr/local/cuda/bin/nvcc -arch=sm_86 green_contexts.ptx -cubin -o green_contexts.cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 green_contexts.cubin -fatbin -o green_contexts.fatbin
tee green_contexts_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <thread>
#include <chrono>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>
// Abort the process with file:line context if a CUDA runtime call fails.
// The underscore-suffixed local avoids the classic macro-hygiene bug where a
// caller's own `err` variable inside `call` would shadow/collide with the
// macro's local.
#define CHECK_CUDA(call) \
do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
        std::cerr << " code=" << err_ << " (" << cudaGetErrorString(err_) << ")" << std::endl; \
        exit(EXIT_FAILURE); \
    } \
} while (0)
// Fatbin produced by the nvcc steps above; every benchmark loads this module.
const char* module_file = "green_contexts.fatbin";
int regular_ctx1(const char *kernel_name,int blk_count,int loop)
{
CUresult error;
cuInit(0);
CUdevice cuDevice;
int deviceCount = 0;
error = cuDeviceGetCount(&deviceCount);
error = cuDeviceGet(&cuDevice, 0);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in get device!\n");
return -1;
}
CUcontext cuContext;
error = cuCtxCreate(&cuContext, 0, cuDevice);
if(error!=CUDA_SUCCESS)
{
char *error_str=new char[1024];
cuGetErrorString(error,(const char**)&error_str);
printf("Error happened in create context:%s!\n",error_str);
delete []error_str;
return -1;
}
int block_count=blk_count*10000;
int block_size=1024;
int thread_size=block_count*block_size;
int data_size=sizeof(float)*thread_size;
float *input_ptr=nullptr;
CHECK_CUDA(cudaMalloc((void**)&input_ptr, data_size));
void *kernelParams[]= {(void*)&input_ptr};
CUmodule module;
CUfunction function;
error = cuModuleLoad(&module, module_file);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in load moudle %d!\n",error);
return -1;
}
error = cuModuleGetFunction(&function, module, kernel_name);
if(error!=CUDA_SUCCESS)
{
printf("get double_function error!\n");
return -1;
}
cuLaunchKernel(function,block_count,1,1,block_size,1,1,0,0,kernelParams, 0);
error=cuCtxSynchronize();
if(error!=CUDA_SUCCESS)
{
printf("Error cuCtxSynchronize!: %d\n",error);
return -1;
}
auto start = std::chrono::high_resolution_clock::now();
for(int i=0;i<loop;i++)
{
cuLaunchKernel(function,block_count,1,1,block_size,1,1,0,0,kernelParams,0);
}
error=cuCtxSynchronize();
if(error!=CUDA_SUCCESS)
{
printf("Error cuCtxSynchronize!: %d\n",error);
return -1;
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("E2E:%7.2fms\n",diff.count()*1000);
cudaFree(input_ptr);
cuModuleUnload(module);
cuCtxDestroy(cuContext);
return 0;
}
int regular_ctx7_blk1_loop1(const char *kernel_name)
{
CUresult error;
cuInit(0);
CUdevice cuDevice;
int deviceCount = 0;
error = cuDeviceGetCount(&deviceCount);
error = cuDeviceGet(&cuDevice, 0);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in get device!\n");
return -1;
}
int block_count=10000;
int block_size=1024;
int thread_size=block_count*block_size;
int data_size=sizeof(float)*thread_size;
CUmodule module[128]={0};
CUfunction function[128]={0};
CUcontext cuContext[128]={0};
CUstream hStream[128]={0};
float *ptrs[128]={0};
int nbGroups=7;
for(int i=0;i<nbGroups;i++)
{
error = cuCtxCreate(&cuContext[i], 0, cuDevice);
if(error!=CUDA_SUCCESS)
{
char *error_str=new char[1024];
cuGetErrorString(error,(const char**)&error_str);
printf("Error happened in create context:%s!\n",error_str);
delete []error_str;
return -1;
}
cuCtxSetCurrent(cuContext[i]);
cudaMalloc((void**)&ptrs[i],data_size);
cuStreamCreate(&hStream[i],CU_STREAM_NON_BLOCKING);
void *kernelParams[]= {(void*)&ptrs[i]};
error = cuModuleLoad(&module[i], module_file);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in load moudle %d!\n",error);
return -1;
}
error = cuModuleGetFunction(&function[i], module[i], kernel_name);
if(error!=CUDA_SUCCESS)
{
printf("get double_function error!\n");
return -1;
}
cuLaunchKernel(function[i],block_count,1,1,block_size,1,1,0,hStream[i],kernelParams, 0);
error=cuStreamSynchronize(hStream[i]);
if(error!=CUDA_SUCCESS)
{
printf("Error cuStreamSynchronize!:%d %d\n",i,error);
return -1;
}
}
auto start = std::chrono::high_resolution_clock::now();
for(int i=0;i<nbGroups;i++)
{
void *kernelParams[]= {(void*)&ptrs[i]};
cuLaunchKernel(function[i],block_count,1,1,block_size,1,1,0,hStream[i],kernelParams,0);
}
for(int i=0;i<nbGroups;i++)
{
if(hStream[i]==0) break;
error=cuStreamSynchronize(hStream[i]);
if(error!=CUDA_SUCCESS)
{
printf("Error cuStreamSynchronize!:%d %d\n",i,error);
return -1;
}
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("E2E:%7.2fms\n",diff.count()*1000);
for(int i=0;i<nbGroups;i++)
{
if(hStream[i]==0) break;
cuStreamDestroy(hStream[i]);
cuCtxDestroy(cuContext[i]);
cudaFree(ptrs[i]);
cuModuleUnload(module[i]);
}
return 0;
}
int green_ctx7_blk1_loop1(const char *kernel_name)
{
CUresult error;
cuInit(0);
CUdevice cuDevice;
int deviceCount = 0;
error = cuDeviceGetCount(&deviceCount);
error = cuDeviceGet(&cuDevice, 0);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in get device!\n");
return -1;
}
CUdevResource resource;
error=cuDeviceGetDevResource(cuDevice,&resource,CU_DEV_RESOURCE_TYPE_SM);//Get device resources.
if(error!=CUDA_SUCCESS)
{
printf("Error cuDeviceGetDevResource!\n");
return -1;
}
unsigned int nbGroups=7;
CUdevResource output_resource[128];
error=cuDevSmResourceSplitByCount(output_resource,&nbGroups,&resource,0,CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING,4);
if(error!=CUDA_SUCCESS)
{
printf("Error cuDevSmResourceSplitByCount:%d!\n",error);
return -1;
}
CUdevResourceDesc hDesc[128];
error=cuDevResourceGenerateDesc(hDesc,output_resource,nbGroups);
if(error!=CUDA_SUCCESS)
{
printf("Error cuDevResourceGenerateDesc!\n");
return -1;
}
int block_count=10000;
int block_size=1024;
int thread_size=block_count*block_size;
int data_size=sizeof(float)*thread_size;
CUmodule module[128]={0};
CUfunction function[128]={0};
CUgreenCtx hCtx[128]={0};
CUstream hStream[128]={0};
float *ptrs[128]={0};
for(int i=0;i<nbGroups;i++)
{
error=cuGreenCtxCreate(&hCtx[i],hDesc[i],cuDevice,CU_GREEN_CTX_DEFAULT_STREAM );
if(error!=CUDA_SUCCESS)
{
char *error_str=new char[1024];
cuGetErrorString(error,(const char**)&error_str);
printf("Error happened in create context:%s!\n",error_str);
delete []error_str;
return -1;
}
error=cuGreenCtxStreamCreate(&hStream[i],hCtx[i],CU_STREAM_NON_BLOCKING,0);
if(error!=CUDA_SUCCESS)
{
printf("Error cuGreenCtxStreamCreate!\n");
return -1;
}
CUcontext Context;
cuCtxFromGreenCtx (&Context,hCtx[i]);
cuCtxSetCurrent(Context);
error = cuModuleLoad(&module[i], module_file);
if(error!=CUDA_SUCCESS)
{
printf("Error happened in load moudle %d!\n",error);
return -1;
}
error = cuModuleGetFunction(&function[i], module[i], kernel_name);
if(error!=CUDA_SUCCESS)
{
printf("get double_function error!\n");
return -1;
}
cudaMalloc((void**)&ptrs[i],data_size);
void *kernelParams[]= {(void*)&ptrs[i]};
cuLaunchKernel(function[i],block_count,1,1,block_size,1,1,0,hStream[i],kernelParams,0);
error=cuStreamSynchronize(hStream[i]);
if(error!=CUDA_SUCCESS)
{
printf("Error cuStreamSynchronize!:%d %d\n",i,error);
return -1;
}
}
auto start = std::chrono::high_resolution_clock::now();
for(int i=0;i<nbGroups;i++)
{
void *kernelParams[]= {(void*)&ptrs[i]};
cuLaunchKernel(function[i],block_count,1,1,block_size,1,1,0,hStream[i],kernelParams,0);
}
for(int i=0;i<nbGroups;i++)
{
if(hStream[i]==0) break;
error=cuStreamSynchronize(hStream[i]);
if(error!=CUDA_SUCCESS)
{
printf("Error cuStreamSynchronize!:%d %d\n",i,error);
return -1;
}
}
auto end = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> diff = end - start;
printf("E2E:%7.2fms\n",diff.count()*1000);
for(int i=0;i<nbGroups;i++)
{
if(hStream[i]==0) break;
cuStreamDestroy(hStream[i]);
cuGreenCtxDestroy(hCtx[i]);
cudaFree(ptrs[i]);
cuModuleUnload(module[i]);
}
return 0;
}
// Entry point: run the four benchmark variants back to back.
// "_Z9Kernel_v1Pf" is the Itanium-mangled name of Kernel_v1(float*) inside
// green_contexts.fatbin.
int main(int argc, char *argv[])
{
    const char *kernel_name = "_Z9Kernel_v1Pf";
    regular_ctx1(kernel_name, 7, 1);       // one context, one big launch
    regular_ctx1(kernel_name, 1, 7);       // one context, seven launches
    green_ctx7_blk1_loop1(kernel_name);    // seven green contexts, concurrent
    regular_ctx7_blk1_loop1(kernel_name);  // seven regular contexts
    return 0;
}
EOF
g++ green_contexts_main.cpp -o green_contexts_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda -lpthread
./green_contexts_main
853

被折叠的 条评论
为什么被折叠?



