CUDA Green Contexts Test

CUDA Green Contexts are a feature of the CUDA driver API aimed at improving parallelism and efficiency, especially for multi-threaded applications. Their main characteristics:

  1. Context management: a green context is a lightweight context created over a subset of the device's resources. Several green contexts can coexist on one device and be used from the same host thread, with lower management and switching overhead than full CUDA contexts.
  2. Resource isolation: each green context is bound to its own group of SMs, so work submitted to one context does not compete for SM execution resources with the others, which matters for highly concurrent workloads.
  3. Higher throughput: running independent work in several green contexts in parallel can raise GPU utilization and overall throughput, for example when several models are trained or served on the same GPU.
  4. Better multi-threaded behavior: for multi-threaded programs, green contexts reduce the performance loss caused by switching between full contexts.
  5. Asynchronous execution: each green context has its own streams, so its work can proceed in the background without waiting for kernels in other contexts to finish.

In short, CUDA Green Contexts aim to improve the performance of multi-threaded CUDA programs through finer-grained context and resource management, and they target highly concurrent workloads; the core API sequence is sketched right below.
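
In API terms, a green context is created from a resource descriptor covering a subset of the device's SMs. The following is a minimal sketch of that call sequence, assuming CUDA 12.4 or newer; the file name, the CHECK_CU helper, and the group count / minimum SM count are illustrative values of my own, and the full test program is in section 3:

// green_ctx_sketch.cpp -- minimal green-context creation sketch (CUDA driver API, CUDA >= 12.4)
#include <cuda.h>
#include <cstdio>
#include <cstdlib>

#define CHECK_CU(call)                                                                  \
  do {                                                                                  \
    CUresult _e = (call);                                                               \
    if (_e != CUDA_SUCCESS) {                                                           \
      fprintf(stderr, "CUDA driver error %d at line %d\n", (int)_e, __LINE__);          \
      exit(1);                                                                          \
    }                                                                                   \
  } while (0)

int main() {
    CHECK_CU(cuInit(0));
    CUdevice dev;
    CHECK_CU(cuDeviceGet(&dev, 0));

    // 1. Query the device's SM resources.
    CUdevResource all_sms;
    CHECK_CU(cuDeviceGetDevResource(dev, &all_sms, CU_DEV_RESOURCE_TYPE_SM));

    // 2. Split them into up to 4 groups with at least 4 SMs each; the driver
    //    updates nbGroups with the number of groups actually produced.
    unsigned int nbGroups = 4;
    CUdevResource groups[4];
    CHECK_CU(cuDevSmResourceSplitByCount(groups, &nbGroups, &all_sms, nullptr,
                                         CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING, 4));

    CUgreenCtx gctx[4];
    CUstream stream[4];
    for (unsigned int i = 0; i < nbGroups; i++) {
        // 3. One descriptor per group, then a green context and a stream bound to it.
        CUdevResourceDesc desc;
        CHECK_CU(cuDevResourceGenerateDesc(&desc, &groups[i], 1));
        CHECK_CU(cuGreenCtxCreate(&gctx[i], desc, dev, CU_GREEN_CTX_DEFAULT_STREAM));
        CHECK_CU(cuGreenCtxStreamCreate(&stream[i], gctx[i], CU_STREAM_NON_BLOCKING, 0));
        printf("group %u owns %u SMs\n", i, groups[i].sm.smCount);
    }

    // Kernels launched on stream[i] only run on the SMs owned by group i.
    for (unsigned int i = 0; i < nbGroups; i++) {
        CHECK_CU(cuStreamDestroy(stream[i]));
        CHECK_CU(cuGreenCtxDestroy(gctx[i]));
    }
    return 0;
}

Only the driver library is needed at link time, e.g. g++ green_ctx_sketch.cpp -I /usr/local/cuda/include -lcuda.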

1. Test Output and Summary

E2E:4480.83ms  # one regular context, whole workload in a single kernel launch
E2E:4508.36ms  # one regular context, workload split into 7 parts, 7 kernel launches
E2E:4506.14ms  # 7 green contexts, workload split into 7 parts, one part per green context, run concurrently
E2E:4963.89ms  # 7 regular contexts, workload split into 7 parts, one part per regular context

Summary:

  • If the whole computation can be done in a single kernel launch, performance is best.
  • Splitting the workload into N parts and launching N times costs some performance, because every launch adds overhead (launch latency, waiting for kernel completion, CPU scheduling).
  • Splitting the workload into N parts and running them concurrently on N green contexts, each owning 1/N of the GPU's SMs, is slightly faster than the N sequential launches.
  • Splitting the workload into N parts across N regular contexts, which forces context switches, is clearly slower, because each switch adds extra overhead.
  • In other words, green contexts pay off when a task needs several kernels to complete and each kernel on its own can saturate all SMs (a quick way to check this saturation condition is sketched after this list).
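
To check the saturation condition, compare the grid size with the number of thread blocks the device can keep resident at once. The following is a minimal sketch using the runtime occupancy API; the file name is my own, the kernel and the 1024-thread block size mirror the reproduction code in section 3, and the threshold it prints is device-dependent:

// occupancy_check.cu -- does a given grid already saturate the whole GPU?
#include <cuda_runtime.h>
#include <cstdio>

// Same busy-loop kernel as in the reproduction section below.
__global__ void Kernel_v1(float *data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    float val = data[idx];
    for (int i = 0; i < 10240; i++) val += sin(i);
    data[idx] = val;
}

int main() {
    int numSMs = 0, blocksPerSM = 0;
    cudaDeviceGetAttribute(&numSMs, cudaDevAttrMultiProcessorCount, 0);
    // Resident blocks of Kernel_v1 per SM at 1024 threads/block, no dynamic shared memory.
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSM, Kernel_v1, 1024, 0);
    int saturating_grid = numSMs * blocksPerSM;
    printf("SMs=%d, resident blocks/SM=%d -> a grid of >= %d blocks fills the GPU\n",
           numSMs, blocksPerSM, saturating_grid);
    // The test below uses 10000-block grids per sub-kernel, far above this
    // threshold, so each sub-kernel already saturates the device on its own.
    return 0;
}

Compile it with nvcc (for example nvcc -arch=sm_86 occupancy_check.cu -o occupancy_check) and run it on the test GPU.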

2. Reference Links

3. Reproduction Steps

tee green_contexts.cu<<-'EOF'
#include <cuda_runtime.h>
#include <iostream>

// Each thread loads one element, adds 10240 sin() values as a compute-heavy
// busy loop, and writes the result back.
__global__ void Kernel_v1(float *data) {
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    float val = data[idx];
    for (int i = 0; i < 10240; i++)
    {
        val += sin(i);
    }
    data[idx] = val;
}
EOF

/usr/local/cuda/bin/nvcc -std=c++17 -arch=sm_86 -ptx green_contexts.cu -o green_contexts.ptx
/usr/local/cuda/bin/nvcc -arch=sm_86 green_contexts.ptx -cubin -o green_contexts.cubin
/usr/local/cuda/bin/nvcc -arch=sm_86 green_contexts.cubin -fatbin -o green_contexts.fatbin

tee green_contexts_main.cpp<<-'EOF'
#include <stdio.h>
#include <string.h>
#include <cuda_runtime.h>
#include <cuda.h>
#include <thread>
#include <chrono>
#include <assert.h>
#include <cstdio>
#include <cuda.h>
#include <iostream>

#define CHECK_CUDA(call)                      \
  do {                              \
    cudaError_t err = call;                  \
    if (err != cudaSuccess) {                 \
      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__; \
      std::cerr << " code=" << err << " (" << cudaGetErrorString(err) << ")" << std::endl; \
      exit(EXIT_FAILURE);                  \
    }                                      \
  } while (0)

const char* module_file = "green_contexts.fatbin";    
    
// One regular context: launch the kernel `loop` times, each launch with blk_count*10000 blocks.
int regular_ctx1(const char *kernel_name,int blk_count,int loop)
{
    CUresult error;
    cuInit(0);
    CUdevice cuDevice;
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    error = cuDeviceGet(&cuDevice, 0);
    if(error!=CUDA_SUCCESS)
    {
        printf("Error happened in get device!\n");
        return -1;
    }
    CUcontext cuContext;
    error = cuCtxCreate(&cuContext, 0, cuDevice);
    if(error!=CUDA_SUCCESS)
    {
        // cuGetErrorString only returns a pointer to a driver-owned string.
        const char *error_str = nullptr;
        cuGetErrorString(error, &error_str);
        printf("Error happened in create context:%s!\n", error_str);
        return -1;
    }
    int block_count=blk_count*10000;
    int block_size=1024;
    int thread_size=block_count*block_size;
    int data_size=sizeof(float)*thread_size;

    float *input_ptr=nullptr;
    CHECK_CUDA(cudaMalloc((void**)&input_ptr, data_size));
    void *kernelParams[]= {(void*)&input_ptr};

    CUmodule module;
    CUfunction function;
    error = cuModuleLoad(&module, module_file);
    if(error!=CUDA_SUCCESS)
    {
        printf("Error happened in load moudle %d!\n",error);
        return -1;
    }
    error = cuModuleGetFunction(&function, module, kernel_name);
    if(error!=CUDA_SUCCESS)
    {
        printf("get double_function error!\n");
        return -1;
    }
    cuLaunchKernel(function,block_count,1,1,block_size,1,1,0,0,kernelParams, 0);
    error=cuCtxSynchronize();
    if(error!=CUDA_SUCCESS)
    {
        printf("Error cuCtxSynchronize!: %d\n",error);
        return -1;
    }
    auto start = std::chrono::high_resolution_clock::now();
    for(int i=0;i<loop;i++)
    {
        cuLaunchKernel(function,block_count,1,1,block_size,1,1,0,0,kernelParams,0);
    }
    error=cuCtxSynchronize();
    if(error!=CUDA_SUCCESS)
    {
        printf("Error cuCtxSynchronize!: %d\n",error);
        return -1;
    }    
    auto end = std::chrono::high_resolution_clock::now(); 
    std::chrono::duration<double> diff = end - start; 
    printf("E2E:%7.2fms\n",diff.count()*1000);
    
    cudaFree(input_ptr);
    cuModuleUnload(module);
    cuCtxDestroy(cuContext);
    return 0;
}

// Seven regular contexts: the workload is split into 7 parts, with one
// context/stream/module and one launch per part.
int regular_ctx7_blk1_loop1(const char *kernel_name)
{
    CUresult error;
    cuInit(0);
    CUdevice cuDevice;
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    error = cuDeviceGet(&cuDevice, 0);
    if(error!=CUDA_SUCCESS)
    {
        printf("Error happened in get device!\n");
        return -1;
    }
    
    int block_count=10000;
    int block_size=1024;
    int thread_size=block_count*block_size;
    int data_size=sizeof(float)*thread_size;
        
    CUmodule module[128]={0};
    CUfunction function[128]={0};
    CUcontext cuContext[128]={0};
    CUstream hStream[128]={0};
    float *ptrs[128]={0};    
    int nbGroups=7;
    
    for(int i=0;i<nbGroups;i++)
    {
        error = cuCtxCreate(&cuContext[i], 0, cuDevice);
        if(error!=CUDA_SUCCESS)
        {
            // cuGetErrorString only returns a pointer to a driver-owned string.
            const char *error_str = nullptr;
            cuGetErrorString(error, &error_str);
            printf("Error happened in create context:%s!\n", error_str);
            return -1;
        }
        cuCtxSetCurrent(cuContext[i]);
        cudaMalloc((void**)&ptrs[i],data_size);
        cuStreamCreate(&hStream[i],CU_STREAM_NON_BLOCKING);
        void *kernelParams[]= {(void*)&ptrs[i]};
        error = cuModuleLoad(&module[i], module_file);
        if(error!=CUDA_SUCCESS)
        {
            printf("Error happened in load moudle %d!\n",error);
            return -1;
        }
        error = cuModuleGetFunction(&function[i], module[i], kernel_name);
        if(error!=CUDA_SUCCESS)
        {
            printf("get double_function error!\n");
            return -1;
        }
        cuLaunchKernel(function[i],block_count,1,1,block_size,1,1,0,hStream[i],kernelParams, 0);
        error=cuStreamSynchronize(hStream[i]);
        if(error!=CUDA_SUCCESS)
        {
            printf("Error cuStreamSynchronize!:%d %d\n",i,error);
            return -1;
        }
    }
    
    auto start = std::chrono::high_resolution_clock::now();
    for(int i=0;i<nbGroups;i++)
    {
        void *kernelParams[]= {(void*)&ptrs[i]};
        cuLaunchKernel(function[i],block_count,1,1,block_size,1,1,0,hStream[i],kernelParams,0);  
    }
    
    for(int i=0;i<nbGroups;i++)
    {
        if(hStream[i]==0) break;
        error=cuStreamSynchronize(hStream[i]);
        if(error!=CUDA_SUCCESS)
        {
            printf("Error cuStreamSynchronize!:%d %d\n",i,error);
            return -1;
        }
    }
    auto end = std::chrono::high_resolution_clock::now(); 
    std::chrono::duration<double> diff = end - start; 
    printf("E2E:%7.2fms\n",diff.count()*1000);
    
    for(int i=0;i<nbGroups;i++)
    {
        if(hStream[i]==0) break;
        cuStreamDestroy(hStream[i]);
        cuCtxDestroy(cuContext[i]);
        cudaFree(ptrs[i]);
        cuModuleUnload(module[i]);
    }
    return 0;
}

// Seven green contexts: the device's SMs are split into 7 groups; each part of the
// workload gets its own green context and stream, and all parts run concurrently.
int green_ctx7_blk1_loop1(const char *kernel_name)
{
    CUresult error;
    cuInit(0);
    CUdevice cuDevice;
    int deviceCount = 0;
    error = cuDeviceGetCount(&deviceCount);
    error = cuDeviceGet(&cuDevice, 0);
    if(error!=CUDA_SUCCESS)
    {
        printf("Error happened in get device!\n");
        return -1;
    }
    CUdevResource resource;
    error=cuDeviceGetDevResource(cuDevice,&resource,CU_DEV_RESOURCE_TYPE_SM);//Get device resources.
    if(error!=CUDA_SUCCESS)
    {
        printf("Error cuDeviceGetDevResource!\n");
        return -1;
    }
    unsigned int nbGroups=7;
    CUdevResource output_resource[128];
    error=cuDevSmResourceSplitByCount(output_resource,&nbGroups,&resource,0,CU_DEV_SM_RESOURCE_SPLIT_IGNORE_SM_COSCHEDULING,4);
    if(error!=CUDA_SUCCESS)
    {
        printf("Error cuDevSmResourceSplitByCount:%d!\n",error);
        return -1;
    }      
    CUdevResourceDesc hDesc[128];
    for(unsigned int i=0;i<nbGroups;i++)
    {
        // cuDevResourceGenerateDesc builds a single descriptor per call,
        // so generate a separate descriptor for each SM group.
        error=cuDevResourceGenerateDesc(&hDesc[i],&output_resource[i],1);
        if(error!=CUDA_SUCCESS)
        {
            printf("Error cuDevResourceGenerateDesc!\n");
            return -1;
        }
    }

    int block_count=10000;
    int block_size=1024;
    int thread_size=block_count*block_size;
    int data_size=sizeof(float)*thread_size;

    CUmodule module[128]={0};
    CUfunction function[128]={0};
    CUgreenCtx hCtx[128]={0};
    CUstream hStream[128]={0};
    float *ptrs[128]={0};
    
    for(int i=0;i<nbGroups;i++)
    {
        error=cuGreenCtxCreate(&hCtx[i],hDesc[i],cuDevice,CU_GREEN_CTX_DEFAULT_STREAM );
        if(error!=CUDA_SUCCESS)
        {
            // cuGetErrorString only returns a pointer to a driver-owned string.
            const char *error_str = nullptr;
            cuGetErrorString(error, &error_str);
            printf("Error happened in create context:%s!\n", error_str);
            return -1;
        }        
        error=cuGreenCtxStreamCreate(&hStream[i],hCtx[i],CU_STREAM_NON_BLOCKING,0);
        if(error!=CUDA_SUCCESS)
        {
            printf("Error cuGreenCtxStreamCreate!\n");
            return -1;
        }
        // Convert the green context to a CUcontext and make it current, so the
        // module load and cudaMalloc below target it.
        CUcontext Context;
        cuCtxFromGreenCtx(&Context,hCtx[i]);
        cuCtxSetCurrent(Context);
        error = cuModuleLoad(&module[i], module_file);
        if(error!=CUDA_SUCCESS)
        {
            printf("Error happened in load moudle %d!\n",error);
            return -1;
        }
        error = cuModuleGetFunction(&function[i], module[i], kernel_name);
        if(error!=CUDA_SUCCESS)
        {
            printf("get double_function error!\n");
            return -1;
        }        
        cudaMalloc((void**)&ptrs[i],data_size);
        void *kernelParams[]= {(void*)&ptrs[i]};
        cuLaunchKernel(function[i],block_count,1,1,block_size,1,1,0,hStream[i],kernelParams,0);
        error=cuStreamSynchronize(hStream[i]);
        if(error!=CUDA_SUCCESS)
        {
            printf("Error cuStreamSynchronize!:%d %d\n",i,error);
            return -1;
        }
    }
    auto start = std::chrono::high_resolution_clock::now();
    for(int i=0;i<nbGroups;i++)
    {
        void *kernelParams[]= {(void*)&ptrs[i]};
        cuLaunchKernel(function[i],block_count,1,1,block_size,1,1,0,hStream[i],kernelParams,0);  
    }
    
    for(int i=0;i<nbGroups;i++)
    {
        if(hStream[i]==0) break;
        error=cuStreamSynchronize(hStream[i]);
        if(error!=CUDA_SUCCESS)
        {
            printf("Error cuStreamSynchronize!:%d %d\n",i,error);
            return -1;
        }
    }
    auto end = std::chrono::high_resolution_clock::now(); 
    std::chrono::duration<double> diff = end - start; 
    printf("E2E:%7.2fms\n",diff.count()*1000);
    
    for(int i=0;i<nbGroups;i++)
    {
        if(hStream[i]==0) break;
        cuStreamDestroy(hStream[i]);
        cuGreenCtxDestroy(hCtx[i]);
        cudaFree(ptrs[i]);
        cuModuleUnload(module[i]);
    }
    return 0;
}

int main(int argc,char *argv[])
{
    const char *kernel_name="_Z9Kernel_v1Pf";   // mangled name of Kernel_v1
    regular_ctx1(kernel_name,7,1);              // 1 regular context, whole workload in one launch
    regular_ctx1(kernel_name,1,7);              // 1 regular context, workload split into 7 launches
    green_ctx7_blk1_loop1(kernel_name);         // 7 green contexts, 7 parts run concurrently
    regular_ctx7_blk1_loop1(kernel_name);       // 7 regular contexts, 7 parts
    return 0;
}
EOF
g++ green_contexts_main.cpp -o green_contexts_main -I /usr/local/cuda/include -L /usr/local/cuda/lib64 -lcudart -lcuda -lpthread
./green_contexts_main