1. 参考 nv 的 __cudaRegisterFatBinary
nv 没有公开这个函数的实现,它藏在 cuda sdk 的运行时库中;但调用这个函数的源代码可以在 cuda sdk 安装后的头文件(crt/host_runtime.h)中找到;
比如在cuda 12.1中调用源代码如下:
#define __cudaRegisterBinary(X) \
__cudaFatCubinHandle = __cudaRegisterFatBinary((void*)&__fatDeviceText); \
{ void (*callback_fp)(void **) = (void (*)(void **))(X); (*callback_fp)(__cudaFatCubinHandle); __cudaRegisterFatBinaryEnd(__cudaFatCubinHandle); }\
atexit(__cudaUnregisterBinaryUtil)
#define __cudaRegisterVariable(handle, var, ext, size, constant, global) \
__cudaRegisterVar(handle, (char*)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterManagedVariable(handle, var, ext, size, constant, global) \
__cudaRegisterManagedVar(handle, (void **)&__host##var, (char*)__device##var, __name##var, ext, size, constant, global)
#define __cudaRegisterGlobalTexture(handle, tex, dim, norm, ext) \
__cudaRegisterTexture(handle, (const struct textureReference*)&tex, (const void**)(void*)__device##tex, __name##tex, dim, norm, ext)
#define __cudaRegisterGlobalSurface(handle, surf, dim, ext) \
__cudaRegisterSurface(handle, (const struct surfaceReference*)&surf, (const void**)(void*)__device##surf, __name##surf, dim, ext)
#define __cudaRegisterEntry(handle, funptr, fun, thread_limit) \
__cudaRegisterFunction(handle, (const char*)funptr, (char*)__device_fun(fun), #fun, -1, (uint3*)0, (uint3*)0, (dim3*)0, (dim3*)0, (int*)0)
对这个函数的调用会被编译器注入进 cuda app,并在 main 函数之前先被执行;
起到的作用大概是调用进 cuda runtime,把 cuda app中需要使用的 fatbin 和其中的cubin 解析出来,并发送给 gpu 预备好调用;
fatbin文件放置在 cuda app 文件的 nv官方自定义段 nvFatBinSegment 中;
2. gpgpu-sim 的 __cudaRegisterFatBinary
官方文档:
__cudaRegisterFatBinary(void*) 被执行到的代码逻辑如下:
void** CUDARTAPI __cudaRegisterFatBinary( void *fatCubin )
{
#if (CUDART_VERSION < 2010)
printf("GPGPU-Sim PTX: ERROR ** this version of GPGPU-Sim requires CUDA 2.1 or higher\n");
exit(1);
#endif
CUctx_st *context = GPGPUSim_Context();
static unsigned next_fat_bin_handle = 1;
if(context->get_device()->get_gpgpu()->get_config().use_cuobjdump()) {
// The following workaround has only been verified on 64-bit systems.
if (sizeof(void*) == 4)
printf("GPGPU-Sim PTX: FatBin file name extraction has not been tested on 32-bit system.\n");
// FatBin handle from the .fatbin.c file (one of the intermediate files generated by NVCC)
typedef struct {int m; int v; const unsigned long long* d; char* f;} __fatDeviceText __attribute__ ((aligned (8)));
__fatDeviceText * fatDeviceText = (__fatDeviceText *) fatCubin;
// Extract the source code file name that generate the given FatBin.
// - Obtains the pointer to the actual fatbin structure from the FatBin handle (fatCubin).
// - An integer inside the fatbin structure contains the relative offset to the source code file name.
// - This offset differs among different CUDA and GCC versions.
char * pfatbin = (char*) fatDeviceText->d;
int offset = *((int*)(pfatbin+48));
char * filename = (pfatbin+16+offset);
// The extracted file name is associated with a fat_cubin_handle passed
// into cudaLaunch(). Inside cudaLaunch(), the associated file name is
// used to find the PTX/SASS section from cuobjdump, which contains the
// PTX/SASS code for the launched kernel function.
// This allows us to work around the fact that cuobjdump only outputs the
// file name associated with each section.
unsigned long long fat_cubin_handle = next_fat_bin_handle;
next_fat_bin_handle++;
printf("GPGPU-Sim PTX: __cudaRegisterFatBinary, fat_cubin_handle = %llu, filename=%s\n", fat_cubin_handle, filename);
/*!
* This function extracts all data from all files in first call
* then for next calls, only returns the appropriate number
*/
assert(fat_cubin_handle >= 1);
if (fat_cubin_handle==1) cuobjdumpInit();
cuobjdumpRegisterFatBinary(fat_cubin_handle, filename);
return (void**)fat_cubin_handle;
}else{ ... }
}
2.1. 调用关系
初始化阶段最先发生的一批调用关系如下:
为方便索引代码,此处按"被调用者在前、调用者在后"的顺序整理,即下面的函数会调用其上方先列出的函数:
class gpgpu_functional_sim_config
{ ...
int m_ptx_use_cuobjdump;
...
}
void gpgpu_functional_sim_config::reg_options(class OptionParser * opp)
{ ...
option_parser_register(opp,
"-gpgpu_ptx_use_cuobjdump", OPT_BOOL,
&m_ptx_use_cuobjdump,
"Use cuobjdump to extract ptx and sass from binaries",
"1");//CUDART_VERSION >= 4000
...
}
gpgpu_sim *gpgpu_ptx_sim_init_perf()
{ ...
g_the_gpu_config.reg_options(opp);
...
}
class _cuda_device_id *GPGPUSim_Init()
{ ...
gpgpu_sim *the_gpu = gpgpu_ptx_sim_init_perf();
the_gpu->set_prop(prop);
the_device = new _cuda_device_id(the_gpu);
start_sim_thread(1);
...
}
void** CUDARTAPI __cudaRegisterFatBinary( void *fatCubin )
{ ...
static CUctx_st* GPGPUSim_Context()
class _cuda_device_id *GPGPUSim_Init()
CUctx_st( _cuda_device_id *gpu ) { m_gpu = gpu; }//the_context = new CUctx_st(the_gpu);
cuobjdumpInit();
cuobjdumpRegisterFatBinary(fat_cubin_handle, filename);
...
}
2.2. GPGPUSim_Context() 做了什么
2.3. use_cuobjdump() 表示什么含义
GPGPUSim_Context()->get_device()->get_gpgpu()->get_config().use_cuobjdump() 表示什么含义