gpfifo是一个基础组件,是channel中FIFO的基础;channel通过它作为CPU和GPU之间的通道以及marker的容器。
可以说,CUDA中GPU和CPU沟通的核心就要看这个部件。
overview
因为没有相关文档,直接看代码,核心两个fifo和fifoEntry
// GPFIFO constants
// Ring depth used on most configurations.
#define CU_GPFIFO_ENTRY_COUNT_LARGE 1024
#define CU_GPFIFO_ENTRY_COUNT_SMALL 8 // Bug 555146. Depth of 8 makes a WDDM desktop usable-ish when Folding@Home on a 2SM GPU.
// MPS server channels use a deeper ring (2x large) to absorb client submissions.
#define CU_GPFIFO_ENTRY_COUNT_MPS_SERVER (CU_GPFIFO_ENTRY_COUNT_LARGE * 2)
#define CU_GPFIFO_SMALL_ENTRY_SM_THRESHOLD 4 // Bug 555146. Arbitrary threshold.
// Size in bytes of one hardware GPFIFO entry.
#define CU_GPFIFO_ENTRY_SIZE 0x8
// Number of spare "padding" entries kept free when checking for space.
#define CU_GPFIFO_ENTRY_PAD_COUNT 0x4
// Pushbuffer allocation size (1 MiB) used on SOC platforms.
#define CU_GPFIFO_PUSHBUF_ALLOC_SIZE_SOC (1024*1024)
// GPFIFO types
typedef struct CUgpfifoConfig_st CUgpfifoConfig;   // creation-time configuration
typedef struct CUgpfifoEntry_st CUgpfifoEntry;     // SW tracking state for one HW entry
typedef struct CUgpfifoFlushItem_st CUgpfifoFlushItem; // batch descriptor for a flush
typedef struct CUgpfifo_st CUgpfifo;               // the GPFIFO ring itself
FIFO code
先看两个辅助struct
// Config descriptor passed to gpfifoCreate().
struct CUgpfifoConfig_st
{
    // Number of GPFIFO entries. This will either be CU_GPFIFO_ENTRY_COUNT_LARGE
    // or CU_GPFIFO_ENTRY_COUNT_SMALL.
    NvU32 entryCount;
    // Allocate an additional video memory pushbuffer and map it over BAR1.
    NvBool allocVidmemPushbuffer:1;
    // Put the gpfifo allocation in vidmem and map it over BAR1.
    NvBool vidmemGpfifo:1;
    // If the GPFIFO maintained by CUDA is not the GPFIFO used on the device
    // (which is the case for dmal where we don't do the submission directly,
    // e.g. some WDDM configurations), doing the mapping is useless.
    NvBool deviceMappedGpfifo:1;
    // Allow the suballocation of the Memblock used to back the GPFIFO. Some
    // UMD needs to reference the GPFIFO by using the only allocation handle
    // making suballocation not suitable.
    NvBool suballocateGpfifo:1;
    // Set to true if we should release a semaphore at the
    // start of each entry (in addition to the end of each entry)
    // - this is used to improve WDDM<->WDDM and WDDM<->TCC
    // synchronization
    NvBool releaseSemaphoreAtEntryStart:1;
    // Size in bytes of the pushbuffer to create
    NvU32 pushbufferSize;
};
// GPFIFO flush structure: describes one batch of GPFIFO entries, possibly
// spanning several channels of the same flush unit, to be handed to the
// DM layer for GPU Put update (see channelFlushUnitFlushGPFIFO()).
struct CUgpfifoFlushItem_st
{
    // The flush unit that all the GPFIFO entries belong to
    CUchannelFlushUnit *flushUnit;
    // Number of valid elements in the gpfifos[] array below
    NvU32 gpfifoCount;
    // Per-channel ranges of entries to flush
    struct {
        CUgpfifo *gpfifo;
        NvU32 firstEntry;   // index of the first entry to flush
        NvU32 entryCount;   // number of entries (wraps around the ring)
    } gpfifos[CU_CHANNEL_COUNT_MAX];
    // Should an interrupt be triggered after completion
    // of the command buffer?
    NvBool awakenAfterCompletion;
    // N.B, this does *not* capture which CUmemblocks need to be paged
    // in for this flush
};
FIFO
对于FIFO有几个指针非常重要,gpuGet gpuPut cpuPut
另外其push buffer最后全部被entry使用到
// GPFIFO structure: SW tracking of a hardware GPFIFO ring.
// Invariant (from the accessors below): cpuPut, gpuPut, gpuGet all index
// into entries[] and wrap modulo config.entryCount.
struct CUgpfifo_st
{
    // back-pointer to the channel which owns this GPFIFO
    CUnvchannel *channel;
    // configuration parameters (mostly set by driver model)
    CUgpfifoConfig config;
    // memory for GPFIFO (the HW-visible entry array, sysmem or vidmem)
    CUmemobj *gpfifoMemobj;
    // pushbuffers; sysmemPushbuffer always exists, vidmemPushbuffer only
    // when config.allocVidmemPushbuffer was set
    CUpushbuffer *sysmemPushbuffer;
    CUpushbuffer *vidmemPushbuffer;
    // semaphore tracking gpuPut
    CUsema *sema;
    // SW shadow of the ring. Number of entries is given by config.entryCount
    CUgpfifoEntry* entries;
    // first index the CPU hasn't completed writing to (this is the entry where we are pushing data)
    NvU32 cpuPut;
    // first index that has not been flushed down to the GPU (so if gpuPut == cpuPut
    // then all data except what is being written into the cpuPut entry has been
    // flushed; if gpuPut == cpuPut and the cpuPut entry has "length 0" then all
    // data is down to the gpu)
    NvU32 gpuPut;
    // first index that the GPU hasn't finished reading yet (so if gpuGet == gpuPut then the GPU has
    // read everything pushed down to it)
    NvU32 gpuGet;
    // mechanism used to splice in segments of methods from mps clients
    // (consumed by gpfifoFlushGpfifoEntry when mpsMethodsActive is set)
    NvBool mpsMethodsActive;
    NvU64 mpsMethodsDeviceVaddr;
    NvU32 mpsMethodsCount;
};
fifoEntry
它的pushbuffer其实就是FIFO上面的pushbuffer
// GPFIFO entry structure: SW bookkeeping for one hardware GPFIFO slot.
struct CUgpfifoEntry_st
{
    // pushbuffer responsible for this gpfifo entry (if set pushbufferDevAddr must be null).
    // This is one of the owning GPFIFO's sysmemPushbuffer/vidmemPushbuffer: to write a
    // method into the channel, an entry is cut and its pushbuffer points at the ring
    // storage that the method data lives in.
    CUpushbuffer *pushbuffer;
    // Device side address of the pushbuffer (if set pushbuffer must be null)
    NvU64 pushbufferDevAddr;
    // Offset in pushbuffer
    NvU64 offset;
    // set via gpfifoSetSync. when true, GPFifo sync bit is set to "wait", forcing host
    // to finish processing PB segment N-1 before starting to fetch segment N .
    NvBool wait:1;
    // bytes written to this entry thus far
    NvU32 length;
    // bytes reserved for this push
    NvU32 spaceReserved;
    // the tracking semaphore value on the channel at the end of this GPFIFO entry. once the channel
    // has progressed so that its gpu-finished tracking semaphore value is equal to or above this marker,
    // we know the GPFIFO entry is free.
    // **n.b,** this means that we are not "done" with a GPFIFO entry until the GPU has finished
    // executing/consuming all methods/data pushed in this entry, (*not* that the methods/data have
    // been prefetched). in particular, it may be that the GPU will read directly from the pushbuffer
    // data in a GPFIFO entry while executing this GPFIFO entry, so we cannot discard the data until the
    // GPU is finished processing the entry (see teslaSemaphoreReleaseCDHack for an example of this).
    NvU64 trackSemValEnd;
    NvU64 trackSemValStart;
};
有关创建函数
核心是计算gpfifoSize
// initialization and tear-down
// Fill 'config' with defaults for the given context/channel type.
void gpfifoSetDefaultConfig(CUgpfifoConfig* config, CUctx *ctx, CUchannelType channelType);
// Allocate and attach a GPFIFO (entries, backing memory, pushbuffers) to 'channel'.
CUresult gpfifoCreate(CUnvchannel* channel, CUgpfifoConfig* config);
// Tear down a GPFIFO previously created with gpfifoCreate().
void gpfifoDestroy(CUgpfifo* gpfifo);
// Compute the total backing-store size needed for the GPFIFOs of every
// compute and async channel, rounded per channel to the heap alignment
// the CUDA memory allocator will use.
NvU64
gpfifoCalculateTotalSize(CUmemmgr* memmgr, CUmemdesc* memdesc)
{
    CUctx *ctx = memmgr->ctx;
    NvU32 computeChannels;
    NvU32 asyncChannels;
    NvU64 entriesPerChannel;
    NvU64 bytesPerChannel;
    CU_ASSERT(memdesc->flags.type == CU_MEM_TYPE_GPFIFOBUFFER);
    // Count the channels whose GPFIFOs share this allocation.
    computeChannels = channelManagerCalculateComputeChannelCount(ctx);
    asyncChannels = ctx->device->state.asyncEngineCount * channelManagerCalculateAsyncChannelCount(ctx);
    // Per-channel entry count from the DM layer; the global override wins when set.
    entriesPerChannel = ctx->device->dmal.getGpfifoEntryCountForChannel(ctx);
    if (globals.gpfifoEntryCountSet) {
        entriesPerChannel = globals.gpfifoEntryCount;
    }
    // Round each channel's GPFIFO up to the allocator's heap alignment.
    bytesPerChannel = ROUND_UP((entriesPerChannel * CU_GPFIFO_ENTRY_SIZE),
                               memmgr->device->hal.memblockGetHeapAlignment(memmgr, memdesc));
    return bytesPerChannel * (computeChannels + asyncChannels);
}
// Populate 'config' with the default GPFIFO settings for this context.
// 'channelType' is accepted for interface symmetry but not consulted here.
void
gpfifoSetDefaultConfig(CUgpfifoConfig* config, CUctx *ctx, CUchannelType channelType)
{
    CU_TRACE_FUNCTION();
    // Start from all-zero defaults, then turn on what we want.
    memset(config, 0, sizeof(*config));
    // Default: GPFIFO is mapped on the device and suballocated from a memblock.
    config->deviceMappedGpfifo = NV_TRUE;
    config->suballocateGpfifo = NV_TRUE;
    // No extra vidmem pushbuffer by default.
    config->allocVidmemPushbuffer = NV_FALSE;
    // Place the GPFIFO itself in vidmem only when the device supports it.
    config->vidmemGpfifo = ctx->device->state.gpfifoInVidmemSupported;
}
// Create and attach a GPFIFO to 'channel':
//  - applies global overrides (entry count, gpfifo/pushbuffer placement),
//  - allocates the SW tracking structure and the entries[] shadow array,
//  - allocates the HW-visible GPFIFO memory object,
//  - creates the sysmem pushbuffer and, optionally, a vidmem pushbuffer.
// On any failure, everything allocated so far is released and an error is returned.
CUresult
gpfifoCreate(CUnvchannel* channel, CUgpfifoConfig* config)
{
    CUctx* ctx;
    CUgpfifo* gpfifo;
    CUresult status;
    size_t entriesSize;
    CUmemdesc memdesc;
    CU_TRACE_FUNCTION();
    CU_ASSERT(config);
    CU_ASSERT(channel);
    CU_ASSERT(!channel->gpfifo);
    ctx = channel->channelManager->ctx;
    // Override entry count (env-var driven global)
    if (globals.gpfifoEntryCountSet) {
        config->entryCount = globals.gpfifoEntryCount;
    }
    // Force gpfifo location; forcing vidmem also forces the device mapping
    if(globals.forceGpfifoAlloc != CUDA_FORCE_GPFIFO_DEFAULT) {
        config->vidmemGpfifo = (globals.forceGpfifoAlloc == CUDA_FORCE_GPFIFO_IN_VIDMEM);
        config->deviceMappedGpfifo = (globals.forceGpfifoAlloc == CUDA_FORCE_GPFIFO_IN_VIDMEM);
    }
    // Force pushbuffer location
    if(globals.forcePushbufferAlloc != CUDA_FORCE_PUSHBUFFER_DEFAULT) {
        config->allocVidmemPushbuffer = (globals.forcePushbufferAlloc == CUDA_FORCE_PUSHBUFFER_IN_VIDMEM);
    }
    // Allocate the GPFIFO tracking structure.
    gpfifo = (CUgpfifo* )malloc(sizeof(CUgpfifo) );
    if (!gpfifo) {
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    memset(gpfifo, 0, sizeof(CUgpfifo) );
    gpfifo->channel = channel;
    gpfifo->config = *config;
    // Ring starts empty: all three indices at slot 0.
    gpfifo->gpuGet = 0;
    gpfifo->gpuPut = 0;
    gpfifo->cpuPut = 0;
    // Allocate the GPFIFO entries (SW shadow array).
    // Note the size computation here: entryCount SW entries, each sizeof(CUgpfifoEntry).
    CU_TRACE_PRINT(("GPFIFO entry count: %u\n", config->entryCount));
    entriesSize = config->entryCount * sizeof(*gpfifo->entries);
    gpfifo->entries = (CUgpfifoEntry *)malloc(entriesSize);
    if (!gpfifo->entries) {
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    memset(gpfifo->entries, 0, entriesSize);
    // Allocate the HW-visible GPFIFO memory object.
    memset(&memdesc, 0x0, sizeof(memdesc));
    memdesc.pChannel = channel;
    if (config->vidmemGpfifo) {
        memdesc.flags.location = CU_MEM_LOCATION_DEVICE;
        memdesc.flags.cacheHost = CU_MEM_CACHE_HOST_DISABLED;
    }
    else {
        memdesc.flags.location = CU_MEM_LOCATION_HOST;
        memdesc.flags.cacheHost = CU_GPFIFO_AND_PUSHBUFFER_HOST_CACHE_TYPE;
    }
    memdesc.flags.owner = CU_MEM_OWNER_DRIVER;
    memdesc.flags.type = CU_MEM_TYPE_GPFIFOBUFFER;
    memdesc.flags.mapHost = CU_MEM_MAP_HOST_VA;
    memdesc.flags.noSuballoc = !config->suballocateGpfifo;
    if (config->deviceMappedGpfifo) {
        // GPFIFO must use 40-bit VA.
        memdesc.flags.mapDevice = CU_MEM_MAP_DEVICE_VA_FORCE_40_BIT;
    }
    else {
        memdesc.flags.mapDevice = CU_MEM_MAP_DEVICE_NONE;
        // If we are not going to map the GPFIFO on the device there is no need to disable or set specific caching attributes
        // so simply set it the cache to enabled and override previous settings.
        memdesc.flags.cacheHost = CU_MEM_CACHE_HOST_ENABLED;
    }
    cuiPerformanceBegin("gpfifoMemobjAlloc", CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);
    status = memobjAlloc(
        ctx->memmgr,
        &memdesc,
        CU_GPFIFO_ENTRY_SIZE * config->entryCount,
        &gpfifo->gpfifoMemobj);
    cuiPerformanceEnd("gpfifoMemobjAlloc", CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);
    if (status != CUDA_SUCCESS) {
        CU_DEBUG_PRINT(("Unable to allocate GPFIFO\n"));
        goto Error;
    }
    // Allocate a pushbuffer in sysmem (always present).
    status = pushbufferCreate(ctx, config->pushbufferSize, CU_PUSHBUFFER_IN_SYSMEM, &gpfifo->sysmemPushbuffer);
    if (status != CUDA_SUCCESS) {
        goto Error;
    }
    // Allocate a pushbuffer in vidmem (optional).
    if (config->allocVidmemPushbuffer) {
        status = pushbufferCreate(ctx, config->pushbufferSize, CU_PUSHBUFFER_IN_VIDMEM, &gpfifo->vidmemPushbuffer);
        if (status != CUDA_SUCCESS) {
            goto Error;
        }
    }
    channel->gpfifo = gpfifo;
    return CUDA_SUCCESS;
Error:
    // Unwind: gpfifo was zeroed on allocation, so every sub-resource pointer
    // is either valid or NULL and can be released unconditionally.
    if (gpfifo) {
        if (gpfifo->vidmemPushbuffer) {
            pushbufferDestroy(gpfifo->vidmemPushbuffer);
        }
        if (gpfifo->sysmemPushbuffer) {
            pushbufferDestroy(gpfifo->sysmemPushbuffer);
        }
        if (gpfifo->gpfifoMemobj) {
            memobjFree(&gpfifo->gpfifoMemobj);
        }
        free(gpfifo->entries);
        free(gpfifo);
    }
    return status;
}
// Tear down a GPFIFO created by gpfifoCreate(). When the channel is
// error-free, asserts that all pushed work was consumed (all three ring
// indices agree) before releasing pushbuffers, the GPFIFO memory object,
// the entries array, and the tracking structure itself.
void
gpfifoDestroy(CUgpfifo* gpfifo)
{
    CUctx* ctx = NULL;
    CU_TRACE_FUNCTION();
    CU_ASSERT(gpfifo);
    ctx = gpfifo->channel->channelManager->ctx;
    // if there weren't any errors on this channel, then assert that the channel
    // got through all of the work that was pushed.
    // However before doing that we need to be sure the channel tracking semaphore
    // has been fully initialized since we might hit this codepath during channel init
    if (gpfifo->channel->trackingSemaphoreData.semaphore) {
        gpfifoAdvanceGpuGet(gpfifo);
    }
    if (CUDA_SUCCESS == cuiCtxCheckError(ctx, CUI_CTX_CHECK_STICKY_ONLY)) {
        CU_ASSERT(gpfifo->gpuGet == gpfifo->gpuPut && gpfifo->gpuGet == gpfifo->cpuPut);
    }
    // Release in reverse order of creation: pushbuffers, then memobj.
    if (gpfifo->vidmemPushbuffer) {
        pushbufferDestroy(gpfifo->vidmemPushbuffer);
    }
    if (gpfifo->sysmemPushbuffer) {
        pushbufferDestroy(gpfifo->sysmemPushbuffer);
    }
    if (gpfifo->gpfifoMemobj) {
        memobjFree(&gpfifo->gpfifoMemobj);
    }
    free(gpfifo->entries);
    free(gpfifo);
}
写数据
下面的代码是channel用来写数据的关键部分。
CUSW_UNIT_CHANNEL implements a SW channel abstraction corresponding to HW channels for work submission. The CPU writes GPU methods (GPU commands) into the channel and GPU reads them from the channel. The CPU and GPU can be viewed as a producer-consumer pair talking to each other using a channel.
One channel acts as conduit of commands to one engine on the GPU (compute, memcpy, etc.). Methods submitted to a channel must be targeted for the engine that the channel is associated with. The channel’s type reflects this mapping. Multiple channels of same type are grouped into a channel pool as described later.
Each channel has a ring-buffer known as a pushbuffer. CPU writes GPU methods and data into the pushbuffer. The pointer at which data is written to by CPU is called as CPU put pointer (the tail of the pushbuffer). The pointer from which GPU reads is called as GPU get pointer (the head of the pushbuffer).
GPU methods written to a channel are taken up by GPU for execution in the order in which they were written. Work completion however may occur out of order for some types of channels. In addition, methods submitted in different channels may start executing in parallel.
GPU methods and data written to the channel’s pushbuffer may be buffered by the driver and sent to GPU in batches. Sending such buffered contents to GPU is termed as a channel flush. Clients of CUSW_UNIT_CHANNEL must explicitly ask the channel to flush its contents to ensure that the methods and data written to the channel will eventually be seen by the GPU. For instance, when busy-waiting on the CPU for a GPU action to take place (e.g. waiting for the completion of a kernel launch), the channel has to be flushed before starting the wait. Otherwise the GPU may never execute the action and the CPU thread waiting for it will deadlock.
/**
 * Given a set of push flags, pick the pushbuffer object a new push should target.
 *
 * The sysmem preference flag is honored only when the pushbuffer location is
 * not forced via the env var; otherwise the vidmem pushbuffer (when it exists)
 * is the default, with sysmem as the fallback.
 */
CUpushbuffer*
gpfifoGetPushbuffer(CUgpfifo *gpfifo, FLAG_SET(CUIpushFlags) flags)
{
    NvBool useSysmem;
    CU_ASSERT(gpfifo);
    // Sysmem is chosen when the caller asked for it and no forced location
    // is in effect, or when no vidmem pushbuffer was allocated at all.
    useSysmem = (((flags & CUI_PUSH_PREFER_SYSMEM_PUSHBUFFER) != 0) &&
                 (globals.forcePushbufferAlloc == CUDA_FORCE_PUSHBUFFER_DEFAULT)) ||
                (gpfifo->vidmemPushbuffer == NULL);
    return useSysmem ? gpfifo->sysmemPushbuffer : gpfifo->vidmemPushbuffer;
}
// Return NV_TRUE when the pushbuffer selected by 'flags' has at least
// 'spaceNeeded' bytes free. Tries once without updating the GPU get
// pointer (refreshing it is expensive, same reasoning as in
// gpfifoHasFreeEntries), then reclaims completed entries and retries.
static NvBool
gpfifoHasPushbufferSpace(CUgpfifo* gpfifo, NvU32 spaceNeeded, FLAG_SET(CUIpushFlags) flags)
{
    CUpushbuffer *pushbuffer;
    CU_TRACE_FUNCTION();
    CU_ASSERT(gpfifo);
    CU_ASSERT(gpfifo->cpuPut <= gpfifo->config.entryCount);
    CU_ASSERT(gpfifo->gpuPut <= gpfifo->config.entryCount);
    // Select the pushbuffer the caller intends to push into.
    pushbuffer = gpfifoGetPushbuffer(gpfifo, flags);
    // Cheap path: space is already available.
    if (pushbufferHasSpace(pushbuffer, spaceNeeded)) {
        return NV_TRUE;
    }
    // Slow path: advance the GPU get pointer to reclaim finished entries,
    // then check one more time.
    gpfifoAdvanceGpuGet(gpfifo);
    return pushbufferHasSpace(pushbuffer, spaceNeeded) ? NV_TRUE : NV_FALSE;
}
// Finalize the current cpuPut entry and write it into the HW GPFIFO.
// If MPS methods are pending, an extra entry is spliced in first: the
// current entry's data is moved forward one slot and the vacated slot is
// pointed at the client-supplied method buffer.
// Note: this does not advance gpfifo->cpuPut past the written entry;
// callers pair this with gpfifoAdvanceCpuPut().
static void
gpfifoFlushGpfifoEntry(CUgpfifo *gpfifo)
{
    NvU32 cpuPutIndex;
    CUgpfifoEntry *cpuPutEntry;
    CUctx *ctx;
    CU_TRACE_FUNCTION();
    CU_ASSERT(gpfifo);
    ctx = gpfifo->channel->channelManager->ctx;
    // Get the current put index.
    cpuPutIndex = gpfifo->cpuPut;
    cpuPutEntry = gpfifo->entries + cpuPutIndex;
    // Splice in additional gpfifo entry for mps if needed.
    // - move "cpuPutEntry" forward by one
    // - add the supplied methods in the space
    if (gpfifo->mpsMethodsActive) {
        // preCpuPutIndex is the GPFIFO index where we'll be pointing to the
        // spliced methods
        NvU32 preCpuPutIndex = cpuPutIndex;
        CUgpfifoEntry *preCpuPutEntry = cpuPutEntry;
        // Assert that CPU architecture is not ARMv7.
        // Mps is currently not supported for ARM
        CU_ASSERT(!cuiArchIsARMv7());
        // cpuPutIndex is the push that's currently being filled, move it
        // forward by one, and copy its old data over
        cpuPutIndex = (gpfifo->cpuPut + 1) % gpfifo->config.entryCount;
        cpuPutEntry = gpfifo->entries + cpuPutIndex;
        memcpy(cpuPutEntry, preCpuPutEntry, sizeof(CUgpfifoEntry));
        // mark that the "pre" entry will be completed when the original cpu put entry
        // is done (by setting its trackSemValEnd to the current value)
        preCpuPutEntry->offset = 0;
        preCpuPutEntry->pushbuffer = NULL;
        preCpuPutEntry->length = 0;
        preCpuPutEntry->trackSemValStart = 0;
        preCpuPutEntry->trackSemValEnd = trackingSemaGetLastReleaseValue(&gpfifo->channel->trackingSemaphoreData);
        // build the gpfifo entry bits based on the spliced entry
        ctx->device->hal.setGpfifoEntry(
            (NvU32 *)gpfifoEntryCpuPointer(gpfifo, preCpuPutIndex),
            gpfifo->mpsMethodsDeviceVaddr,
            gpfifo->mpsMethodsCount,
            globals.forceGpfifoWait);
        // clear the state (one-shot: consumed by this flush)
        gpfifo->mpsMethodsActive = NV_FALSE;
        gpfifo->mpsMethodsDeviceVaddr = 0;
        gpfifo->mpsMethodsCount = 0;
        // Update cpu put index
        gpfifo->cpuPut = cpuPutIndex;
    }
    // Report cpuPut, gpuPut as updated in GPU host (tools callback)
    cuiToolsNotifyFifoCpuPutStart(gpfifo, cpuPutEntry);
    // Write out cpu put to the gpfifo. Exactly one of pushbuffer /
    // pushbufferDevAddr identifies where the methods live.
    CU_ASSERT(cpuPutEntry->pushbuffer || cpuPutEntry->pushbufferDevAddr);
    ctx->device->hal.setGpfifoEntry(
        (NvU32*)gpfifoEntryCpuPointer(gpfifo, cpuPutIndex),
        (cpuPutEntry->pushbuffer ? pushbufferGetDeviceVaddr(cpuPutEntry->pushbuffer) : cpuPutEntry->pushbufferDevAddr) + cpuPutEntry->offset,
        cpuPutEntry->length,
        globals.forceGpfifoWait || cpuPutEntry->wait);
    // Report cpuPut, gpuPut as updated in GPU host (tools callback)
    cuiToolsNotifyFifoCpuPutEnd(gpfifo, cpuPutEntry);
    // Get the tracking semaphore value at which this gpfifo entry will be complete.
    cpuPutEntry->trackSemValEnd = trackingSemaGetLastReleaseValue(&gpfifo->channel->trackingSemaphoreData);
    CU_ASSERT(cpuPutEntry->trackSemValStart <= cpuPutEntry->trackSemValEnd);
}
// Advance cpuPut to the next ring slot (wrapping modulo entryCount) and
// zero the slot so it starts as a fresh, empty entry.
static void
gpfifoAdvanceCpuPut(CUgpfifo* gpfifo)
{
    NvU32 nextIndex;
    CUgpfifoEntry *nextEntry;
    CU_TRACE_FUNCTION();
    // Next slot, wrapping around the end of the ring.
    nextIndex = (gpfifo->cpuPut + 1) % gpfifo->config.entryCount;
    nextEntry = &gpfifo->entries[nextIndex];
    // Publish the new put position, then reset the slot's tracking state.
    gpfifo->cpuPut = nextIndex;
    memset(nextEntry, 0, sizeof(*nextEntry));
}
// Advance gpuGet past every entry the GPU has finished consuming, based on
// the channel's tracking semaphore. Entries complete in order, so the scan
// stops at the first incomplete entry. The pushbuffer 'get' is updated once,
// from the last completed entry, rather than entry by entry.
void
gpfifoAdvanceGpuGet(CUgpfifo* gpfifo)
{
    // Read the tracking sema value once and use it for the rest of the function
    CUtrackingSemaData *semaData = &gpfifo->channel->trackingSemaphoreData;
    NvU64 trackingSemaValue =
        trackingSemaDataUpdateAndGetLastCompleted(semaData, trackingSemaGetLastCompletedValue(semaData));
    CUgpfifoEntry* entry = NULL;
    CUgpfifoEntry* lastCompletedEntry = NULL;
    NvU32 start = gpfifo->gpuGet;
    NvU32 end = gpfifo->gpuPut;
    CU_TRACE_FUNCTION();
    // Quick check to see if we have wrap-around. We can then choose the wrap-around
    // point as a midpoint and eliminate a bunch of entries (either to the left or to the right)
    if (end < start) {
        entry = gpfifo->entries + (gpfifo->config.entryCount - 1);
        // If its not complete, we don't need to see past this point (into the wrap-around)
        if (!trackingSemaValueIsComplete(trackingSemaValue, entry->trackSemValEnd)) {
            end = gpfifo->config.entryCount - 1;
        }
        else {
            // Otherwise its complete, we can discard everything before and start at the wrap-around point
            lastCompletedEntry = entry;
            start = 0;
        }
    }
    // Look to see which entries are completed
    // If entry 'i + 1' is complete, this implies entry 'i' is complete
    while (start < end) {
        entry = gpfifo->entries + start;
        if (!trackingSemaValueIsComplete(trackingSemaValue, entry->trackSemValEnd)) {
            break;
        }
        lastCompletedEntry = entry;
        start++;
    }
    // Update pushbuffer at once (by setting 'get' directly) instead of entry by entry.
    // NOTE(review): gpuGet is only advanced when the last completed entry has a
    // pushbuffer; per the comment below this should always hold when anything
    // completed, but it is worth confirming for entries submitted via
    // pushbufferDevAddr only.
    if (lastCompletedEntry && lastCompletedEntry->pushbuffer) {
        CU_ASSERT(lastCompletedEntry->trackSemValEnd > 0);
        CU_ASSERT(trackingSemaHasCompletedValue(semaData, lastCompletedEntry->trackSemValEnd));
        // MPS server can push entries without a PB associated to track client submissions
        // These entries are spliced in and paired with the actual client submission,
        // 'paired' meaning they both have the same tracking sema release value.
        // So if one is done, the other is as well, therefore we should never have a case where the
        // entry doesn't have a pushbuffer, since the one WITH the pushbuffer (server-side) would be done as well.
        pushbufferSetGet(lastCompletedEntry->pushbuffer, (NvU32)(lastCompletedEntry->offset + lastCompletedEntry->length));
        gpfifo->gpuGet = start;
    }
}
/*
 *
 * GPFIFO and pushbuffer offset and pointer accessors
 *
 */
// Device virtual address of GPFIFO slot 'entry' inside the GPFIFO memobj.
NvU64
gpfifoEntryOffset(CUgpfifo* gpfifo, NvU32 entry)
{
    NvU64 base = memobjGetDeviceVaddr(gpfifo->gpfifoMemobj);
    return base + entry * CU_GPFIFO_ENTRY_SIZE;
}
// CPU-visible pointer to GPFIFO slot 'entry' inside the GPFIFO memobj.
void*
gpfifoEntryCpuPointer(CUgpfifo* gpfifo, NvU32 entry)
{
    char *base = (char*)memobjGetHostPtr(gpfifo->gpfifoMemobj);
    return (void*)(base + entry * CU_GPFIFO_ENTRY_SIZE);
}
/*
 *
 * channel DMAL methods for DMs that use GPFIFOs
 *
 */
// Return NV_TRUE when 'channel' can accept a push of 'spaceRequested' bytes:
// the selected pushbuffer must have room, the channel's GPFIFO must have a
// few spare entries, and (for a TSG slave channel) the master channel must
// have spare entries too, since a join will be pushed there on flush.
// Fix: 'channelFlushUnit' was assigned from channel->channelFlushUnit twice
// (at declaration and again in the body); the redundant assignment is removed.
NvBool
channelCanAdvanceGPFIFO(CUnvchannel *channel, NvU32 spaceRequested, FLAG_SET(CUIpushFlags) flags)
{
    CUchannelFlushUnit *channelFlushUnit = channel->channelFlushUnit;
    CUgpfifo *gpfifo;
    NvU32 entryCountRequired = CU_GPFIFO_ENTRY_PAD_COUNT;
    CU_TRACE_FUNCTION();
    // Get current gpfifo entry
    gpfifo = channel->gpfifo;
    // This can't be queried during a push
    CU_ASSERT(!channel->channelManager->currentPush.active);
    // Verify that we have spaceRequested bytes available to write.
    if (!gpfifoHasPushbufferSpace(gpfifo, spaceRequested, flags)) {
        return NV_FALSE;
    }
    // Verify that we have a few GPFIFO entries to cut as well.
    if (!gpfifoHasFreeEntries(gpfifo, entryCountRequired)) {
        return NV_FALSE;
    }
    // If this is a slave channel in TSG, it also needs to verify that the master channel, which we will
    // have to push a join with this channel when this channel flushes, has enough GPFIFO space.
    // Please notice that this will also get executed on AModel, which is not necessary at this point,
    // but if at any point we make AModel asynchronously, then this might be needed.
    if (channel != channelFlushUnit->channelHead) {
        if (!gpfifoHasFreeEntries(channelFlushUnit->channelHead->gpfifo, entryCountRequired)) {
            return NV_FALSE;
        }
    }
    return NV_TRUE;
}
/**
 * Return a pointer to the first method word of the pushbuffer region owned by
 * this gpfifo entry (entry->pushbuffer must be set).
 */
static CUnvCurrent*
gpfifoEntryGetPushbufferStart(CUgpfifoEntry *entry)
{
    uintptr_t base;
    CU_ASSERT(entry);
    // Host VA of the pushbuffer, displaced by this entry's offset within it.
    base = (uintptr_t)pushbufferGetHostVaddr(entry->pushbuffer);
    return (CUnvCurrent*)(base + (NvU32)entry->offset);
}
/**
 * Return the write cursor for this gpfifo entry: the first byte past the
 * 'length' bytes already written, where new methods can be appended.
 */
static CUnvCurrent*
gpfifoEntryGetPushbufferEnd(CUgpfifoEntry *entry)
{
    uintptr_t start;
    CU_ASSERT(entry);
    start = (uintptr_t)gpfifoEntryGetPushbufferStart(entry);
    return (CUnvCurrent*)(start + entry->length);
}
// Check whether the in-progress push can grow by 'additionalBytesRequested'
// bytes beyond the current cursor 'nvCurrent'. On success the entry's
// reservation is widened to cover the new total.
NvBool
channelCanContinueGPFIFO(CUnvchannel *channel, CUnvCurrent *nvCurrent, NvU32 additionalBytesRequested, FLAG_SET(CUIpushFlags) flags)
{
    CUgpfifo *gpfifo = channel->gpfifo;
    CUgpfifoEntry *entry = gpfifo->entries + gpfifo->cpuPut;
    CUpushbuffer *pushbuffer = gpfifoGetPushbuffer(gpfifo, flags);
    // Bytes already consumed in this entry = cursor minus the entry's start.
    NvU32 bytesUsed = (NvU32)((char*)nvCurrent - (char*)gpfifoEntryGetPushbufferStart(entry));
    NvU32 totalSize = additionalBytesRequested + bytesUsed;
    if (!pushbufferHasSpaceToContinueToTotalSize(pushbuffer, totalSize)) {
        return NV_FALSE;
    }
    // Widen the reservation to the new total before letting the caller continue.
    entry->spaceReserved = totalSize;
    return NV_TRUE;
}
// Begin (or continue) a push on 'channel', returning in *pnvCurrent the
// cursor where 'spaceRequested' bytes of methods may be written.
// spaceRequested == 0 means the caller supplies its own buffer (external
// pushbuffer submission), so no pushbuffer space is reserved.
void
channelGetPutPointerGPFIFO(CUnvchannel *channel, CUnvCurrent **pnvCurrent, NvU32 spaceRequested, FLAG_SET(CUIpushFlags) flags)
{
    CUgpfifo *gpfifo;
    CUgpfifoEntry *cpuPutEntry;
    CUpushbuffer *pushbuffer;
    NvBool startNewPushbuffer = NV_FALSE;
    CU_TRACE_FUNCTION();
    CU_ASSERT(channel && channel->gpfifo);
    // Get the current gpfifo entry.
    gpfifo = channel->gpfifo;
    cpuPutEntry = gpfifo->entries + gpfifo->cpuPut;
    // We should never run out of GPFIFO entries at this point CanAdvance won't return
    // true until there are a few spare "padding" entries.
    CU_ASSERT(gpfifoHasFreeEntries(gpfifo, CU_GPFIFO_ENTRY_PAD_COUNT));
    CU_ASSERT(gpfifoHasPushbufferSpace(gpfifo, spaceRequested, flags));
    startNewPushbuffer = !cpuPutEntry->pushbuffer;
    // If we are doing a raw submission we have to have our own pushbuffer,
    // so cut the current entry and move on to a fresh one.
    // NOTE(review): after this flush+advance, 'cpuPutEntry' still points at the
    // entry that was just flushed, while startNewPushbuffer stays NV_FALSE —
    // the code below then updates the old entry's reservation. Confirm this is
    // the intended behavior for the RAW path (callers may rely on a specific
    // spaceRequested pattern here).
    if (!startNewPushbuffer && (flags & CUI_PUSH_RAW))
    {
        gpfifoFlushGpfifoEntry(gpfifo);
        gpfifoAdvanceCpuPut(gpfifo);
    }
    CU_ASSERT(spaceRequested > 0 || (flags & CUI_PUSH_RAW));
    // Initialize the cpu put entry, if needed.
    if (startNewPushbuffer) {
        if (spaceRequested == 0) {
            // The caller has not asked us to allocate a pushbuffer for this entry
            // therefore we assume it will be provided by the user
            cpuPutEntry->offset = 0;
            cpuPutEntry->pushbuffer = NULL;
            cpuPutEntry->length = 0;
            cpuPutEntry->spaceReserved = 0;
            cpuPutEntry->trackSemValStart = 0;
            cpuPutEntry->trackSemValEnd = 0;
        }
        else {
            // Get pushbuffer.
            pushbuffer = gpfifoGetPushbuffer(gpfifo, flags);
            // Fill out new cpu put entry: reserve spaceRequested bytes starting
            // at the pushbuffer's current put position.
            cpuPutEntry->offset = pushbufferStartPush(pushbuffer, spaceRequested);
            cpuPutEntry->pushbuffer = pushbuffer;
            cpuPutEntry->length = 0;
            cpuPutEntry->spaceReserved = spaceRequested;
            cpuPutEntry->trackSemValStart = 0;
            cpuPutEntry->trackSemValEnd = 0;
        }
    }
    else {
        // We have reserved spaceRequested space + size of bytes we queued up in this entry already
        cpuPutEntry->spaceReserved = cpuPutEntry->length + spaceRequested;
    }
    // Get the current pushbuffer pointer (NULL when the caller brings its own buffer).
    if (pnvCurrent) {
        if (spaceRequested == 0) {
            *pnvCurrent = NULL;
        }
        else {
            *pnvCurrent = gpfifoEntryGetPushbufferEnd(cpuPutEntry);
        }
    }
}
// End a push on 'channel': record how many bytes were written up to the
// cursor 'nvCurrent', inform the pushbuffer, and tell the caller (via
// *needsFlush) whether the queued work should be flushed now — either
// because the entry reached the channel's queuing limit or because the
// pushbuffer is nearly full.
// Fix: removed the local 'maxCpuPutLength', which was computed but never used.
void
channelSetPutPointerGPFIFO(CUnvchannel* channel, CUnvCurrent* nvCurrent, NvBool *needsFlush)
{
    CUgpfifo *gpfifo;
    CUgpfifoEntry *cpuPutEntry;
    NvU32 length;
    CU_TRACE_FUNCTION();
    gpfifo = channel->gpfifo;
    // Compute the new CPU put length
    cpuPutEntry = gpfifo->entries + gpfifo->cpuPut;
    // If this entry is not associated to any pushbuffer space we have nothing to do but writing this entry
    if (!cpuPutEntry->pushbuffer) {
        gpfifoFlushGpfifoEntry(gpfifo);
        gpfifoAdvanceCpuPut(gpfifo);
        return;
    }
    // Bytes written in this entry so far = cursor minus entry start.
    length = (NvU32)((char*)nvCurrent - (char*)gpfifoEntryGetPushbufferStart(cpuPutEntry));
    // Inform the pushbuffer how much more space has been written to since the last channelGetPutPointer().
    pushbufferEndPush(cpuPutEntry->pushbuffer, length - cpuPutEntry->length);
    // Set the length of the current push.
    cpuPutEntry->length = length;
    // Report cpuPut, gpuPut as updated in GPU host
    // At this point if we made call back into the driver we could
    // replace pushbuffer with user defined pushbuffer.
    cuiToolsNotifyFifoPushbufferCreate(gpfifo, length, cpuPutEntry, channel);
    // verify we wrote a multiple of 4 bytes (methods are 32-bit words)
    CU_ASSERT(cpuPutEntry->length % sizeof(NvU32) == 0);
    // verify that we haven't written off the end of this pushbuffer allocation
    CU_ASSERT(cpuPutEntry->length <= cpuPutEntry->spaceReserved);
    // Decide whether the caller should flush now.
    {
        NvBool queuingLimitReached = cpuPutEntry->length >= channel->maxQueueLength;
        NvU64 pushbufferFreeSpace = pushbufferGetSize(cpuPutEntry->pushbuffer) - cpuPutEntry->pushbuffer->put;
        NvBool pushbufferTooFull = (pushbufferFreeSpace < CU_PUSHBUF_MAX_PUSH_SIZE_DEFAULT);
        *needsFlush = (queuingLimitReached || pushbufferTooFull);
    }
}
// Flush every channel in 'channelFlushUnit': cut the in-progress GPFIFO entry
// on each channel that has pending data, collect the un-flushed entry ranges
// into one CUgpfifoFlushItem, and hand the batch to the DM layer to advance
// the GPU Put. Returns CUDA_SUCCESS (early-out) when nothing needs flushing.
CUresult
channelFlushUnitFlushGPFIFO(CUchannelFlushUnit *channelFlushUnit)
{
    CUresult status = CUDA_SUCCESS;
    CUnvchannel* channel = NULL;
    CUgpfifoFlushItem flushItem;
    CUgpfifoEntry *cpuPutEntry;
    CUgpfifo *gpfifo;
    CU_TRACE_FUNCTION();
    CU_ASSERT(channelFlushUnit);
    flushItem.flushUnit = channelFlushUnit;
    flushItem.gpfifoCount = 0;
    flushItem.awakenAfterCompletion = NV_FALSE;
    // Iterate through all channels within the same channel flush unit
    // to cut a new GPFIFO entry with any work needed
    for (channel = channelFlushUnit->channelHead; channel; channel = channel->flushUnitNext) {
        NvU32 entryCount = 0;
        gpfifo = channel->gpfifo;
        // Compute new cpu put index.
        cpuPutEntry = gpfifo->entries + gpfifo->cpuPut;
        // Nothing buffered on this channel: skip it.
        if (cpuPutEntry->length == 0) {
            continue;
        }
        // Write out the current cpu put to the gpfifo.
        gpfifoFlushGpfifoEntry(gpfifo);
        // Advance the cpu put pointer.
        gpfifoAdvanceCpuPut(gpfifo);
        // Populate the flush structure with the entries to flush
        // (count handles wrap-around of the ring).
        entryCount = (gpfifo->cpuPut >= gpfifo->gpuPut) ?
            (gpfifo->cpuPut - gpfifo->gpuPut) :
            (gpfifo->config.entryCount - gpfifo->gpuPut + gpfifo->cpuPut);
        if (entryCount == 0) {
            continue;
        }
        // Populate the flush item
        flushItem.gpfifos[flushItem.gpfifoCount].gpfifo = gpfifo;
        flushItem.gpfifos[flushItem.gpfifoCount].firstEntry = gpfifo->gpuPut;
        flushItem.gpfifos[flushItem.gpfifoCount].entryCount = entryCount;
        flushItem.gpfifoCount += 1;
        // Track if an awaken is expected after the command buffer finishes
        if (channel->blockingSync.unflushedAwaken) {
            CU_ASSERT(!channel->blockingSync.useInterrupt);
            channel->blockingSync.unflushedAwaken = NV_FALSE;
            flushItem.awakenAfterCompletion = NV_TRUE;
        }
        // Mark all work as having been flushed
        trackingSemaFlushLastReleaseValue(&channel->trackingSemaphoreData);
        gpfifo->gpuPut = gpfifo->cpuPut;
    }
    // If we have nothing to flush, early-out
    if (flushItem.gpfifoCount == 0) {
        return CUDA_SUCCESS;
    }
    // If pushbuffer-dumping is enabled, write this 'flushitem' to the
    // pushbuffer dump file.
    if (globals.pbdump.enable) {
        pushbufferDumpWriteGpfifoFlushItem(globals.pbdump.file, &flushItem);
    }
    // The main goal of this callbacks is to provide time when we perform GPU Put update.
    // For those "rare" dmals that update GPU Put in shot for several pushbuffers this will
    // clearly report only last pushbuffer. But it still serves its purpose.
    // (gpfifo/cpuPutEntry intentionally carry the values of the last channel
    // visited by the loop above.)
    cuiToolsNotifyFifoGpuPutStart(gpfifo, cpuPutEntry);
    // Pass the flush structure down to the GPFIFO DM layer
    status = channelFlushUnit->channelHead->dmal.GpfifoAdvanceGpuPut(&flushItem);
    cuiToolsNotifyFifoGpuPutEnd(gpfifo, cpuPutEntry);
    return status;
}
// Interleave 'numExternalBuffers' externally-owned method buffers into the
// method stream contained in 'buffer'. Segment i of 'buffer' (delimited by
// spliceInOffsets[] / externalBufferSizes[]) is copied to the current
// pushbuffer; between segments the current submission is ended, a GPFIFO
// entry pointing at the external buffer is spliced in, and the submission
// is restarted. Returns the updated write cursor.
// Fixes: the debug print formerly ran before the last-segment break and read
// externalBufferSizes[numExternalBuffers] / externalBufferAddresses[...] out
// of bounds; it is now emitted only when an external buffer actually follows.
// Also '0x%llu' printed a decimal value with a hex prefix — now '0x%llx'.
CUnvCurrent*
gpfifoSpliceInExternalBuffers(
    CUnvCurrent* nvCurrent,
    CUnvchannel* channel,
    NvU32 numExternalBuffers,
    NvU32* spliceInOffsets,
    NvU32* buffer,
    NvU32 bufferSize,
    NvU64* externalBufferAddresses,
    NvU32* externalBufferSizes)
{
    NvU32 i = 0;
    // numExternalBuffers external buffers delimit numExternalBuffers + 1 segments.
    for (i = 0; i < numExternalBuffers + 1; i++) {
        NvU32 begin = (i == 0 ? 0 : spliceInOffsets[i - 1] + externalBufferSizes[i - 1]);
        NvU32 end = (i == numExternalBuffers ? bufferSize : spliceInOffsets[i]);
        if (begin >= end) {
            break;
        }
        // 1. Copy the segment [begin, end) of the method buffer to the current pushbuffer.
        memQuickCopyNontemporal(nvCurrent, buffer + (begin / sizeof(NvU32)), end - begin);
        nvCurrent += (end - begin) / sizeof(NvU32);
        if (i == numExternalBuffers) {
            // Final segment: no external buffer follows it.
            break;
        }
        CU_DEBUG_PRINT(("Splicing an %d bytes external buffer at address 0x%llx at offset %d\n",
                        externalBufferSizes[i], externalBufferAddresses[i], end));
        // End the current submission
        channelEndPushInternal_UnderLock(channel, nvCurrent, NV_FALSE, CUI_END_PUSH_RAW);
        // 2. Splice in a new gpfifo entry referencing the external buffer
        channelPushExternalPushbuffer_UnderLock(channel, NV_TRUE, externalBufferAddresses[i], externalBufferSizes[i]);
        // 3. Restart the submission
        channelBeginPushInternal_UnderLock(channel, &nvCurrent, CU_PUSHBUF_MAX_PUSH_SIZE_DEFAULT, CUI_PUSH_RAW);
    }
    return nvCurrent;
}
// Tools callback: report FIFO state after a CPU Put entry has been written
// to the GPFIFO. Compiled out (parameters voided) when tools are disabled.
static NV_INLINE void cuiToolsNotifyFifoCpuPutEnd(CUgpfifo *gpfifo, CUgpfifoEntry *cpuPutEntry)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)cpuPutEntry;
#else
    if (toolsCallbackEnabled( CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_CPU_PUT_END))
    {
        CUtoolsFifoCpuPutEnd Params = {0};
        Params.struct_size = sizeof(Params);
        Params.ctx = gpfifo->channel->channelManager->ctx;
        // Only resolve the host VA when the entry carries pushbuffer data.
        if (cpuPutEntry->length)
            Params.pushBuffer = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
        Params.pushBufferLength = cpuPutEntry->length;
        Params.channelID = gpfifo->channel->chID;
        Params.channelType = channelGetType(gpfifo->channel);
        // NOTE(review): unlike the "Start" callback this reports cpuPut directly
        // (no -1 modular adjustment) — confirm the asymmetry is intended.
        Params.cpuPut = gpfifo->cpuPut;
        Params.gpuPut = gpfifo->gpuPut;
        Params.gpuGet = gpfifo->gpuGet;
        Params.tsgGroup = gpfifo->channel->channelFlushUnit->tsgGroupID;
        toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_CPU_PUT_END, (void *)&Params);
    }
#endif
}
// Tools callback: report FIFO state just before a CPU Put entry is written
// to the GPFIFO. Compiled out (parameters voided) when tools are disabled.
static NV_INLINE void cuiToolsNotifyFifoCpuPutStart(CUgpfifo *gpfifo, CUgpfifoEntry *cpuPutEntry)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)cpuPutEntry;
#else
    if (toolsCallbackEnabled( CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_CPU_PUT_START))
    {
        CUtoolsFifoCpuPutStart Params = {0};
        Params.struct_size = sizeof(Params);
        Params.ctx = gpfifo->channel->channelManager->ctx;
        // Only resolve the host VA when the entry carries pushbuffer data.
        if (cpuPutEntry->length)
            Params.pushBuffer = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
        Params.pushBufferLength = cpuPutEntry->length;
        Params.channelID = gpfifo->channel->chID;
        Params.channelType = channelGetType(gpfifo->channel);
        // Report the previous ring slot (cpuPut - 1 modulo entryCount).
        Params.cpuPut = (gpfifo->cpuPut + gpfifo->config.entryCount - 1) % gpfifo->config.entryCount;
        Params.gpuPut = gpfifo->gpuPut;
        Params.gpuGet = gpfifo->gpuGet;
        Params.tsgGroup = gpfifo->channel->channelFlushUnit->tsgGroupID;
        toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_CPU_PUT_START, (void *)&Params);
    }
#endif
}
// Tools callback: report FIFO state after the GPU Put update completes.
// Compiled out (parameters voided) when tools are disabled.
// Fix: removed a stray double semicolon after the cpuPut assignment.
static NV_INLINE void cuiToolsNotifyFifoGpuPutEnd(CUgpfifo *gpfifo, CUgpfifoEntry *cpuPutEntry)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)cpuPutEntry;
#else
    if (toolsCallbackEnabled( CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_GPU_PUT_END))
    {
        CUtoolsFifoGpuPutEnd Params = {0};
        Params.struct_size = sizeof(Params);
        Params.ctx = gpfifo->channel->channelManager->ctx;
        // Only resolve the host VA when the entry carries pushbuffer data.
        if (cpuPutEntry->length)
            Params.pushBuffer = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
        Params.pushBufferLength = cpuPutEntry->length;
        Params.channelID = gpfifo->channel->chID;
        Params.channelType = channelGetType(gpfifo->channel);
        // Report the previous ring slot (cpuPut - 1 modulo entryCount).
        Params.cpuPut = (gpfifo->cpuPut + gpfifo->config.entryCount - 1) % gpfifo->config.entryCount;
        Params.gpuPut = gpfifo->gpuPut;
        Params.gpuGet = gpfifo->gpuGet;
        Params.tsgGroup = gpfifo->channel->channelFlushUnit->tsgGroupID;
        toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_GPU_PUT_END, (void *)&Params);
    }
#endif
}
// Tools callback: report FIFO state just before the GPU Put update is sent
// to the DM layer. Compiled out (parameters voided) when tools are disabled.
static NV_INLINE void cuiToolsNotifyFifoGpuPutStart(CUgpfifo *gpfifo, CUgpfifoEntry *cpuPutEntry)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)cpuPutEntry;
#else
    if (toolsCallbackEnabled( CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_GPU_PUT_START))
    {
        CUtoolsFifoGpuPutStart Params = {0};
        Params.struct_size = sizeof(Params);
        Params.ctx = gpfifo->channel->channelManager->ctx;
        // Only resolve the host VA when the entry carries pushbuffer data.
        if (cpuPutEntry->length)
            Params.pushBuffer = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
        Params.pushBufferLength = cpuPutEntry->length;
        Params.channelID = gpfifo->channel->chID;
        Params.channelType = channelGetType(gpfifo->channel);
        // Report the previous ring slot for both puts (index - 1 modulo entryCount).
        Params.cpuPut = (gpfifo->cpuPut + gpfifo->config.entryCount - 1) % gpfifo->config.entryCount;
        Params.gpuPut = (gpfifo->gpuPut + gpfifo->config.entryCount - 1) % gpfifo->config.entryCount;
        Params.gpuGet = gpfifo->gpuGet;
        Params.tsgGroup = gpfifo->channel->channelFlushUnit->tsgGroupID;
        toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_GPU_PUT_START, (void *)&Params);
    }
#endif
}
// Tools callback: report that 'length' bytes of methods were recorded into
// the current pushbuffer segment (called from channelSetPutPointerGPFIFO).
// Compiled out (parameters voided) when tools are disabled.
static NV_INLINE void cuiToolsNotifyFifoPushbufferCreate(CUgpfifo *gpfifo, NvU32 length, CUgpfifoEntry *cpuPutEntry, CUnvchannel *channel)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)length;
    (void)cpuPutEntry;
    (void)channel;
#else
    if (toolsCallbackEnabled( CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_PUSHBUFFER_CREATE))
    {
        CUtoolsFifoPushbufferCreate Params = {0};
        Params.struct_size = sizeof(Params);
        Params.ctx = gpfifo->channel->channelManager->ctx;
        // Only resolve the host VA when something was actually written.
        if (length)
            Params.pushBuffer = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
        Params.pushBufferLength = length;
        Params.channelID = gpfifo->channel->chID;
        Params.channelType = channelGetType(gpfifo->channel);
        // Report the previous ring slot (cpuPut - 1 modulo entryCount).
        Params.cpuPut = (gpfifo->cpuPut + gpfifo->config.entryCount - 1) % gpfifo->config.entryCount;
        Params.gpuPut = gpfifo->gpuPut;
        Params.gpuGet = gpfifo->gpuGet;
        Params.tsgGroup = channel->channelFlushUnit->tsgGroupID;
        toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_PUSHBUFFER_CREATE, (void *)&Params);
    }
#endif
}
238

被折叠的 条评论
为什么被折叠?



