CUDA系列-GPFIFO-5

gpfifo是一个基础组件,channel中的FIFO基础,channel就是通过它来作为CPU和GPU之间的通道以及marker的容器
可以说CUDA中GPU和CPU沟通的核心就要看这个部件。


overview

因为没有相关文档,直接看代码。核心是两个结构:fifo 和 fifoEntry。

// GPFIFO constants
#define CU_GPFIFO_ENTRY_COUNT_LARGE         1024
#define CU_GPFIFO_ENTRY_COUNT_SMALL         8       // Bug 555146. Depth of 8 makes a WDDM desktop usable-ish when Folding@Home on a 2SM GPU.
#define CU_GPFIFO_ENTRY_COUNT_MPS_SERVER    (CU_GPFIFO_ENTRY_COUNT_LARGE * 2)
#define CU_GPFIFO_SMALL_ENTRY_SM_THRESHOLD  4       // Bug 555146. Arbitrary threshold.
#define CU_GPFIFO_ENTRY_SIZE                0x8     // bytes per device-visible GPFIFO ring entry
#define CU_GPFIFO_ENTRY_PAD_COUNT           0x4     // spare entries required free before advancing (see channelCanAdvanceGPFIFO)

// Pushbuffer allocation size on SOC platforms (1 MiB).
#define CU_GPFIFO_PUSHBUF_ALLOC_SIZE_SOC             (1024*1024)

// GPFIFO types (struct definitions below)
typedef struct CUgpfifoConfig_st    CUgpfifoConfig;
typedef struct CUgpfifoEntry_st     CUgpfifoEntry;
typedef struct CUgpfifoFlushItem_st CUgpfifoFlushItem;
typedef struct CUgpfifo_st          CUgpfifo;

FIFO code

先看两个辅助struct

// Config descriptor passed to gpfifoCreate().
struct CUgpfifoConfig_st
{
    // Number of GPFIFO entries. This will either be CU_GPFIFO_ENTRY_COUNT_LARGE
    // or CU_GPFIFO_ENTRY_COUNT_SMALL.
    NvU32 entryCount;

    // Allocate an additional video memory pushbuffer and map it over BAR1.
    NvBool allocVidmemPushbuffer:1;

    // Put the gpfifo allocation in vidmem and map it over BAR1.
    NvBool vidmemGpfifo:1;

    // If the GPFIFO maintained by CUDA is not the GPFIFO used on the device
    // (which is the case for DMALs where we don't do the submission directly,
    // e.g. some WDDM configurations), doing the device mapping is useless.
    NvBool deviceMappedGpfifo:1;

    // Allow suballocation of the memblock used to back the GPFIFO. Some
    // UMDs need to reference the GPFIFO using the sole allocation handle,
    // which makes suballocation unsuitable.
    NvBool suballocateGpfifo:1;

    // Set to true if we should release a semaphore at the
    // start of each entry (in addition to the end of each entry)
    // - this is used to improve WDDM<->WDDM and WDDM<->TCC
    //   synchronization
    NvBool releaseSemaphoreAtEntryStart:1;

    // Size, in bytes, of the pushbuffer(s) to create.
    NvU32 pushbufferSize;
};

// GPFIFO flush structure: describes one batched flush — the set of GPFIFO
// entry ranges (one slot per channel in the flush unit) handed to the DMAL.
struct CUgpfifoFlushItem_st
{
    // The flush unit that all the GPFIFO entries belong to
    CUchannelFlushUnit *flushUnit;

    // The GPFIFO entry ranges to flush; gpfifoCount slots of 'gpfifos' are valid.
    NvU32 gpfifoCount;
    struct {
        CUgpfifo *gpfifo;
        NvU32 firstEntry;   // index of first entry to flush
        NvU32 entryCount;   // number of entries (range may wrap around the ring)
    } gpfifos[CU_CHANNEL_COUNT_MAX];

    // Should an interrupt be triggered after completion
    // of the command buffer?
    NvBool awakenAfterCompletion;

    // N.B.: this does *not* capture which CUmemblocks need to be paged
    // in for this flush
};

FIFO

对于FIFO有几个指针非常重要,gpuGet gpuPut cpuPut
另外其push buffer最后全部被entry使用到

// GPFIFO structure: per-channel ring through which the CPU hands pushbuffer
// segments (method streams) to the GPU.
struct CUgpfifo_st
{
    // back-pointer to the channel which owns this GPFIFO
    CUnvchannel  *channel;

    // configuration parameters (mostly set by driver model)
    CUgpfifoConfig config;

    // memory backing the device-visible GPFIFO ring
    CUmemobj *gpfifoMemobj;

    // pushbuffers that entries carve their method space out of
    CUpushbuffer *sysmemPushbuffer;
    CUpushbuffer *vidmemPushbuffer;

    // semaphore tracking gpuPut
    CUsema *sema;

    // host-side shadow array of entry bookkeeping;
    // number of entries is given by config.entryCount
    CUgpfifoEntry* entries;

    // first index the CPU hasn't completed writing to (this is the entry where we are pushing data)
    NvU32 cpuPut;

    // first index that the GPU hasn't had flushed down to it.  So if
    // gpuPut == cpuPut then all data except what is still being written
    // into the cpuPut entry has been flushed; if additionally the cpuPut
    // entry has length 0 then all data is down to the gpu.
    NvU32 gpuPut;

    // first index that the GPU hasn't finished reading yet (so if gpuGet == gpuPut then the GPU has
    // read everything pushed down to it)
    NvU32 gpuGet;

    // mechanism used to splice in segments of methods from mps clients
    NvBool mpsMethodsActive;        // an MPS client segment is pending splice-in
    NvU64 mpsMethodsDeviceVaddr;    // device VA of the client's method segment
    NvU32 mpsMethodsCount;          // length of the client's method segment
};

fifoEntry

它的pushbuffer其实就是FIFO上面的pushbuffer

// GPFIFO entry structure: host-side bookkeeping for one slot of the ring.
struct CUgpfifoEntry_st
{
    // pushbuffer responsible for this gpfifo entry (if set pushbufferDevAddr must be null)
    // This corresponds to sysmemPushbuffer/vidmemPushbuffer on the CUgpfifo:
    // to write a method into the channel we pick an entry, and the entry's
    // pushbuffer is one of those two pushbuffers.
    CUpushbuffer *pushbuffer;
    // Device side address of the pushbuffer (if set pushbuffer must be null)
    NvU64 pushbufferDevAddr;

    // Offset in pushbuffer
    NvU64 offset;

    // set via gpfifoSetSync. when true, GPFifo sync bit is set to "wait", forcing host
    // to finish processing PB segment N-1 before starting to fetch segment N.
    NvBool wait:1;

    // bytes written to this entry thus far
    NvU32 length;

    // bytes reserved for this push
    NvU32 spaceReserved;

    // the tracking semaphore value on the channel at the end of this GPFIFO entry.  once the channel
    // has progressed so that its gpu-finished tracking semaphore value is equal to or above this marker,
    // we know the GPFIFO entry is free.
    // N.B.: this means that we are not "done" with a GPFIFO entry until the GPU has finished
    // executing/consuming all methods/data pushed in this entry, (*not* that the methods/data have
    // been prefetched).  in particular, it may be that the GPU will read directly from the pushbuffer
    // data in a GPFIFO entry while executing this GPFIFO entry, so we cannot discard the data until the
    // GPU is finished processing the entry (see teslaSemaphoreReleaseCDHack for an example of this).
    NvU64 trackSemValEnd;
    NvU64 trackSemValStart;
};

有关创建函数

核心是计算gpfifoSize

// initialization and tear-down entry points (definitions below)
void gpfifoSetDefaultConfig(CUgpfifoConfig* config, CUctx *ctx, CUchannelType channelType);
CUresult gpfifoCreate(CUnvchannel* channel, CUgpfifoConfig* config);
void gpfifoDestroy(CUgpfifo* gpfifo);

/*
 * Compute the total number of bytes of GPFIFO backing storage needed
 * across all compute and async channels for the given memory descriptor.
 */
NvU64
gpfifoCalculateTotalSize(CUmemmgr* memmgr, CUmemdesc* memdesc)
{
    NvU32 computeChannels;
    NvU32 asyncChannels;
    NvU64 entriesPerChannel;
    NvU64 perChannelBytes;

    CU_ASSERT(memdesc->flags.type == CU_MEM_TYPE_GPFIFOBUFFER);

    // How many channels of each kind will need a GPFIFO.
    computeChannels = channelManagerCalculateComputeChannelCount(memmgr->ctx);
    asyncChannels = memmgr->ctx->device->state.asyncEngineCount * channelManagerCalculateAsyncChannelCount(memmgr->ctx);

    // Per-channel entry count from the DMAL, possibly overridden globally.
    entriesPerChannel = memmgr->ctx->device->dmal.getGpfifoEntryCountForChannel(memmgr->ctx);
    if (globals.gpfifoEntryCountSet) {
        entriesPerChannel = globals.gpfifoEntryCount;
    }

    // Round up to the alignment the CUDA memory allocator will use for gpfifo.
    perChannelBytes = ROUND_UP((entriesPerChannel * CU_GPFIFO_ENTRY_SIZE),
                               memmgr->device->hal.memblockGetHeapAlignment(memmgr, memdesc));

    return perChannelBytes * (computeChannels + asyncChannels);
}

/*
 * Fill in the default GPFIFO configuration for a channel.
 * All fields are zeroed first; only the defaults below are then set.
 */
void
gpfifoSetDefaultConfig(CUgpfifoConfig* config, CUctx *ctx, CUchannelType channelType)
{
    CU_TRACE_FUNCTION();

    memset(config, 0, sizeof(*config));

    // GPFIFO lives in vidmem only when the device supports it; the extra
    // vidmem pushbuffer is off by default.
    config->allocVidmemPushbuffer = NV_FALSE;
    config->vidmemGpfifo          = ctx->device->state.gpfifoInVidmemSupported;
    config->deviceMappedGpfifo    = NV_TRUE;
    config->suballocateGpfifo     = NV_TRUE;
}

/*
 * Create the GPFIFO for 'channel': allocate the host tracking structure and
 * the shadow entry array, the device-visible GPFIFO ring via the memory
 * manager, and the sysmem (and optionally vidmem) pushbuffer(s).
 * On any failure, everything allocated so far is torn down via goto cleanup.
 * Returns CUDA_SUCCESS, or CUDA_ERROR_OUT_OF_MEMORY / the failing status.
 */
CUresult
gpfifoCreate(CUnvchannel* channel, CUgpfifoConfig* config)
{
    CUctx* ctx;
    CUgpfifo* gpfifo;
    CUresult status;
    size_t entriesSize;
    CUmemdesc memdesc;

    CU_TRACE_FUNCTION();
    CU_ASSERT(config);
    CU_ASSERT(channel);
    CU_ASSERT(!channel->gpfifo);

    ctx = channel->channelManager->ctx;

    // Override entry count from the global setting, if requested.
    if (globals.gpfifoEntryCountSet) {
        config->entryCount = globals.gpfifoEntryCount;
    }

    // Force gpfifo location (environment override).
    if(globals.forceGpfifoAlloc != CUDA_FORCE_GPFIFO_DEFAULT) {
        config->vidmemGpfifo = (globals.forceGpfifoAlloc == CUDA_FORCE_GPFIFO_IN_VIDMEM);
        config->deviceMappedGpfifo = (globals.forceGpfifoAlloc == CUDA_FORCE_GPFIFO_IN_VIDMEM);
    }

    // Force pushbuffer location (environment override).
    if(globals.forcePushbufferAlloc != CUDA_FORCE_PUSHBUFFER_DEFAULT) {
        config->allocVidmemPushbuffer = (globals.forcePushbufferAlloc == CUDA_FORCE_PUSHBUFFER_IN_VIDMEM);
    }

    // Allocate the GPFIFO tracking structure.
    gpfifo = (CUgpfifo* )malloc(sizeof(CUgpfifo) );
    if (!gpfifo) {
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    memset(gpfifo, 0, sizeof(CUgpfifo) );
    gpfifo->channel = channel;
    gpfifo->config = *config;
    gpfifo->gpuGet = 0;
    gpfifo->gpuPut = 0;
    gpfifo->cpuPut = 0;

    // Allocate the host-side shadow array of GPFIFO entries.
    // Note the size computation: entryCount bookkeeping structs, distinct
    // from the device-visible ring allocated below.
    CU_TRACE_PRINT(("GPFIFO entry count: %u\n", config->entryCount));
    entriesSize = config->entryCount * sizeof(*gpfifo->entries);
    gpfifo->entries = (CUgpfifoEntry *)malloc(entriesSize);
    if (!gpfifo->entries) {
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    memset(gpfifo->entries, 0, entriesSize);

    // Allocate the device-visible GPFIFO ring.
    memset(&memdesc, 0x0, sizeof(memdesc));

    memdesc.pChannel = channel;
    if (config->vidmemGpfifo) {
        memdesc.flags.location        = CU_MEM_LOCATION_DEVICE;
        memdesc.flags.cacheHost       = CU_MEM_CACHE_HOST_DISABLED;
    }
    else {
        memdesc.flags.location        = CU_MEM_LOCATION_HOST;
        memdesc.flags.cacheHost       = CU_GPFIFO_AND_PUSHBUFFER_HOST_CACHE_TYPE;
    }
    memdesc.flags.owner           = CU_MEM_OWNER_DRIVER;
    memdesc.flags.type            = CU_MEM_TYPE_GPFIFOBUFFER;
    memdesc.flags.mapHost         = CU_MEM_MAP_HOST_VA;
    memdesc.flags.noSuballoc      = !config->suballocateGpfifo;

    if (config->deviceMappedGpfifo) {
        // GPFIFO must use 40-bit VA.
        memdesc.flags.mapDevice = CU_MEM_MAP_DEVICE_VA_FORCE_40_BIT;
    }
    else {
        memdesc.flags.mapDevice = CU_MEM_MAP_DEVICE_NONE;
        // If we are not going to map the GPFIFO on the device there is no need to disable or set specific caching attributes
        // so simply set the cache to enabled and override previous settings.
        memdesc.flags.cacheHost = CU_MEM_CACHE_HOST_ENABLED;
    }

    cuiPerformanceBegin("gpfifoMemobjAlloc", CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);
    status = memobjAlloc(
        ctx->memmgr,
        &memdesc,
        CU_GPFIFO_ENTRY_SIZE * config->entryCount,
        &gpfifo->gpfifoMemobj);
    cuiPerformanceEnd("gpfifoMemobjAlloc", CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);
    if (status != CUDA_SUCCESS) {
        CU_DEBUG_PRINT(("Unable to allocate GPFIFO\n"));
        goto Error;
    }


    // Allocate a pushbuffer in sysmem.
    status = pushbufferCreate(ctx, config->pushbufferSize, CU_PUSHBUFFER_IN_SYSMEM, &gpfifo->sysmemPushbuffer);
    if (status != CUDA_SUCCESS) {
        goto Error;
    }

    // Allocate a pushbuffer in vidmem, if configured.
    if (config->allocVidmemPushbuffer) {
        status = pushbufferCreate(ctx, config->pushbufferSize, CU_PUSHBUFFER_IN_VIDMEM, &gpfifo->vidmemPushbuffer);
        if (status != CUDA_SUCCESS) {
            goto Error;
        }
    }

    channel->gpfifo = gpfifo;
    return CUDA_SUCCESS;

Error:
    // Tear down in reverse allocation order; all pointers were zeroed by the
    // memset above, so unallocated resources are safely skipped.
    if (gpfifo) {
        if (gpfifo->vidmemPushbuffer) {
            pushbufferDestroy(gpfifo->vidmemPushbuffer);
        }
        if (gpfifo->sysmemPushbuffer) {
            pushbufferDestroy(gpfifo->sysmemPushbuffer);
        }
        if (gpfifo->gpfifoMemobj) {
            memobjFree(&gpfifo->gpfifoMemobj);
        }
        free(gpfifo->entries);
        free(gpfifo);
    }
    return status;
}

/*
 * Destroy a GPFIFO and all resources it owns (pushbuffers, ring memobj,
 * entry array). If the channel has no sticky error, asserts that all pushed
 * work has completed (gpuGet == gpuPut == cpuPut).
 */
void
gpfifoDestroy(CUgpfifo* gpfifo)
{
    CUctx* ctx = NULL;

    CU_TRACE_FUNCTION();
    CU_ASSERT(gpfifo);

    ctx = gpfifo->channel->channelManager->ctx;

    // if there weren't any errors on this channel, then assert that the channel
    // got through all of the work that was pushed.
    // However before doing that we need to be sure the channel tracking semaphore
    // has been fully initialized since we might hit this codepath during channel init
    if (gpfifo->channel->trackingSemaphoreData.semaphore) {
        gpfifoAdvanceGpuGet(gpfifo);
    }

    if (CUDA_SUCCESS == cuiCtxCheckError(ctx, CUI_CTX_CHECK_STICKY_ONLY)) {
        CU_ASSERT(gpfifo->gpuGet == gpfifo->gpuPut && gpfifo->gpuGet == gpfifo->cpuPut);
    }

    // Release resources in reverse allocation order (mirrors gpfifoCreate).
    if (gpfifo->vidmemPushbuffer) {
        pushbufferDestroy(gpfifo->vidmemPushbuffer);
    }
    if (gpfifo->sysmemPushbuffer) {
        pushbufferDestroy(gpfifo->sysmemPushbuffer);
    }
    if (gpfifo->gpfifoMemobj) {
        memobjFree(&gpfifo->gpfifoMemobj);
    }

    free(gpfifo->entries);
    free(gpfifo);
}

写数据

下面的代码是channel用来写数据的关键部分
CUSW_UNIT_CHANNEL implements a SW channel abstraction corresponding to HW channels for work submission. The CPU writes GPU methods (GPU commands) into the channel and GPU reads them from the channel. The CPU and GPU can be viewed as a producer-consumer pair talking to each other using a channel.

One channel acts as conduit of commands to one engine on the GPU (compute, memcpy, etc.). Methods submitted to a channel must be targeted for the engine that the channel is associated with. The channel’s type reflects this mapping. Multiple channels of same type are grouped into a channel pool as described later.

Each channel has a ring-buffer known as a pushbuffer. CPU writes GPU methods and data into the pushbuffer. The pointer at which data is written to by CPU is called as CPU put pointer (the tail of the pushbuffer). The pointer from which GPU reads is called as GPU get pointer (the head of the pushbuffer).

GPU methods written to a channel are taken up by GPU for execution in the order in which they were written. Work completion however may occur out of order for some types of channels. In addition, methods submitted in different channels may start executing in parallel.

GPU methods and data written to the channel’s pushbuffer may be buffered by the driver and sent to GPU in batches. Sending such buffered contents to GPU is termed as a channel flush. Clients of CUSW_UNIT_CHANNEL must explicitly ask the channel to flush its contents to ensure that the methods and data written to the channel will eventually be seen by the GPU. For instance, when busy-waiting on the CPU for a GPU action to take place (e.g. waiting for the completion of a kernel launch), the channel has to be flushed before starting the wait. Otherwise the GPU may never execute the action and the CPU thread waiting for it will deadlock.

/**
 * Given a set of push flags, return the pushbuffer object a new push
 * should be written into.
 */
CUpushbuffer*
gpfifoGetPushbuffer(CUgpfifo *gpfifo, FLAG_SET(CUIpushFlags) flags)
{
    CU_ASSERT(gpfifo);

    // Honor the sysmem preference only when the pushbuffer location has not
    // been forced via the environment variable; otherwise we must always use
    // pinned system memory pushbuffers.
    if ((flags & CUI_PUSH_PREFER_SYSMEM_PUSHBUFFER) != 0 &&
        globals.forcePushbufferAlloc == CUDA_FORCE_PUSHBUFFER_DEFAULT) {
        return gpfifo->sysmemPushbuffer;
    }

    // No preference (or forced): use the vidmem pushbuffer when one exists,
    // falling back to sysmem.
    return gpfifo->vidmemPushbuffer ? gpfifo->vidmemPushbuffer
                                    : gpfifo->sysmemPushbuffer;
}

/*
 * Check whether the pushbuffer selected by 'flags' has at least
 * 'spaceNeeded' bytes free. The GPU get pointer is advanced (which can be
 * expensive, as in gpfifoHasFreeEntries) only if the first check fails.
 */
static NvBool
gpfifoHasPushbufferSpace(CUgpfifo* gpfifo, NvU32 spaceNeeded, FLAG_SET(CUIpushFlags) flags)
{
    CUpushbuffer *pb;

    CU_TRACE_FUNCTION();
    CU_ASSERT(gpfifo);
    CU_ASSERT(gpfifo->cpuPut <= gpfifo->config.entryCount);
    CU_ASSERT(gpfifo->gpuPut <= gpfifo->config.entryCount);

    pb = gpfifoGetPushbuffer(gpfifo, flags);

    // Fast path: space is already available without touching the GPU get.
    if (pushbufferHasSpace(pb, spaceNeeded)) {
        return NV_TRUE;
    }

    // Slow path: reclaim completed entries, then retry once.
    gpfifoAdvanceGpuGet(gpfifo);
    if (pushbufferHasSpace(pb, spaceNeeded)) {
        return NV_TRUE;
    }

    return NV_FALSE;
}

/*
 * Write the current cpuPut entry out to the device-visible GPFIFO ring.
 * If an MPS method segment is pending, first splice in an extra GPFIFO
 * entry pointing at the client's methods, moving cpuPut forward by one.
 * Finally records the tracking-semaphore value at which this entry is done.
 */
static void
gpfifoFlushGpfifoEntry(CUgpfifo *gpfifo)
{
    NvU32 cpuPutIndex;
    CUgpfifoEntry *cpuPutEntry;
    CUctx *ctx;

    CU_TRACE_FUNCTION();
    CU_ASSERT(gpfifo);

    ctx = gpfifo->channel->channelManager->ctx;

    // Get the current put index.
    cpuPutIndex = gpfifo->cpuPut;
    cpuPutEntry = gpfifo->entries + cpuPutIndex;

    // Splice in additional gpfifo entry for mps if needed.
    // - move "cpuPutEntry" forward by one
    // - add the supplied methods in the space
    if (gpfifo->mpsMethodsActive) {
        // preCpuPutIndex is the GPFIFO index where we'll be pointing to the
        // spliced methods
        NvU32 preCpuPutIndex = cpuPutIndex;
        CUgpfifoEntry *preCpuPutEntry = cpuPutEntry;

        // Assert that CPU architecture is not ARMv7.
        // Mps is currently not supported for ARM
        CU_ASSERT(!cuiArchIsARMv7());

        // cpuPutIndex is the push that's currently being filled, move it
        // forward by one (wrapping at entryCount), and copy its old data over
        cpuPutIndex = (gpfifo->cpuPut + 1) % gpfifo->config.entryCount;
        cpuPutEntry = gpfifo->entries + cpuPutIndex;
        memcpy(cpuPutEntry, preCpuPutEntry, sizeof(CUgpfifoEntry));

        // mark that the "pre" entry will be completed when the original cpu put entry
        // is done (by setting its trackSemValEnd to the current value)
        preCpuPutEntry->offset = 0;
        preCpuPutEntry->pushbuffer = NULL;
        preCpuPutEntry->length = 0;
        preCpuPutEntry->trackSemValStart = 0;
        preCpuPutEntry->trackSemValEnd = trackingSemaGetLastReleaseValue(&gpfifo->channel->trackingSemaphoreData);

        // build the gpfifo entry bits based on the spliced entry
        ctx->device->hal.setGpfifoEntry(
            (NvU32 *)gpfifoEntryCpuPointer(gpfifo, preCpuPutIndex),
            gpfifo->mpsMethodsDeviceVaddr, 
            gpfifo->mpsMethodsCount, 
            globals.forceGpfifoWait);

        // clear the pending-splice state now that it has been consumed
        gpfifo->mpsMethodsActive = NV_FALSE;
        gpfifo->mpsMethodsDeviceVaddr = 0; 
        gpfifo->mpsMethodsCount = 0;

        // Update cpu put index
        gpfifo->cpuPut = cpuPutIndex;
    }

    // Report cpuPut, gpuPut as updated in GPU host
    cuiToolsNotifyFifoCpuPutStart(gpfifo, cpuPutEntry);
    // Write out cpu put to the gpfifo.
    CU_ASSERT(cpuPutEntry->pushbuffer || cpuPutEntry->pushbufferDevAddr);
    ctx->device->hal.setGpfifoEntry(
        (NvU32*)gpfifoEntryCpuPointer(gpfifo, cpuPutIndex),
        (cpuPutEntry->pushbuffer ? pushbufferGetDeviceVaddr(cpuPutEntry->pushbuffer) : cpuPutEntry->pushbufferDevAddr) + cpuPutEntry->offset,
        cpuPutEntry->length,
        globals.forceGpfifoWait || cpuPutEntry->wait);

    // Report cpuPut, gpuPut as updated in GPU host
    cuiToolsNotifyFifoCpuPutEnd(gpfifo, cpuPutEntry);

    // Get the tracking semaphore value at which this gpfifo entry will be complete.
    cpuPutEntry->trackSemValEnd = trackingSemaGetLastReleaseValue(&gpfifo->channel->trackingSemaphoreData);
    CU_ASSERT(cpuPutEntry->trackSemValStart <= cpuPutEntry->trackSemValEnd);
}

/*
 * Advance the CPU put index by one slot (wrapping at entryCount) and clear
 * the bookkeeping of the entry that becomes the new push target.
 */
static void
gpfifoAdvanceCpuPut(CUgpfifo* gpfifo)
{
    NvU32 nextIndex;
    CUgpfifoEntry *nextEntry;

    CU_TRACE_FUNCTION();

    nextIndex = (gpfifo->cpuPut + 1) % gpfifo->config.entryCount;
    nextEntry = gpfifo->entries + nextIndex;

    gpfifo->cpuPut = nextIndex;

    // The fresh entry starts out with no pushbuffer, zero length/reservation.
    memset(nextEntry, 0, sizeof(*nextEntry));
}

/*
 * Advance gpuGet past every entry the GPU has finished consuming, according
 * to the channel's tracking semaphore, and release the corresponding
 * pushbuffer space in one shot by setting the pushbuffer 'get' directly.
 */
void
gpfifoAdvanceGpuGet(CUgpfifo* gpfifo)
{
    // Read the tracking sema value once and use it for the rest of the function
    CUtrackingSemaData *semaData = &gpfifo->channel->trackingSemaphoreData;
    NvU64 trackingSemaValue =
        trackingSemaDataUpdateAndGetLastCompleted(semaData, trackingSemaGetLastCompletedValue(semaData));

    CUgpfifoEntry* entry = NULL;
    CUgpfifoEntry* lastCompletedEntry = NULL;
    NvU32 start = gpfifo->gpuGet;
    NvU32 end = gpfifo->gpuPut;

    CU_TRACE_FUNCTION();

    // Quick check to see if we have wrap-around. We can then choose the wrap-around
    // point as a midpoint and eliminate a bunch of entries (either to the left or to the right)
    if (end < start) {
        entry = gpfifo->entries + (gpfifo->config.entryCount - 1);
        // If its not complete, we don't need to see past this point (into the wrap-around)
        if (!trackingSemaValueIsComplete(trackingSemaValue, entry->trackSemValEnd)) {
            end = gpfifo->config.entryCount - 1;
        }
        else {
            // Otherwise its complete, we can discard everything before and start at the wrap-around point
            lastCompletedEntry = entry;
            start = 0;
        }
    }

    // Look to see which entries are completed
    // If entry 'i + 1' is complete, this implies entry 'i' is complete
    while (start < end) {
        entry = gpfifo->entries + start;
        if (!trackingSemaValueIsComplete(trackingSemaValue, entry->trackSemValEnd)) {
            break;
        }
        lastCompletedEntry = entry;
        start++;
    }

    // Update pushbuffer at once (by setting 'get' directly) instead of entry by entry
    if (lastCompletedEntry && lastCompletedEntry->pushbuffer) {
        CU_ASSERT(lastCompletedEntry->trackSemValEnd > 0);
        CU_ASSERT(trackingSemaHasCompletedValue(semaData, lastCompletedEntry->trackSemValEnd));
        // MPS server can push entries without a PB associated to track client submissions
        // These entries are spliced in and paired with the actual client submission,
        // 'paired' meaning they both have the same tracking sema release value.
        // So if one is done, the other is as well, therefore we should never have a case where the
        // entry doesn't have a pushbuffer, since the one WITH the pushbuffer (server-side) would be done as well.
        pushbufferSetGet(lastCompletedEntry->pushbuffer, (NvU32)(lastCompletedEntry->offset + lastCompletedEntry->length));
        gpfifo->gpuGet = start;
    }
}

/*
 *
 * GPFIFO and pushbuffer offset and pointer accessors
 *
 */
/*
 * Device virtual address of GPFIFO ring slot 'entry'.
 */
NvU64
gpfifoEntryOffset(CUgpfifo* gpfifo, NvU32 entry)
{
    NvU64 ringBase = memobjGetDeviceVaddr(gpfifo->gpfifoMemobj);
    return ringBase + entry * CU_GPFIFO_ENTRY_SIZE;
}

/*
 * Host (CPU) pointer to GPFIFO ring slot 'entry'.
 */
void*
gpfifoEntryCpuPointer(CUgpfifo* gpfifo, NvU32 entry)
{
    char* ringBase = (char*)memobjGetHostPtr(gpfifo->gpfifoMemobj);
    return (void*)(ringBase + entry * CU_GPFIFO_ENTRY_SIZE);
}

/*
 *
 * channel DMAL methods for DMs that use GPFIFOs
 *
 */

/*
 * Check whether 'channel' can accept a push of 'spaceRequested' bytes:
 * the selected pushbuffer must have room, this channel's GPFIFO must have
 * a few spare entries, and for a TSG slave channel the master channel's
 * GPFIFO must have spare entries too (a join is pushed there on flush).
 * Must not be called while a push is active.
 */
NvBool
channelCanAdvanceGPFIFO(CUnvchannel *channel, NvU32 spaceRequested, FLAG_SET(CUIpushFlags) flags)
{
    CUchannelFlushUnit *channelFlushUnit = channel->channelFlushUnit;
    CUgpfifo *gpfifo = channel->gpfifo;
    NvU32 entryCountRequired = CU_GPFIFO_ENTRY_PAD_COUNT;

    CU_TRACE_FUNCTION();

    // This can't be queried during a push
    CU_ASSERT(!channel->channelManager->currentPush.active);

    // Verify that we have spaceRequested bytes available to write.
    if (!gpfifoHasPushbufferSpace(gpfifo, spaceRequested, flags)) {
        return NV_FALSE;
    }

    // Verify that we have a few GPFIFO entries to cut as well.
    if (!gpfifoHasFreeEntries(gpfifo, entryCountRequired)) {
        return NV_FALSE;
    }

    // If this is a slave channel in a TSG, it also needs to verify that the master
    // channel, to which we will have to push a join when this channel flushes,
    // has enough GPFIFO space.
    // Please notice that this will also get executed on AModel, which is not necessary
    // at this point, but if at any point we make AModel asynchronous, then this might
    // be needed.
    if (channel != channelFlushUnit->channelHead) {
        if (!gpfifoHasFreeEntries(channelFlushUnit->channelHead->gpfifo, entryCountRequired)) {
            return NV_FALSE;
        }
    }

    return NV_TRUE;
}

/**
 * Return the CPU pointer to the first byte of pushbuffer space owned by
 * this GPFIFO entry.
 */
static CUnvCurrent*
gpfifoEntryGetPushbufferStart(CUgpfifoEntry *entry)
{
    uintptr_t base;

    CU_ASSERT(entry);
    base = (uintptr_t)pushbufferGetHostVaddr(entry->pushbuffer);
    return (CUnvCurrent*)(base + (NvU32)entry->offset);
}

/**
 * Return the CPU pointer just past the methods already written to this
 * GPFIFO entry, i.e. where the next method should be written.
 */
static CUnvCurrent*
gpfifoEntryGetPushbufferEnd(CUgpfifoEntry *entry)
{
    char *start;

    CU_ASSERT(entry);
    start = (char*)gpfifoEntryGetPushbufferStart(entry);
    return (CUnvCurrent*)(start + entry->length);
}

/*
 * Check whether the currently-open push can be extended by
 * 'additionalBytesRequested' bytes; on success, record the new total
 * reservation on the current cpuPut entry.
 */
NvBool
channelCanContinueGPFIFO(CUnvchannel *channel, CUnvCurrent *nvCurrent, NvU32 additionalBytesRequested, FLAG_SET(CUIpushFlags) flags)
{
    CUgpfifo *gpfifo = channel->gpfifo;
    CUgpfifoEntry *entry = gpfifo->entries + gpfifo->cpuPut;
    CUpushbuffer *pushbuffer = gpfifoGetPushbuffer(gpfifo, flags);

    // Bytes already written into this entry so far.
    NvU32 bytesUsed = (NvU32)((char*)nvCurrent - (char*)gpfifoEntryGetPushbufferStart(entry));
    NvU32 totalNeeded = additionalBytesRequested + bytesUsed;

    NvBool canContinue = pushbufferHasSpaceToContinueToTotalSize(pushbuffer, totalNeeded);
    if (canContinue) {
        entry->spaceReserved = totalNeeded;
    }
    return canContinue;
}

/*
 * Begin (or continue) a push on 'channel': reserve 'spaceRequested' bytes in
 * the appropriate pushbuffer for the current cpuPut entry and return, via
 * *pnvCurrent, the pointer where the caller should write methods.
 * spaceRequested == 0 means the caller will supply its own pushbuffer
 * (entry gets no pushbuffer; *pnvCurrent is set to NULL).
 */
void
channelGetPutPointerGPFIFO(CUnvchannel *channel, CUnvCurrent **pnvCurrent, NvU32 spaceRequested, FLAG_SET(CUIpushFlags) flags)
{
    CUgpfifo *gpfifo;
    CUgpfifoEntry *cpuPutEntry;
    CUpushbuffer *pushbuffer;
    NvBool startNewPushbuffer = NV_FALSE;

    CU_TRACE_FUNCTION();
    CU_ASSERT(channel && channel->gpfifo);

    // Get the current gpfifo entry.
    gpfifo = channel->gpfifo;
    cpuPutEntry = gpfifo->entries + gpfifo->cpuPut;

    // We should never run out of GPFIFO entries at this point; CanAdvance won't return
    // true until there are a few spare "padding" entries.
    CU_ASSERT(gpfifoHasFreeEntries(gpfifo, CU_GPFIFO_ENTRY_PAD_COUNT));
    CU_ASSERT(gpfifoHasPushbufferSpace(gpfifo, spaceRequested, flags));

    startNewPushbuffer = !cpuPutEntry->pushbuffer;

    // If we are doing a raw submission we have to have our own pushbuffer,
    // so close out the entry that is currently open.
    // NOTE(review): after the advance below, cpuPutEntry still points at the
    // pre-advance entry and startNewPushbuffer was computed before the
    // advance — confirm that the subsequent use of the stale entry on this
    // path is intended.
    if (!startNewPushbuffer && (flags & CUI_PUSH_RAW))
    {
        gpfifoFlushGpfifoEntry(gpfifo);
        gpfifoAdvanceCpuPut(gpfifo);
    }

    CU_ASSERT(spaceRequested > 0 || (flags & CUI_PUSH_RAW));

    // Initialize the cpu put entry, if needed.
    if (startNewPushbuffer) {

        if (spaceRequested == 0) {
            // The caller has not asked us to allocate a pushbuffer for this entry
            // therefore we assume it will be provided by the user
            cpuPutEntry->offset = 0;
            cpuPutEntry->pushbuffer = NULL;
            cpuPutEntry->length = 0;
            cpuPutEntry->spaceReserved = 0;
            cpuPutEntry->trackSemValStart = 0;
            cpuPutEntry->trackSemValEnd = 0;
        }
        else {
            // Get pushbuffer.
            pushbuffer = gpfifoGetPushbuffer(gpfifo, flags);

            // Fill out new cpu put entry.
            cpuPutEntry->offset = pushbufferStartPush(pushbuffer, spaceRequested);
            cpuPutEntry->pushbuffer = pushbuffer;
            cpuPutEntry->length = 0;
            cpuPutEntry->spaceReserved = spaceRequested;
            cpuPutEntry->trackSemValStart = 0;
            cpuPutEntry->trackSemValEnd = 0;
        }
    }
    else {
        // We have reserved spaceRequested space + size of bytes we queued up in this entry already
        cpuPutEntry->spaceReserved = cpuPutEntry->length + spaceRequested;
    }


    // Get the current pushbuffer pointer.
    if (pnvCurrent) {
        if (spaceRequested == 0) {
            *pnvCurrent = NULL;
        }
        else {
            *pnvCurrent = gpfifoEntryGetPushbufferEnd(cpuPutEntry);
        }
    }
}

/*
 * Close out the bytes written since the last channelGetPutPointerGPFIFO():
 * record the push length on the current cpuPut entry, inform the
 * pushbuffer, and tell the caller (via *needsFlush) whether the channel
 * should be flushed now.
 */
void
channelSetPutPointerGPFIFO(CUnvchannel* channel, CUnvCurrent* nvCurrent, NvBool *needsFlush)
{
    CUgpfifo *gpfifo;
    CUgpfifoEntry *cpuPutEntry;
    NvU32 length;

    CU_TRACE_FUNCTION();
    gpfifo = channel->gpfifo;


    // Locate the entry currently being written.
    cpuPutEntry = gpfifo->entries + gpfifo->cpuPut;

    // If this entry is not associated with any pushbuffer space (the caller
    // supplied its own pushbuffer) we have nothing to do but write the entry out.
    // NOTE(review): *needsFlush is not written on this path — presumably
    // callers initialize it before calling; confirm.
    if (!cpuPutEntry->pushbuffer) {
        gpfifoFlushGpfifoEntry(gpfifo);
        gpfifoAdvanceCpuPut(gpfifo);
        return;
    }

    length = (NvU32)((char*)nvCurrent - (char*)gpfifoEntryGetPushbufferStart(cpuPutEntry));

    // Inform the pushbuffer how much more space has been written to since the last channelGetPutPointer().
    pushbufferEndPush(cpuPutEntry->pushbuffer, length - cpuPutEntry->length);

    // Set the length of the current push.
    cpuPutEntry->length = length;

    // Report cpuPut, gpuPut as updated in GPU host.
    // At this point, if we made a call back into the driver, we could
    // replace the pushbuffer with a user-defined pushbuffer.
    cuiToolsNotifyFifoPushbufferCreate(gpfifo, length, cpuPutEntry, channel);

    // verify we wrote a multiple of 4 bytes
    CU_ASSERT(cpuPutEntry->length % sizeof(NvU32) == 0);

    // verify that we haven't written off the end of this pushbuffer allocation
    CU_ASSERT(cpuPutEntry->length <= cpuPutEntry->spaceReserved);

    // If we're allowed to queue this, do so.
    {
        NvBool queuingLimitReached  = cpuPutEntry->length >= channel->maxQueueLength;
        NvU64  pushbufferFreeSpace  = pushbufferGetSize(cpuPutEntry->pushbuffer) - cpuPutEntry->pushbuffer->put;
        NvBool pushbufferTooFull    = (pushbufferFreeSpace < CU_PUSHBUF_MAX_PUSH_SIZE_DEFAULT);

        *needsFlush = (queuingLimitReached || pushbufferTooFull);
    }
}

/*
 * Flush every channel in 'channelFlushUnit': close and write out each
 * channel's pending cpuPut entry, collect the entry ranges into a single
 * CUgpfifoFlushItem, and hand it to the DMAL to advance the GPU put.
 * Returns CUDA_SUCCESS (including the nothing-to-flush case) or the DMAL status.
 */
CUresult
channelFlushUnitFlushGPFIFO(CUchannelFlushUnit *channelFlushUnit)
{
    CUresult status = CUDA_SUCCESS;
    CUnvchannel* channel = NULL;
    CUgpfifoFlushItem flushItem;
    CUgpfifoEntry *cpuPutEntry;
    CUgpfifo *gpfifo;

    CU_TRACE_FUNCTION();
    CU_ASSERT(channelFlushUnit);

    flushItem.flushUnit = channelFlushUnit;
    flushItem.gpfifoCount = 0;
    flushItem.awakenAfterCompletion = NV_FALSE;

    // Iterate through all channels within the same channel flush unit
    // to cut a new GPFIFO entry with any work needed
    for (channel = channelFlushUnit->channelHead; channel; channel = channel->flushUnitNext) {
        NvU32 entryCount = 0;
        gpfifo = channel->gpfifo;

        // Skip channels with no pending data in the current entry.
        cpuPutEntry = gpfifo->entries + gpfifo->cpuPut;
        if (cpuPutEntry->length == 0) {
            continue;
        }

        // Write out the current cpu put to the gpfifo.
        gpfifoFlushGpfifoEntry(gpfifo);

        // Advance the cpu put pointer.
        gpfifoAdvanceCpuPut(gpfifo);

        // Number of entries between gpuPut and cpuPut, accounting for
        // wrap-around of the ring.
        entryCount = (gpfifo->cpuPut >= gpfifo->gpuPut) ?
                     (gpfifo->cpuPut - gpfifo->gpuPut) :
                     (gpfifo->config.entryCount - gpfifo->gpuPut + gpfifo->cpuPut);
        if (entryCount == 0) {
            continue;
        }

        // Populate the flush item
        flushItem.gpfifos[flushItem.gpfifoCount].gpfifo = gpfifo;
        flushItem.gpfifos[flushItem.gpfifoCount].firstEntry = gpfifo->gpuPut;
        flushItem.gpfifos[flushItem.gpfifoCount].entryCount = entryCount;
        flushItem.gpfifoCount += 1;

        // Track if an awaken is expected after the command buffer finishes
        if (channel->blockingSync.unflushedAwaken) {
            CU_ASSERT(!channel->blockingSync.useInterrupt);
            channel->blockingSync.unflushedAwaken = NV_FALSE;
            flushItem.awakenAfterCompletion = NV_TRUE;
        }

        // Mark all work as having been flushed
        trackingSemaFlushLastReleaseValue(&channel->trackingSemaphoreData);
        gpfifo->gpuPut = gpfifo->cpuPut;

    }

    // If we have nothing to flush, early-out
    if (flushItem.gpfifoCount == 0) {
        return CUDA_SUCCESS;
    }

    // If pushbuffer-dumping is enabled, write this 'flushitem' to the
    // pushbuffer dump file.
    if (globals.pbdump.enable) {
        pushbufferDumpWriteGpfifoFlushItem(globals.pbdump.file, &flushItem);
    }

    // The main goal of these callbacks is to provide the time when we perform the
    // GPU Put update. For those "rare" dmals that update GPU Put in one shot for
    // several pushbuffers this will clearly report only the last pushbuffer
    // (gpfifo/cpuPutEntry hold whatever the last loop iteration left in them),
    // but it still serves its purpose.
    cuiToolsNotifyFifoGpuPutStart(gpfifo, cpuPutEntry);
    // Pass the flush structure down to the GPFIFO DM layer
    status = channelFlushUnit->channelHead->dmal.GpfifoAdvanceGpuPut(&flushItem);
    cuiToolsNotifyFifoGpuPutEnd(gpfifo, cpuPutEntry);

    return status;
}

/*
 * Copy method data from 'buffer' into the current push, splicing in
 * 'numExternalBuffers' externally-owned pushbuffer segments at the offsets
 * given by 'spliceInOffsets'. Segment i of the local buffer runs from the
 * end of the previous splice point up to spliceInOffsets[i] (or to
 * bufferSize for the final tail segment). Returns the updated write pointer.
 *
 * Fixes: the debug print used to index externalBufferSizes[] and
 * externalBufferAddresses[] unconditionally, reading one element past the
 * arrays on the tail iteration (i == numExternalBuffers); it also used the
 * invalid conversion "0x%llu" (now %llx) and %d for unsigned values.
 */
CUnvCurrent*
gpfifoSpliceInExternalBuffers(
    CUnvCurrent* nvCurrent,
    CUnvchannel* channel,
    NvU32  numExternalBuffers,
    NvU32* spliceInOffsets,
    NvU32* buffer,
    NvU32 bufferSize,
    NvU64* externalBufferAddresses,
    NvU32* externalBufferSizes)
{
    NvU32 i = 0;
    for (i = 0; i < numExternalBuffers + 1; i++) {
        NvU32 begin = (i == 0 ? 0 : spliceInOffsets[i - 1] + externalBufferSizes[i - 1]);
        NvU32 end = (i == numExternalBuffers ? bufferSize : spliceInOffsets[i]);
        if (begin >= end) {
            break;
        }

        // Only the first numExternalBuffers iterations have an external
        // buffer to splice in; guard the trace so the tail iteration does
        // not read past the end of the arrays.
        if (i < numExternalBuffers) {
            CU_DEBUG_PRINT(("Splicing a %u bytes external buffer at address 0x%llx at offset %u\n",
                             externalBufferSizes[i], externalBufferAddresses[i], end));
        }

        // 1. Copy the buffer containing the methods to the current pushbuffer from "begin" to "end"
        memQuickCopyNontemporal(nvCurrent, buffer + (begin / sizeof(NvU32)), end - begin);
        nvCurrent += (end - begin) / sizeof(NvU32);
        if (i == numExternalBuffers) {
            break;
        }
        // End the current submission
        channelEndPushInternal_UnderLock(channel, nvCurrent, NV_FALSE, CUI_END_PUSH_RAW);

        // 2. Splice in a new gpfifo entry
        channelPushExternalPushbuffer_UnderLock(channel, NV_TRUE, externalBufferAddresses[i], externalBufferSizes[i]);

        // 3. Restart the submission
        channelBeginPushInternal_UnderLock(channel, &nvCurrent, CU_PUSHBUF_MAX_PUSH_SIZE_DEFAULT, CUI_PUSH_RAW);
    }

    return nvCurrent;
}

// Tools-interception hook: if a tool subscribed to FIFO_CPU_PUT_END, issue
// the callback with a snapshot of the channel identity and the gpfifo
// put/get state taken after the CPU put update. No-op when tools support is
// compiled out.
static NV_INLINE void cuiToolsNotifyFifoCpuPutEnd(CUgpfifo *gpfifo, CUgpfifoEntry *cpuPutEntry)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)cpuPutEntry;
#else
    if (!toolsCallbackEnabled(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_CPU_PUT_END))
    {
        return;
    }

    {
        CUnvchannel *channel = gpfifo->channel;
        CUtoolsFifoCpuPutEnd params = {0};

        params.struct_size = sizeof(params);
        params.ctx         = channel->channelManager->ctx;
        // Only resolve a host VA when the entry actually carries methods.
        if (cpuPutEntry->length)
        {
            params.pushBuffer = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
        }
        params.pushBufferLength = cpuPutEntry->length;
        params.channelID        = channel->chID;
        params.channelType      = channelGetType(channel);
        params.cpuPut           = gpfifo->cpuPut;
        params.gpuPut           = gpfifo->gpuPut;
        params.gpuGet           = gpfifo->gpuGet;
        params.tsgGroup         = channel->channelFlushUnit->tsgGroupID;

        toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_CPU_PUT_END, (void *)&params);
    }
#endif
}

// Tools-interception hook: if a tool subscribed to FIFO_CPU_PUT_START,
// issue the callback with the channel identity and gpfifo state. cpuPut is
// reported as the previous entry index (current - 1, modulo the ring size).
// No-op when tools support is compiled out.
static NV_INLINE void cuiToolsNotifyFifoCpuPutStart(CUgpfifo *gpfifo, CUgpfifoEntry *cpuPutEntry)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)cpuPutEntry;
#else
    if (!toolsCallbackEnabled(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_CPU_PUT_START))
    {
        return;
    }

    {
        CUnvchannel *channel = gpfifo->channel;
        NvU32 entryCount = gpfifo->config.entryCount;
        CUtoolsFifoCpuPutStart params = {0};

        params.struct_size = sizeof(params);
        params.ctx         = channel->channelManager->ctx;
        // Only resolve a host VA when the entry actually carries methods.
        if (cpuPutEntry->length)
        {
            params.pushBuffer = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
        }
        params.pushBufferLength = cpuPutEntry->length;
        params.channelID        = channel->chID;
        params.channelType      = channelGetType(channel);
        // Previous ring slot: (cpuPut - 1) mod entryCount, written without
        // relying on signed wraparound.
        params.cpuPut           = (gpfifo->cpuPut + entryCount - 1) % entryCount;
        params.gpuPut           = gpfifo->gpuPut;
        params.gpuGet           = gpfifo->gpuGet;
        params.tsgGroup         = channel->channelFlushUnit->tsgGroupID;

        toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_CPU_PUT_START, (void *)&params);
    }
#endif
}

// Tools-interception hook: if a tool subscribed to FIFO_GPU_PUT_END, issue
// the callback with the channel identity and gpfifo state after the GPU put
// update. cpuPut is reported as the previous entry index (current - 1,
// modulo the ring size). No-op when tools support is compiled out.
static NV_INLINE void cuiToolsNotifyFifoGpuPutEnd(CUgpfifo *gpfifo, CUgpfifoEntry *cpuPutEntry)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)cpuPutEntry;
#else
    if (toolsCallbackEnabled( CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_GPU_PUT_END))
    {
       CUtoolsFifoGpuPutEnd Params = {0};
       Params.struct_size = sizeof(Params);
       Params.ctx         = gpfifo->channel->channelManager->ctx;
       // Only resolve a host VA when the entry actually carries methods.
       if (cpuPutEntry->length)
          Params.pushBuffer  = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
       Params.pushBufferLength = cpuPutEntry->length;
       Params.channelID   = gpfifo->channel->chID;
       Params.channelType = channelGetType(gpfifo->channel);
       // Previous ring slot: (cpuPut - 1) mod entryCount. Fixed: dropped the
       // stray second semicolon present in the original line.
       Params.cpuPut      = (gpfifo->cpuPut + gpfifo->config.entryCount - 1) % gpfifo->config.entryCount;
       Params.gpuPut      = gpfifo->gpuPut;
       Params.gpuGet      = gpfifo->gpuGet;
       Params.tsgGroup    = gpfifo->channel->channelFlushUnit->tsgGroupID;
       toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_GPU_PUT_END, (void *)&Params);
    }
#endif
}

// Tools-interception hook: if a tool subscribed to FIFO_GPU_PUT_START,
// issue the callback with the channel identity and gpfifo state. Both
// cpuPut and gpuPut are reported as the previous entry index (current - 1,
// modulo the ring size). No-op when tools support is compiled out.
static NV_INLINE void cuiToolsNotifyFifoGpuPutStart(CUgpfifo *gpfifo, CUgpfifoEntry *cpuPutEntry)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)cpuPutEntry;
#else
    if (!toolsCallbackEnabled(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_GPU_PUT_START))
    {
        return;
    }

    {
        CUnvchannel *channel = gpfifo->channel;
        NvU32 entryCount = gpfifo->config.entryCount;
        CUtoolsFifoGpuPutStart params = {0};

        params.struct_size = sizeof(params);
        params.ctx         = channel->channelManager->ctx;
        // Only resolve a host VA when the entry actually carries methods.
        if (cpuPutEntry->length)
        {
            params.pushBuffer = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
        }
        params.pushBufferLength = cpuPutEntry->length;
        params.channelID        = channel->chID;
        params.channelType      = channelGetType(channel);
        // Previous ring slots: (index - 1) mod entryCount, written without
        // relying on signed wraparound.
        params.cpuPut           = (gpfifo->cpuPut + entryCount - 1) % entryCount;
        params.gpuPut           = (gpfifo->gpuPut + entryCount - 1) % entryCount;
        params.gpuGet           = gpfifo->gpuGet;
        params.tsgGroup         = channel->channelFlushUnit->tsgGroupID;

        toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_GPU_PUT_START, (void *)&params);
    }
#endif
}

// Tools-interception hook: if a tool subscribed to FIFO_PUSHBUFFER_CREATE,
// issue the callback describing a newly created pushbuffer region of
// "length" bytes. cpuPut is reported as the previous entry index
// (current - 1, modulo the ring size). No-op when tools support is
// compiled out.
// NOTE(review): channel identity fields are read from gpfifo->channel while
// tsgGroup is read from the explicit "channel" parameter, mirroring the
// original — presumably these are the same channel; confirm with callers.
static NV_INLINE void cuiToolsNotifyFifoPushbufferCreate(CUgpfifo *gpfifo, NvU32 length, CUgpfifoEntry *cpuPutEntry, CUnvchannel *channel)
{
#if !TOOLS_ENABLED
    (void)gpfifo;
    (void)length;
    (void)cpuPutEntry;
    (void)channel;
#else
    if (!toolsCallbackEnabled(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_PUSHBUFFER_CREATE))
    {
        return;
    }

    {
        NvU32 entryCount = gpfifo->config.entryCount;
        CUtoolsFifoPushbufferCreate params = {0};

        params.struct_size = sizeof(params);
        params.ctx         = gpfifo->channel->channelManager->ctx;
        // Only resolve a host VA when the new region is non-empty.
        if (length)
        {
            params.pushBuffer = (char *)pushbufferGetHostVaddr(cpuPutEntry->pushbuffer) + cpuPutEntry->offset;
        }
        params.pushBufferLength = length;
        params.channelID        = gpfifo->channel->chID;
        params.channelType      = channelGetType(gpfifo->channel);
        // Previous ring slot: (cpuPut - 1) mod entryCount.
        params.cpuPut           = (gpfifo->cpuPut + entryCount - 1) % entryCount;
        params.gpuPut           = gpfifo->gpuPut;
        params.gpuGet           = gpfifo->gpuGet;
        params.tsgGroup         = channel->channelFlushUnit->tsgGroupID;

        toolsIssueCallback(CU_TOOLS_CB_DOMAIN_FIFO, CU_TOOLS_CBID_FIFO_PUSHBUFFER_CREATE, (void *)&params);
    }
#endif
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值