CUDA系列-QMD-2

QMD 是 CUDA 用来追踪 kernel launch 的数据结构,主要由两部分组成:QMD 和 QMD POOL。


CODE OVERVIEW

QMD POOL

// QMD pool, including headers, the QMD themselves, and other meta-data.
// One pool is created with the context; more pools are chained via next/prev
// when qmdAllocateFailsafe() runs out of free QMDs.
struct CUqmdPool_st
{
    CUctx     *ctx;             // The owning context
    CUmemobj  *poolMemobj;      // memory for the device hw QMDs (one contiguous, non-suballocated allocation)

    CUsemaPool *qmdSemaphores;  // semaphore pool; one semaphore is handed out per QMD
    NvBool     hasUnregisteredSemaPages; // If the associated semaphore pages are not made portable

    CUqmd     *pool;            // QMD headers, one per underlying QMD in the memobj
    NvU32      poolSize;        // number of QMDs in the pool

    CUqmd    **freeStack;       // stack of free entries in the pool
    NvU32      freeStackTop;    // top of the stack; entries [freeStackTop, poolSize) are free

    CUqmdPool *next;            // Next and previous pools, if we've added more 
    CUqmdPool *prev;            // pools than the one created upon ctx creation.
};

// Create a QMD pool for ctx. On devices that don't support QMDs (SM major < 3)
// this returns CUDA_SUCCESS with *pool left NULL.
CUresult qmdPoolCreate(CUctx *ctx, CUqmdPool **pool);
// Tear down a pool and all of its QMDs/semaphores; NULL-safe.
void qmdPoolDestroy(CUqmdPool *p);
// Make the QMDs' semaphore pages portable if not yet done -- presumably clears
// hasUnregisteredSemaPages; TODO confirm (definition not in view).
CUresult qmdRegisterSemaphorePoolIfNeeded(CUqmd **qmds, NvU32 count);

QMDPool Create

这里面最关键的一步是调用 semaphorePoolCreate() 创建了一个 QMD 信号量池。


// Create a QMD pool: device memory for the HW QMDs, one CPU-side header and one
// semaphore per QMD, plus a free stack. On unsupported devices (SM major < 3)
// returns CUDA_SUCCESS with *pool left NULL. On any failure, everything
// allocated so far is torn down via qmdPoolDestroy() and an error is returned.
CUresult qmdPoolCreate(CUctx *ctx, CUqmdPool **pool)
{
    CUresult status = CUDA_SUCCESS;
    CUqmdPool *p = NULL;
    CUmemobj *memobj = NULL;
    CUmemdesc memdesc;
    NvU64 qmdSize = 0;
    NvU64 devVABase;
    NvU32 i;
    NvBool enableCacheQmdSemaPool;

    CU_TRACE_FUNCTION();
    CU_ASSERT(pool);

    *pool = NULL;

    // initialize pool only on supported devices; *pool == NULL signals
    // "not supported" to the caller
    if (ctx->device->state.major < 3) {
        return CUDA_SUCCESS;
    }

    // allocate pool
    p = (CUqmdPool *)malloc(sizeof(*p));
    if (!p) {
        CU_ERROR_PRINT(("Failed to malloc QMD pool\n"));
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    memset(p, 0, sizeof(*p));
    p->ctx = ctx;

    // allocate pool cpu data structure
    p->poolSize = QMD_POOL_SIZE;

#if !(defined(PUBLIC_RELEASE) && defined(RELEASE))
    // debug override of the pool size via globals
    if (globals.qmdPoolSize) {
        p->poolSize = globals.qmdPoolSize;
        CU_DEBUG_PRINT(("Setting QMD pool size to: %u\n", p->poolSize));
    }
#endif

    enableCacheQmdSemaPool = cuiDeviceSupportsSemaphoreCaching(ctx->device);

    // Create a semaphore pool with gpu caching enabled.
    status = semaphorePoolCreate(ctx, CU_SEMA_TYPE_QMD, enableCacheQmdSemaPool, &p->qmdSemaphores);
    if (status != CUDA_SUCCESS) {
        CU_ERROR_PRINT(("Failed to create the QMD semaphore pool\n"));
        goto Error;
    }
    p->hasUnregisteredSemaPages = NV_TRUE;

    p->pool = (CUqmd *)malloc(p->poolSize * sizeof(*(p->pool)));
    if (!p->pool) {
        CU_ERROR_PRINT(("Failed to malloc pool CPU data structure\n"));
        // BUGFIX: status was previously left at CUDA_SUCCESS here, so this
        // failure path destroyed the pool yet reported success to the caller.
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    memset(p->pool, 0, p->poolSize * sizeof(*(p->pool)));

    qmdSize = ctx->device->hal.getConstant(CU_HAL_CONST_QMD_DEVICE_SIZE);

    // allocate pool memory on device. don't suballocate because if the application
    // is a mps client, and it doesn't exit cleanly, the mps server will have to
    // invalidate SKED's QMD cache on the client's behalf. Not suballocating helps the
    // server identify the base address of the QMD pool correctly.
    // QMD must be 40-bit VA.
    memset(&memdesc, 0x0, sizeof(memdesc));
    memdesc.flags.location   = CU_MEM_LOCATION_DEVICE;
    memdesc.flags.mapHost    = CU_MEM_MAP_HOST_NONE;
    memdesc.flags.mapDevice  = CU_MEM_MAP_DEVICE_PTR_FORCE_40_BIT;
    memdesc.flags.type       = CU_MEM_TYPE_QMD;
    memdesc.flags.owner      = CU_MEM_OWNER_DRIVER;
    memdesc.flags.noSuballoc = NV_TRUE;
    status = memobjAlloc(ctx->memmgr, &memdesc, qmdSize * p->poolSize, &memobj);
    if (CUDA_SUCCESS != status) {
        CU_ERROR_PRINT(("Failed to allocate QMD memobj\n"));
        goto Error;
    }
    CU_TRACE_PRINT(("Allocated QMD memobj on device\n"));
    p->poolMemobj = memobj;

    devVABase = memobjGetDeviceVaddr(p->poolMemobj);

    // initialize each QMD: point it at its slice of the device allocation and
    // give it a completion-tracking semaphore
    for (i = 0; i < p->poolSize; i++) {
        CUqmd *q = &p->pool[i];

        q->pool = p;

        q->devVA = devVABase + i * qmdSize;
        q->size = qmdSize;

        status = semaphoreAlloc(p->qmdSemaphores, &q->semaphore);
        if (CUDA_SUCCESS != status) {
            CU_ERROR_PRINT(("Failed to allocate semaphore for QMD %u\n", i));
            goto Error;
        }
    }

    // allocate the free stack; freeStackTop == 0 means all entries are free
    p->freeStack = (CUqmd **)malloc(p->poolSize * sizeof(*(p->freeStack)));
    if (!p->freeStack) {
        CU_ERROR_PRINT(("Failed to allocate pool free stack\n"));
        // BUGFIX: same as above -- this path previously returned CUDA_SUCCESS.
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    for (i = 0; i < p->poolSize; i++) {
        p->freeStack[i] = &p->pool[i];
    }
    p->freeStackTop = 0;

    *pool = p;

    return status;

Error:
    // qmdPoolDestroy is NULL-safe and tolerates partially-initialized pools
    qmdPoolDestroy(p);
    return status;
}

QMDPool Destroy

// Tear down a QMD pool created by qmdPoolCreate(). NULL-safe, and tolerates
// partially-initialized pools (every member is checked before release).
void qmdPoolDestroy(CUqmdPool *p)
{
    NvU32 idx;

    CU_TRACE_FUNCTION();

    if (!p) {
        return;
    }

    // Return each QMD's semaphore to the pool before the headers go away.
    if (p->pool) {
        for (idx = 0; idx < p->poolSize; idx++) {
            CUsema *sema = p->pool[idx].semaphore;
            if (sema) {
                semaphoreFree(sema);
            }
        }
        memset(p->pool, 0, p->poolSize * sizeof(*(p->pool)));
        free(p->pool);
    }

    if (p->freeStack) {
        free(p->freeStack);
    }

    // Release the device-side QMD storage.
    if (p->poolMemobj) {
        memobjFree(&(p->poolMemobj));
    }

    // Destroy the semaphore pool itself (all semaphores were freed above).
    semaphorePoolDestroy(&p->qmdSemaphores);

    // Scrub the header before freeing to catch use-after-free bugs early.
    memset(p, 0, sizeof(*p));
    free(p);
}

QMD

QMD 其实是对 CUsema 的一个 wrapper(封装),并附带每次 launch 的追踪信息。

// header and tracking information for each QMD (one per slot in the pool's
// device memobj; completion is tracked through the attached semaphore)
struct CUqmd_st
{
    CUqmdPool *pool;            // The owning pool
    NvU64 devVA;                // device-side QMD (40-bit VA slice of the pool memobj)
    NvU64 size;                 // size in bytes of the HW QMD

    CUsema *semaphore;          // semaphore tracking completion
    NvU32 cachedSemaValue;      // Last known value of the QMD semaphore (read/written with acquire/release atomics)

    NvBool allocated;           // if the QMD is allocated to an owner (e.g., a stream)

    struct                      // per-launch data
    {
        // a unique identifier for the launch.
        // Updated atomically so that it can be read w/o holding the marker mutex
        // (readers go through qmdGetLaunchId()).
        NvU64 id;

        // the last launch id that has done its ECC check
        // - only updated while holding CUctx::errorCheckMutex
        NvU64 lastEccCheckedLaunchId;
    } launch;

    NvBool isInActiveList;      // if the QMD is in activeQmdList in channelManager
    CUqmd *activeListPrev;      // previous QMD in activeQmdList
    CUqmd *activeListNext;      // next QMD in activeQmdList

    CUqmd *completionQmd;       // This points to a different QMD when the current QMD skips its semaphore release
    NvU64 completionQmdLaunchId;// The launch which the completionQMD must reach to signify this QMDs completion.
};


// Attempts to allocate up to count QMD's from the given pool's free stack.
// Returns the number of QMDs actually allocated (may be fewer than count if
// the stack runs dry). Caller must hold the QMD lock.
static NvU32
qmdAllocateFromPool_underQmdLock(CUqmdPool *p, NvU32 count, CUqmd **outQmds)
{
    NvU32 available;
    NvU32 toAllocate;
    NvU32 idx;

    CU_TRACE_FUNCTION();
    CU_ASSERT(p);
    CU_ASSERT(outQmds);

    // Free entries live at indices [freeStackTop, poolSize).
    available = p->poolSize - p->freeStackTop;
    toAllocate = (count < available) ? count : available;

    for (idx = 0; idx < toAllocate; ++idx) {
        // Pop the next free QMD and reset its per-allocation tracking state.
        CUqmd *qmd = p->freeStack[p->freeStackTop];
        p->freeStackTop += 1;

        qmd->allocated = NV_TRUE;
        qmd->completionQmd = NULL;
        qmd->completionQmdLaunchId = 0;
        outQmds[idx] = qmd;
    }

    return toAllocate;
}

// Adds the QMD back to the free stack of its pool, removing it from the
// channel manager's active QMD list first. Caller must hold the QMD lock.
static void
qmdFree_underQmdLock(CUqmd *qmd)
{
    CUqmdPool *p;
    CUchannelManager *channelManager;

    CU_TRACE_FUNCTION();

    // BUGFIX: validate before dereferencing. The original initialized
    // p = qmd->pool and channelManager = p->ctx->channelManager in the
    // declarations, so CU_ASSERT(qmd) and CU_ASSERT(p->freeStackTop > 0)
    // ran only after the pointers had already been dereferenced.
    CU_ASSERT(qmd);

    p = qmd->pool;
    CU_ASSERT(p);
    CU_ASSERT(p->freeStackTop > 0);

    channelManager = p->ctx->channelManager;

    // The QMD may still exist in the channel manager active QMD list.
    // Be sure to remove it from the channel manager list
    cuiMutexLock(&channelManager->markerMutex);
    channelManagerRemoveActiveQmdList_UnderLock(channelManager, qmd);
    cuiMutexUnlock(&channelManager->markerMutex);

    // Push back onto the free stack (free entries are [freeStackTop, poolSize)).
    p->freeStackTop -= 1;
    p->freeStack[ p->freeStackTop ] = qmd;
}

FUNC

从下面的函数可以看出,QMD 和 Semaphore 是强相关的。

/**
 * The QMD semaphore payload for each launch can reach three distinct values.
 * They are all offset from the base which is launch.id * 4 (CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH).
 * Initially it is set to 1 (CUI_QMD_SEMAPHORE_STATE_INITIAL), once
 * the kernel is finished it is set to 2 (CUI_QMD_SEMAPHORE_STATE_FINAL), and while the kernel is running
 * it can also reach 0 (CUI_QMD_SEMAPHORE_STATE_TEMPORARY_WITH_CNP). The temporary state is only reached
 * if the kernel starts any CNP work.
 *
 * Having a distinct value for CUI_QMD_SEMAPHORE_STATE_TEMPORARY_WITH_CNP makes it easy to distinguish
 * between a semaphore in the CUI_QMD_SEMAPHORE_STATE_FINAL state and the CUI_QMD_SEMAPHORE_STATE_TEMPORARY_WITH_CNP
 * state just by looking at the payload. This allows for an easy implementation of qmdSemaphoreIsFinalPayload()
 * which is used by the debugger.
 *
 * See qmdSemaphoreGetInitialPayload(), qmdSemaphoreGetFinalPayload() and qmdSemaphoreIsFinalPayload().
 */

#define CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH 4
// The state base increment has to be a power of 2 so that semaphore overflow doesn't break
// the property of payload modulo CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH being the current state.
// This property is assumed in qmdSemaphoreIsFinalPayload().
ct_assert((CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH & (CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH - 1)) == 0);

// Per-launch states; the semaphore payload is launchId * increment + state.
#define CUI_QMD_SEMAPHORE_STATE_TEMPORARY_WITH_CNP 0
#define CUI_QMD_SEMAPHORE_STATE_INITIAL 1
#define CUI_QMD_SEMAPHORE_STATE_FINAL 2


// Get the initial payload used for the semaphore for the current launch
NvU32 qmdSemaphoreGetInitialPayload(const CUqmd *qmd);
// Get the final payload used for the semaphore for the current launch
NvU32 qmdSemaphoreGetFinalPayload(const CUqmd *qmd);
// Get the final payload used for the semaphore for a specific launch
NvU32 qmdSemaphoreGetFinalPayloadForLaunch(const CUqmd *qmd, NvU64 launchId);
// Returns whether a payload is a final one (state bits == FINAL)
NvBool qmdSemaphoreIsFinalPayload(NvU32 payload);
// Returns the difference between the final and initial payload
NvU32 qmdSemaphoreGetDifferenceBetweenFinalAndInitial(const CUqmd *qmd);

// Initialize the QMD's semaphore for a launch (pushed as an inline memcpy)
CUnvCurrent *qmdSemaphoreInitializeForLaunch(CUnvCurrent *nvCurrent, CUctx *ctx, CUqmd *qmd);
// Fill out the completion marker entry for a QMD
void qmdGetCompletionMarkerEntry(CUqmd *qmd, CUnvchannel *channel, NvU64 trackSemValue, CUctxMarkerEntry *entry_out);

// Deallocate count QMD's from their owner (e.g., stream)
void qmdDeallocate(CUctx *ctx, CUqmd **qmds, NvU32 count);

// return true if the specific launch is complete
NvBool qmdIsCompletedForLaunch(CUqmd *qmd, NvU64 launchId);
// Atomically read the launch.id of the given QMD
NvU64 qmdGetLaunchId(const CUqmd *qmd);
// return the marker status of the qmd for the pendingLaunchId and free the qmd if it's completed
CUctxMarkerStatus qmdGetStatusForMarker(CUctxMarkerPendingQMD *pendingQmd);

// Invalidate SKED's QMD cache
// For GM20x+ chips, this invalidates all QMD caches.
// For pre-GM20x chips, this only invalidates driver QMD pool, and
// doesn't invalidate CNP qmd pool.
CUresult qmdInvalidateSkedCache(CUctx *ctx, NvU64 qmdPoolVaddr);

QMD与Sem的操作


// Atomically (seq-cst) read the QMD's launch.id so callers don't need to hold
// the marker mutex (see the CUqmd_st::launch.id comment).
NvU64 qmdGetLaunchId(const CUqmd *qmd)
{
    return cuosAtomicReadSeqCst64(&qmd->launch.id);
}

NvU32 qmdSemaphoreGetInitialPayload(const CUqmd *qmd)
{
    return (NvU32)(qmdGetLaunchId(qmd) * CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH + CUI_QMD_SEMAPHORE_STATE_INITIAL);
}

// Final semaphore payload for the given launch of this QMD:
// launchId * BASE_INCREMENT + FINAL, truncated to 32 bits.
// (qmd is unused; kept for API symmetry with the other payload getters.)
NvU32 qmdSemaphoreGetFinalPayloadForLaunch(const CUqmd *qmd, NvU64 launchId)
{
    NvU64 base = launchId * CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH;

    return (NvU32)(base + CUI_QMD_SEMAPHORE_STATE_FINAL);
}

// Final semaphore payload for the QMD's current launch (atomic launch.id read).
NvU32 qmdSemaphoreGetFinalPayload(const CUqmd *qmd)
{
    return qmdSemaphoreGetFinalPayloadForLaunch(qmd, qmdGetLaunchId(qmd));
}

// Difference between the final and initial payload of any launch (a compile-time
// constant; the qmd parameter is unused but kept for API symmetry).
NvU32 qmdSemaphoreGetDifferenceBetweenFinalAndInitial(const CUqmd *qmd)
{
    return CUI_QMD_SEMAPHORE_STATE_FINAL - CUI_QMD_SEMAPHORE_STATE_INITIAL;
}

// A payload is "final" iff its state bits (payload mod BASE_INCREMENT) equal
// the FINAL state. Relies on BASE_INCREMENT being a power of two (ct_assert'd
// above) so the modulo stays correct across 32-bit payload wraparound.
NvBool qmdSemaphoreIsFinalPayload(NvU32 payload)
{
    NvU32 state = payload % CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH;

    return state == CUI_QMD_SEMAPHORE_STATE_FINAL;
}



    // if launchId is zero, then this QMD has never been used
    if (launchId == 0) {
        return NV_TRUE;
    }

    // If the cached semaphore value shows completion for this launchId,
    // there's no need to access the actual semaphore payload
    if (CUDA_WRAPCOMPARE_A_GE_B(cuosAtomicReadAcquire32(&qmd->cachedSemaValue), qmdSemaphoreGetFinalPayloadForLaunch(qmd, launchId))) {
        return NV_TRUE;
    }

    // Cache semaphore value.
    // Here we may write a stale value to cachedSemaValue due to a race between
    // multiple threads. However that is fine because the thread(s) that are
    // waiting to see a higher value will skip the above if and get here again
    // to update cachedSemaValue.
    payload = semaphoreGetPayload(qmd->semaphore);
    cuosAtomicWriteRelease32(&qmd->cachedSemaValue, payload);

    return CUDA_WRAPCOMPARE_A_GE_B(payload, qmdSemaphoreGetFinalPayloadForLaunch(qmd, launchId));
}

// Returns whether the QMD's *current* launch (atomic launch.id read) is complete.
static NvBool
qmdIsCompleted(CUqmd *qmd)
{
    CU_TRACE_FUNCTION();
    CU_ASSERT(qmd);

    return qmdIsCompletedForLaunch(qmd, qmdGetLaunchId(qmd));
}

// Push methods that set the QMD's semaphore to the initial payload for the
// current launch. Returns the advanced push-buffer cursor (nvCurrent).
CUnvCurrent *
qmdSemaphoreInitializeForLaunch(CUnvCurrent *nvCurrent, CUctx *ctx, CUqmd *qmd)
{
    NvU32 qmdSemaphoreInitialPayload;

    CU_TRACE_FUNCTION();
    CU_ASSERT(nvCurrent);
    CU_ASSERT(ctx);
    CU_ASSERT(qmd);

    // Initialize the qmd semaphore before the launch. We use inline memcpy methods
    // to initialize the semaphore instead of a semaphore release or a host side write
    // because:
    // (1) Avoiding a host side write of the semaphore allows us to turn on L2 caching
    //     of qmd semaphores, which improves launch throughput significantly.
    // (2) An I2M is faster than a semaphore release when we need to guarantee the semaphore
    //     write is immediately visible to the SMs. If we use a compute semaphore release,
    //     a compute sysmembar needs to pushed after the release to guarantee that the
    //     write is visible before the following PCAS schedule/invalidate, which can be quite
    //     slow. If we use an I2M to initialize the semaphore, we don't need a sysmembar at all.
    //
    //     Compute I2Ms (inline memcpys) are special because they always wait for a crossbar
    //     ack before proceeding to the next method. This means that after an I2M has been
    //     completed (even one without a flush) those writes from the memcpy are guaranteed
    //     to be visible to all units in FE's virtual channel, i.e. FE, SKED, and the SMs.
    //
    qmdSemaphoreInitialPayload = qmdSemaphoreGetInitialPayload(qmd);
    return ctx->device->hal.memcpyInlineHtoD1D(
        nvCurrent,
        ctx,
        semaphoreGetOffset(qmd->semaphore),
        &qmdSemaphoreInitialPayload,
        sizeof(qmdSemaphoreInitialPayload),
        CUI_MEMCPY_MEMBAR_TYPE_NONE);
}

// Fill out a pending-QMD marker entry for the QMD's current launch.
// channel may be NULL while the push hasn't finished yet (see
// qmdGetStatusForMarker, which treats a NULL channel as "methods not pushed").
void
qmdGetCompletionMarkerEntry(CUqmd *qmd, CUnvchannel *channel, NvU64 trackSemValue, CUctxMarkerEntry *entry_out)
{
    CU_ASSERT(qmd);
    CU_ASSERT(entry_out);

    entry_out->type = CU_CTX_MARKER_ENTRY_TYPE_PENDING_QMD;
    entry_out->data.pendingQmd.qmd = qmd;
    // BUGFIX: read launch.id through the atomic accessor. CUqmd_st documents
    // that launch.id is "updated atomically so that it can be read w/o holding
    // the marker mutex", but the original read the field directly, risking a
    // torn read if a concurrent launch updates it.
    entry_out->data.pendingQmd.launchId = qmdGetLaunchId(qmd);
    entry_out->data.pendingQmd.launch.channel = channel;
    entry_out->data.pendingQmd.launch.trackingSemaphoreValue = trackSemValue;
}

// Resolve the marker status for a pending QMD entry:
//   METHODS_NOT_PUSHED        -> push not finished (channel still NULL)
//   METHODS_NOT_FLUSHED_TO_GPU-> methods written but not yet flushed
//   COMPLETED_BY_GPU          -> the tracked launch has finished
//   METHODS_FLUSHED_TO_GPU    -> flushed, still running
// As a side effect, a fully-idle QMD is removed from the channel manager's
// active list so ctxSynchronize stops polling it.
CUctxMarkerStatus
qmdGetStatusForMarker(CUctxMarkerPendingQMD *pendingQmd)
{
    CUqmd *qmd = pendingQmd->qmd;
    NvU64 pendingLaunchId = pendingQmd->launchId;
    CUnvchannel *channel = pendingQmd->launch.channel;

    if (!channel) {
        // If the returned channel is NULL but the QMD hasn't been reused, the
        // pending qmd has been added to a marker but the push hasn't been
        // finished yet.
        return CU_CTX_MARKER_METHODS_NOT_PUSHED;
    }

    if (!trackingSemaHasFlushedValue(&channel->trackingSemaphoreData, pendingQmd->launch.trackingSemaphoreValue)) {
        return CU_CTX_MARKER_METHODS_NOT_FLUSHED_TO_GPU;
    }

    if (qmdIsCompletedForLaunch(qmd, pendingLaunchId)) {
        CUchannelManager *channelManager = qmd->pool->ctx->channelManager;

        // If this QMD is not tracking any subsequent, unfinished
        // kernels, we would want to remove it from channel
        // manager's active QMD list to avoid querying it on
        // every ctxSynchronize.
        // Note: qmdIsCompleted re-checks against the *current* launch.id,
        // which may be newer than pendingLaunchId if the QMD was reused.
        cuiMutexLock(&channelManager->markerMutex);
        if (qmdIsCompleted(qmd)) {
            channelManagerRemoveActiveQmdList_UnderLock(channelManager, qmd);
        }
        cuiMutexUnlock(&channelManager->markerMutex);

        // In any case, this QMD indicates that the kernel with the
        // given launchId is finished
        return CU_CTX_MARKER_COMPLETED_BY_GPU;
    }

    return CU_CTX_MARKER_METHODS_FLUSHED_TO_GPU;
}

// Called during qmdAllocateFailsafe where we realize we don't have enough
// QMD's in the free list to allocate. This iterates over pool's QMD's and
// frees up those QMD's that are no longer associated to an owner (i.e.,
// !allocated) and are complete.
// NOTE(review): completionQmd may be NULL for a never-deallocated QMD;
// presumably qmdIsCompletedForLaunch's launchId==0 fast path keeps that safe
// without dereferencing it -- confirm against its full definition. Also, this
// assumes the pool's free stack is empty when called (qmdFree_underQmdLock
// asserts freeStackTop > 0), which holds when failsafe step 1 exhausted it.
static void
qmdFreeCompleted_underQmdLock(CUqmdPool *pool)
{
    NvU32 i;

    CU_TRACE_FUNCTION();
    CU_ASSERT(pool);
    for (i = 0; i < pool->poolSize; ++i) {
        CUqmd *qmd = &pool->pool[i];
        if (!qmd->allocated && qmdIsCompletedForLaunch(qmd->completionQmd, qmd->completionQmdLaunchId)) {
            qmdFree_underQmdLock(qmd);
        }
    }
}

// Deallocates count qmds from their owner (e.g., a stream). NULL entries in
// qmds are skipped. The QMDs are not returned to the free stack here; they
// are merely marked reclaimable and swept later by qmdAllocateFailsafe.
void
qmdDeallocate(CUctx *ctx, CUqmd **qmds, NvU32 count)
{
    NvU32 idx;

    CU_ASSERT(ctx);
    CU_ASSERT(qmds);

    cuiMutexLock(&ctx->qmdPoolsMutex);

    for (idx = 0; idx < count; ++idx) {
        CUqmd *qmd = qmds[idx];

        if (!qmd) {
            continue;
        }

        // Marking the QMD as !allocated allows qmdAllocateFailsafe to free it
        // later on. Note that the QMD might not be finished yet when the owner
        // is deallocating it (e.g., when a stream is detaching), so it's not a
        // bad idea to give it more time (to potentially finish) and reclaim it
        // later in qmdAllocateFailsafe, when needed.
        qmd->allocated = NV_FALSE;

        // Record which launch must complete before this QMD can be reclaimed,
        // unless a completion QMD was already assigned.
        if (qmd->completionQmd == NULL) {
            qmd->completionQmdLaunchId = qmdGetLaunchId(qmd);
            qmd->completionQmd = qmd;
        }
    }

    cuiMutexUnlock(&ctx->qmdPoolsMutex);
}

// Allocate count number of QMD's for an owner (e.g., a stream).
// Allocate a new pool if not enough QMD's are available.
// Strategy: (1) drain the existing pools' free stacks, (2) reclaim abandoned
// completed QMDs and retry, (3) grow by creating new pools until satisfied.
CUresult qmdAllocateFailsafe(CUctx *ctx, NvU32 count, CUqmd **outQmds)
{
    CUresult status = CUDA_SUCCESS;
    NvU32 numQmdsAllocated = 0;

    CU_TRACE_FUNCTION();
    CU_ASSERT(ctx);
    CU_ASSERT(outQmds);

    cuiMutexLock(&ctx->qmdPoolsMutex);

    CUqmdPool *p = ctx->qmdPool;
    CUqmdPool *pool;

    // 1. See if there are enough free QMD's in the existing QMD
    // pools' free lists.
    pool = p;
    do {
        numQmdsAllocated += qmdAllocateFromPool_underQmdLock(pool, (count - numQmdsAllocated), outQmds + numQmdsAllocated);
        if (numQmdsAllocated == count) {
            goto Done;
        }

        pool = pool->next;

    } while (pool);


    // 2. Not enough free QMD's in the pools' free lists. But let's
    // take a closer look at the pools and recalim abandoned QMDs
    // by destroyed owners.
    // (Reaching here means step 1 emptied every pool's free stack.)
    pool = p;
    do {
        qmdFreeCompleted_underQmdLock(pool);
        numQmdsAllocated += qmdAllocateFromPool_underQmdLock(pool, (count - numQmdsAllocated), outQmds + numQmdsAllocated);
        if (numQmdsAllocated == count) {
            goto Done;
        }

        pool = pool->next;
    } while (pool);


    // 3. Not enough abandoned QMD's. Let's allocate one/several
    // new pool(s) of QMD's to allocate the remaining QMD's from.
    // Note that resizing the existing QMD pool is another option,
    // but a very difficult one, since we need to resize QMD
    // device storage and its associated semaphore pool, both of
    // which are largely in-use at this point.
    // NOTE(review): assumes qmdPoolCreate yields a non-NULL pool on success;
    // true here since ctx->qmdPool exists, implying a supported device.
    do {
        status = qmdPoolCreate(p->ctx, &pool);
        if (CUDA_SUCCESS != status) {
            goto Done;
        }

        // Add the new pool to the linked list of ctx's QMD pools
        CUDA_LIST_INSERT_PREVNEXT(p->ctx->qmdPool, pool, prev, next);

        // We should be able to allocate up to QMD_POOL_SIZE QMD's here
        numQmdsAllocated += qmdAllocateFromPool_underQmdLock(pool, (count - numQmdsAllocated), outQmds + numQmdsAllocated);
        if (numQmdsAllocated == count) {
            goto Done;
        }
    } while (NV_TRUE);

Done:
    cuiMutexUnlock(&ctx->qmdPoolsMutex);
    return status;
}

// Invalidate SKED's QMD cache for the pool at qmdPoolVaddr, then synchronize
// the context so the invalidate has completed before returning. No-op on GPUs
// without CNP support (they have no SKED QMD cache).
CUresult qmdInvalidateSkedCache(CUctx *ctx, NvU64 qmdPoolVaddr)
{
    CUresult status = CUDA_SUCCESS;
    CUnvchannel *channel = NULL;
    CUnvCurrent *nvCurrent = NULL;

    CU_ASSERT(ctx);
    CU_ASSERT(qmdPoolVaddr);

    if (!ctx->device->state.supportsCnp) {
        // Only GPUs supporting CNP have a cache in SKED to cache QMDs.
        // For the rest, there's nothing to invalidate.
        return CUDA_SUCCESS;
    }

    if (cuiGlobalsIsLegacyMpsServer()) {
        // The mps server may be invalidating the cache on a client's behalf.
        // So acquire a marker in the barrier stream to make sure that the
        // invalidate and wait-for-idle being pushed (below) will be executed after
        // all client's work has finished. This is necessary because the mps server's
        // barrier stream does not implicitly synchronize with client's work.
        mpsServerAcquirePendingWorkMarker(ctx);
    }

    // Push the invalidate on the barrier stream's compute channel.
    streamBeginPush(ctx->streamManager, CU_CHANNEL_COMPUTE, ctx->barrierStream, &nvCurrent, &channel);
    nvCurrent = ctx->device->hal.invalSkedCache(nvCurrent, qmdPoolVaddr, ctx->qmdPool->poolSize);
    streamEndPush(ctx->barrierStream, nvCurrent, NULL);

    // Wait for the GPU to execute the invalidate before returning.
    status = cuiCtxSynchronize(ctx);

    return status;
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值