QMD 是 CUDA 驱动用来追踪 kernel launch 的数据结构,主要由两部分组成:QMD 本身和 QMD Pool。
CODE OVERVIEW
QMD POOL
// QMD pool, including headers, the QMDs themselves, and other meta-data.
// One pool exists per context; additional pools are chained via next/prev
// when qmdAllocateFailsafe() runs out of free QMDs.
struct CUqmdPool_st
{
CUctx *ctx; // The owning context
CUmemobj *poolMemobj; // memory for the device hw QMDs
CUsemaPool *qmdSemaphores; // semaphore pool backing the per-QMD completion semaphores
NvBool hasUnregisteredSemaPages; // If the associated semaphore pages are not made portable
CUqmd *pool; // QMD headers, one per underlying QMD in the memobj
NvU32 poolSize; // number of QMDs in the pool
CUqmd **freeStack; // stack of free entries in the pool; free entries live at [freeStackTop, poolSize)
NvU32 freeStackTop; // top of the stack (equals the number of QMDs currently handed out)
CUqmdPool *next; // Next and previous pools, if we've added more
CUqmdPool *prev; // pools than the one created upon ctx creation.
};
// Create a QMD pool for ctx. On unsupported devices (compute major < 3)
// returns CUDA_SUCCESS with *pool left NULL.
CUresult qmdPoolCreate(CUctx *ctx, CUqmdPool **pool);
// Tear down a pool; tolerates NULL and partially-initialized pools.
void qmdPoolDestroy(CUqmdPool *p);
// Register the semaphore pages of the given QMDs' pools if not yet portable.
// NOTE(review): implementation not visible in this chunk — confirm semantics.
CUresult qmdRegisterSemaphorePoolIfNeeded(CUqmd **qmds, NvU32 count);
QMDPool Create
这里面最关键的一步是调用 semaphorePoolCreate() 创建了一个 semaphore pool。
// Create a QMD pool: allocates the CPU-side QMD headers, a (optionally
// GPU-cached) semaphore pool, device memory for the HW QMDs, and the free
// stack used to hand QMDs out.
//
// Returns CUDA_SUCCESS with *pool == NULL on devices that don't support
// QMDs (compute major < 3). On failure, everything allocated so far is torn
// down via qmdPoolDestroy() and an error status is returned.
CUresult qmdPoolCreate(CUctx *ctx, CUqmdPool **pool)
{
    CUresult status = CUDA_SUCCESS;
    CUqmdPool *p = NULL;
    CUmemobj *memobj = NULL;
    CUmemdesc memdesc;
    NvU64 qmdSize = 0;
    NvU64 devVABase;
    NvU32 i;
    NvBool enableCacheQmdSemaPool;
    CU_TRACE_FUNCTION();
    CU_ASSERT(pool);
    *pool = NULL;
    // initialize pool only on supported devices
    if (ctx->device->state.major < 3) {
        return CUDA_SUCCESS;
    }
    // allocate pool
    p = (CUqmdPool *)malloc(sizeof(*p));
    if (!p) {
        CU_ERROR_PRINT(("Failed to malloc QMD pool\n"));
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    memset(p, 0, sizeof(*p));
    p->ctx = ctx;
    // allocate pool cpu data structure
    p->poolSize = QMD_POOL_SIZE;
#if !(defined(PUBLIC_RELEASE) && defined(RELEASE))
    // Internal builds may override the pool size for testing.
    if (globals.qmdPoolSize) {
        p->poolSize = globals.qmdPoolSize;
        CU_DEBUG_PRINT(("Setting QMD pool size to: %u\n", p->poolSize));
    }
#endif
    enableCacheQmdSemaPool = cuiDeviceSupportsSemaphoreCaching(ctx->device);
    // Create a semaphore pool with gpu caching enabled.
    status = semaphorePoolCreate(ctx, CU_SEMA_TYPE_QMD, enableCacheQmdSemaPool, &p->qmdSemaphores);
    if (status != CUDA_SUCCESS) {
        CU_ERROR_PRINT(("Failed to create the QMD semaphore pool\n"));
        goto Error;
    }
    p->hasUnregisteredSemaPages = NV_TRUE;
    p->pool = (CUqmd *)malloc(p->poolSize * sizeof(*(p->pool)));
    if (!p->pool) {
        CU_ERROR_PRINT(("Failed to malloc pool CPU data structure\n"));
        // BUGFIX: status was previously left as CUDA_SUCCESS here, so the
        // caller saw success with *pool == NULL after a partial teardown.
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    memset(p->pool, 0, p->poolSize * sizeof(*(p->pool)));
    qmdSize = ctx->device->hal.getConstant(CU_HAL_CONST_QMD_DEVICE_SIZE);
    // allocate pool memory on device. don't suballocate because if the application
    // is a mps client, and it doesn't exit cleanly, the mps server will have to
    // invalidate SKED's QMD cache on the client's behalf. Not suballocating helps the
    // server identify the base address of the QMD pool correctly.
    // QMD must be 40-bit VA.
    memset(&memdesc, 0x0, sizeof(memdesc));
    memdesc.flags.location = CU_MEM_LOCATION_DEVICE;
    memdesc.flags.mapHost = CU_MEM_MAP_HOST_NONE;
    memdesc.flags.mapDevice = CU_MEM_MAP_DEVICE_PTR_FORCE_40_BIT;
    memdesc.flags.type = CU_MEM_TYPE_QMD;
    memdesc.flags.owner = CU_MEM_OWNER_DRIVER;
    memdesc.flags.noSuballoc = NV_TRUE;
    status = memobjAlloc(ctx->memmgr, &memdesc, qmdSize * p->poolSize, &memobj);
    if (CUDA_SUCCESS != status) {
        CU_ERROR_PRINT(("Failed to allocate QMD memobj\n"));
        goto Error;
    }
    CU_TRACE_PRINT(("Allocated QMD memobj on device\n"));
    p->poolMemobj = memobj;
    devVABase = memobjGetDeviceVaddr(p->poolMemobj);
    // initialize each QMD: give it a slice of the device memobj and a
    // completion semaphore from the pool created above
    for (i = 0; i < p->poolSize; i++) {
        CUqmd *q = &p->pool[i];
        q->pool = p;
        q->devVA = devVABase + i * qmdSize;
        q->size = qmdSize;
        status = semaphoreAlloc(p->qmdSemaphores, &q->semaphore);
        if (CUDA_SUCCESS != status) {
            CU_ERROR_PRINT(("Failed to allocate semaphore for QMD %u\n", i));
            goto Error;
        }
    }
    // allocate the free stack; initially every QMD is free
    p->freeStack = (CUqmd **)malloc(p->poolSize * sizeof(*(p->freeStack)));
    if (!p->freeStack) {
        CU_ERROR_PRINT(("Failed to allocate pool free stack\n"));
        // BUGFIX: propagate OOM instead of returning a stale CUDA_SUCCESS.
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    for (i = 0; i < p->poolSize; i++) {
        p->freeStack[i] = &p->pool[i];
    }
    p->freeStackTop = 0;
    *pool = p;
    return status;
Error:
    // qmdPoolDestroy handles NULL and partially-initialized pools.
    qmdPoolDestroy(p);
    return status;
}
QMDPool Destroy
// Tear down a QMD pool created by qmdPoolCreate().
// Safe to call with p == NULL or with a partially-initialized pool (this is
// the error path of qmdPoolCreate()).
void qmdPoolDestroy(CUqmdPool *p)
{
    CU_TRACE_FUNCTION();
    if (!p) {
        return;
    }
    if (p->pool) {
        NvU32 i;
        // Release each QMD's completion semaphore back to the semaphore pool
        // before that pool itself is destroyed below.
        for (i = 0; i < p->poolSize; i++) {
            if (p->pool[i].semaphore) {
                semaphoreFree(p->pool[i].semaphore);
            }
        }
        // Scrub the headers so a stale pointer to them is easier to spot.
        memset(p->pool, 0, p->poolSize * sizeof(*(p->pool)));
        free(p->pool);
    }
    if (p->poolMemobj) {
        memobjFree(&(p->poolMemobj));
    }
    // free(NULL) is a no-op, so no NULL guard is needed here.
    free(p->freeStack);
    semaphorePoolDestroy(&p->qmdSemaphores);
    memset(p, 0, sizeof(*p));
    free(p);
}
QMD
QMD 其实是对 CUsema 的一个封装(wrapper),为每次 launch 追踪对应的 semaphore 完成状态。
// Header and tracking information for each QMD.
// A CUqmd wraps one slice of the pool's device memobj (the HW QMD) together
// with the semaphore used to track completion of launches on that QMD.
struct CUqmd_st
{
CUqmdPool *pool; // The owning pool
NvU64 devVA; // device-side QMD
NvU64 size; // size in bytes of the HW QMD
CUsema *semaphore; // semaphore tracking completion
NvU32 cachedSemaValue; // Last known value of the QMD semaphore (read/written with acquire/release atomics)
NvBool allocated; // if the QMD is allocated to an owner (e.g., a stream)
struct // per-launch data
{
// a unique identifier for the launch.
// Updated atomically so that it can be read w/o holding the marker mutex
NvU64 id;
// the last launch id that has done its ECC check
// - only updated while holding CUctx::errorCheckMutex
NvU64 lastEccCheckedLaunchId;
} launch;
NvBool isInActiveList; // if the QMD is in activeQmdList in channelManager
CUqmd *activeListPrev; // previous QMD in activeQmdList
CUqmd *activeListNext; // next QMD in activeQmdList
CUqmd *completionQmd; // This points to a different QMD when the current QMD skips its semaphore release
NvU64 completionQmdLaunchId;// The launch which the completionQMD must reach to signify this QMDs completion.
};
// Attempts to allocate count QMD's from the given pool. Returns the number
// of QMDs actually allocated (may be fewer than count if the free stack runs
// out). Caller must hold the QMD pool lock.
static NvU32
qmdAllocateFromPool_underQmdLock(CUqmdPool *p, NvU32 count, CUqmd **outQmds)
{
    NvU32 available;
    NvU32 toAllocate;
    NvU32 i;
    CU_TRACE_FUNCTION();
    CU_ASSERT(p);
    CU_ASSERT(outQmds);
    // Clamp the request to what the free stack can still provide.
    available = p->poolSize - p->freeStackTop;
    toAllocate = (available >= count) ? count : available;
    for (i = 0; i < toAllocate; ++i) {
        CUqmd *q = p->freeStack[p->freeStackTop++];
        // Reset per-allocation tracking state before handing the QMD out.
        q->allocated = NV_TRUE;
        q->completionQmd = NULL;
        q->completionQmdLaunchId = 0;
        outQmds[i] = q;
    }
    return toAllocate;
}
// Adds the QMD back to the free list of the pool.
// Caller must hold the QMD pool lock.
static void
qmdFree_underQmdLock(CUqmd *qmd)
{
    CUqmdPool *p;
    CUchannelManager *channelManager;
    CU_TRACE_FUNCTION();
    // BUGFIX: the asserts previously ran AFTER qmd->pool and p->ctx were
    // already dereferenced, so they could never catch a NULL argument.
    // Validate first, then dereference.
    CU_ASSERT(qmd);
    p = qmd->pool;
    CU_ASSERT(p);
    CU_ASSERT(p->freeStackTop > 0);
    channelManager = p->ctx->channelManager;
    // The QMD may still exist in the channel manager active QMD list.
    // Be sure to remove it from the channel manager list.
    cuiMutexLock(&channelManager->markerMutex);
    channelManagerRemoveActiveQmdList_UnderLock(channelManager, qmd);
    cuiMutexUnlock(&channelManager->markerMutex);
    // Push the QMD back onto the free stack.
    p->freeStackTop -= 1;
    p->freeStack[ p->freeStackTop ] = qmd;
}
FUNC
从下面这些函数可以看出,QMD 与 semaphore 的状态管理强相关。
/**
* The QMD semaphore payload for each launch can reach three distinct values.
* They are all offset from the base which is launch.id * 4 (CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH).
* Initially it is set to 1 (CUI_QMD_SEMAPHORE_STATE_INITIAL), once
* the kernel is finished it is set to 2 (CUI_QMD_SEMAPHORE_STATE_FINAL), and while the kernel is running
* it can also reach 0 (CUI_QMD_SEMAPHORE_STATE_TEMPORARY_WITH_CNP). The temporary state is only reached
* if the kernel starts any CNP work.
*
* Having a distinct value for CUI_QMD_SEMAPHORE_STATE_TEMPORARY_WITH_CNP makes it easy to distinguish
* between a semaphore in the CUI_QMD_SEMAPHORE_STATE_FINAL state and the CUI_QMD_SEMAPHORE_STATE_TEMPORARY_WITH_CNP
* state just by looking at the payload. This allows for an easy implementation of qmdSemaphoreIsFinalPayload()
* which is used by the debugger.
*
* See qmdSemaphoreGetInitialPayload(), qmdSemaphoreGetFinalPayload() and qmdSemaphoreIsFinalPayload().
*/
// Per-launch stride of the semaphore payload: launch N occupies payload
// values [N*4, N*4+3]; the low two bits encode the launch's state.
#define CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH 4
// The state base increment has to be a power of 2 so that semaphore overflow doesn't break
// the property of payload modulo CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH being the current state.
// This property is assumed in qmdSemaphoreIsFinalPayload().
ct_assert((CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH & (CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH - 1)) == 0);
// State offsets within a launch's payload range (see the comment block above).
#define CUI_QMD_SEMAPHORE_STATE_TEMPORARY_WITH_CNP 0
#define CUI_QMD_SEMAPHORE_STATE_INITIAL 1
#define CUI_QMD_SEMAPHORE_STATE_FINAL 2
// Get the initial payload used for the semaphore for current launch
NvU32 qmdSemaphoreGetInitialPayload(const CUqmd *qmd);
// Get the final payload used for the semaphore for current launch
NvU32 qmdSemaphoreGetFinalPayload(const CUqmd *qmd);
// Get the final payload used for the semaphore for a launch
NvU32 qmdSemaphoreGetFinalPayloadForLaunch(const CUqmd *qmd, NvU64 launchId);
// Returns whether a payload is a final one (payload mod stride == FINAL)
NvBool qmdSemaphoreIsFinalPayload(NvU32 payload);
// Returns the difference between the final and initial payload
NvU32 qmdSemaphoreGetDifferenceBetweenFinalAndInitial(const CUqmd *qmd);
// Initialize the QMD's semaphore for a launch (via an inline memcpy / I2M)
CUnvCurrent *qmdSemaphoreInitializeForLaunch(CUnvCurrent *nvCurrent, CUctx *ctx, CUqmd *qmd);
// Get the completion marker entry for a QMD
void qmdGetCompletionMarkerEntry(CUqmd *qmd, CUnvchannel *channel, NvU64 trackSemValue, CUctxMarkerEntry *entry_out);
// Deallocate count QMD's from their owner (e.g., stream)
void qmdDeallocate(CUctx *ctx, CUqmd **qmds, NvU32 count);
// return true if the specific launch is complete
NvBool qmdIsCompletedForLaunch(CUqmd *qmd, NvU64 launchId);
// Atomically read the launch.id of the given QMD
NvU64 qmdGetLaunchId(const CUqmd *qmd);
// return the marker status of the qmd for the pendingLaunchId and free the qmd if it's completed
CUctxMarkerStatus qmdGetStatusForMarker(CUctxMarkerPendingQMD *pendingQmd);
// Invalidate SKED's QMD cache
// For GM20x+ chips, this invalidates all QMD caches.
// For pre-GM20x chips, this only invalidates driver QMD pool, and
// doesn't invalidate CNP qmd pool.
CUresult qmdInvalidateSkedCache(CUctx *ctx, NvU64 qmdPoolVaddr);
QMD与Sem的操作
// Atomically read the current launch id of the QMD.
// Safe to call without holding the marker mutex (launch.id is updated
// atomically for exactly this purpose).
NvU64 qmdGetLaunchId(const CUqmd *qmd)
{
    const NvU64 currentId = cuosAtomicReadSeqCst64(&qmd->launch.id);
    return currentId;
}
NvU32 qmdSemaphoreGetInitialPayload(const CUqmd *qmd)
{
return (NvU32)(qmdGetLaunchId(qmd) * CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH + CUI_QMD_SEMAPHORE_STATE_INITIAL);
}
// Compute the semaphore payload that marks the given launch as finished:
// launchId * stride + FINAL state offset.
NvU32 qmdSemaphoreGetFinalPayloadForLaunch(const CUqmd *qmd, NvU64 launchId)
{
    const NvU64 launchBase = launchId * CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH;
    return (NvU32)(launchBase + CUI_QMD_SEMAPHORE_STATE_FINAL);
}
NvU32 qmdSemaphoreGetFinalPayload(const CUqmd *qmd)
{
return qmdSemaphoreGetFinalPayloadForLaunch(qmd, qmdGetLaunchId(qmd));
}
// Distance between a launch's FINAL and INITIAL payload values.
// Independent of the QMD; the parameter is kept for interface symmetry.
NvU32 qmdSemaphoreGetDifferenceBetweenFinalAndInitial(const CUqmd *qmd)
{
    (void)qmd;
    return CUI_QMD_SEMAPHORE_STATE_FINAL - CUI_QMD_SEMAPHORE_STATE_INITIAL;
}
// True if the payload's state (low bits modulo the per-launch stride) is
// FINAL. Relies on the stride being a power of 2 so wraparound preserves
// the modulo property (see ct_assert above).
NvBool qmdSemaphoreIsFinalPayload(NvU32 payload)
{
    const NvU32 state = payload % CUI_QMD_SEMAPHORE_STATE_BASE_INCREMENT_BY_LAUNCH;
    return state == CUI_QMD_SEMAPHORE_STATE_FINAL;
}
// if launchId is zero, then this QMD has never been used
if (launchId == 0) {
return NV_TRUE;
}
// If the cached semaphore value shows completion for this launchId,
// there's no need to access the actual semaphore payload
if (CUDA_WRAPCOMPARE_A_GE_B(cuosAtomicReadAcquire32(&qmd->cachedSemaValue), qmdSemaphoreGetFinalPayloadForLaunch(qmd, launchId))) {
return NV_TRUE;
}
// Cache semaphore value.
// Here we may write a stale value to cachedSemaValue due to a race between
// multiple threads. However that is fine because the thread(s) that are
// waiting to see a higher value will skip the above if and get here again
// to update cachedSemaValue.
payload = semaphoreGetPayload(qmd->semaphore);
cuosAtomicWriteRelease32(&qmd->cachedSemaValue, payload);
return CUDA_WRAPCOMPARE_A_GE_B(payload, qmdSemaphoreGetFinalPayloadForLaunch(qmd, launchId));
}
// True if the QMD's most recent launch has completed.
static NvBool
qmdIsCompleted(CUqmd *qmd)
{
    NvU64 latestLaunchId;
    CU_TRACE_FUNCTION();
    CU_ASSERT(qmd);
    latestLaunchId = qmdGetLaunchId(qmd);
    return qmdIsCompletedForLaunch(qmd, latestLaunchId);
}
// Seed the QMD's semaphore with the initial payload for its current launch.
CUnvCurrent *
qmdSemaphoreInitializeForLaunch(CUnvCurrent *nvCurrent, CUctx *ctx, CUqmd *qmd)
{
    NvU32 initialPayload;
    CU_TRACE_FUNCTION();
    CU_ASSERT(nvCurrent);
    CU_ASSERT(ctx);
    CU_ASSERT(qmd);
    // The semaphore is initialized with an inline memcpy (I2M) rather than a
    // semaphore release or a host-side write, because:
    // (1) avoiding host-side writes lets us keep QMD semaphores L2-cached,
    //     which improves launch throughput significantly;
    // (2) an I2M is faster than a semaphore release when the write must be
    //     immediately visible to the SMs — a compute release would require a
    //     compute sysmembar before the following PCAS schedule/invalidate,
    //     which can be quite slow, whereas an I2M needs no sysmembar at all.
    //
    // Compute I2Ms are special: they always wait for a crossbar ack before
    // proceeding to the next method, so once the I2M completes (even without
    // a flush) its writes are guaranteed visible to all units in FE's virtual
    // channel, i.e. FE, SKED, and the SMs.
    initialPayload = qmdSemaphoreGetInitialPayload(qmd);
    return ctx->device->hal.memcpyInlineHtoD1D(
        nvCurrent,
        ctx,
        semaphoreGetOffset(qmd->semaphore),
        &initialPayload,
        sizeof(initialPayload),
        CUI_MEMCPY_MEMBAR_TYPE_NONE);
}
// Fill in a context-marker entry that tracks completion of the QMD's current
// launch on the given channel.
void
qmdGetCompletionMarkerEntry(CUqmd *qmd, CUnvchannel *channel, NvU64 trackSemValue, CUctxMarkerEntry *entry_out)
{
    entry_out->type = CU_CTX_MARKER_ENTRY_TYPE_PENDING_QMD;
    entry_out->data.pendingQmd.qmd = qmd;
    // BUGFIX(consistency): launch.id is documented as "updated atomically so
    // that it can be read w/o holding the marker mutex"; read it through the
    // atomic accessor like every other reader, instead of a plain load that
    // could tear or race on the 64-bit value.
    entry_out->data.pendingQmd.launchId = qmdGetLaunchId(qmd);
    entry_out->data.pendingQmd.launch.channel = channel;
    entry_out->data.pendingQmd.launch.trackingSemaphoreValue = trackSemValue;
}
// Return the marker status of the pending QMD for its recorded launch id,
// and prune the QMD from the channel manager's active list once it is fully
// idle. Progression of the returned states:
//   NOT_PUSHED -> NOT_FLUSHED_TO_GPU -> FLUSHED_TO_GPU -> COMPLETED_BY_GPU
CUctxMarkerStatus
qmdGetStatusForMarker(CUctxMarkerPendingQMD *pendingQmd)
{
CUqmd *qmd = pendingQmd->qmd;
NvU64 pendingLaunchId = pendingQmd->launchId;
CUnvchannel *channel = pendingQmd->launch.channel;
if (!channel) {
// If the returned channel is NULL but the QMD hasn't been reused, the
// pending qmd has been added to a marker but the push hasn't been
// finished yet.
return CU_CTX_MARKER_METHODS_NOT_PUSHED;
}
// The launch methods were pushed but may not have reached the GPU yet.
if (!trackingSemaHasFlushedValue(&channel->trackingSemaphoreData, pendingQmd->launch.trackingSemaphoreValue)) {
return CU_CTX_MARKER_METHODS_NOT_FLUSHED_TO_GPU;
}
if (qmdIsCompletedForLaunch(qmd, pendingLaunchId)) {
CUchannelManager *channelManager = qmd->pool->ctx->channelManager;
// If this QMD is not tracking any subsequent, unfinished
// kernels, we would want to remove it from channel
// manager's active QMD list to avoid querying it on
// every ctxSynchronize.
cuiMutexLock(&channelManager->markerMutex);
// Note: qmdIsCompleted() checks the QMD's *latest* launch, which may be
// newer than pendingLaunchId if the QMD was reused for another kernel.
if (qmdIsCompleted(qmd)) {
channelManagerRemoveActiveQmdList_UnderLock(channelManager, qmd);
}
cuiMutexUnlock(&channelManager->markerMutex);
// In any case, this QMD indicates that the kernel with the
// given launchId is finished
return CU_CTX_MARKER_COMPLETED_BY_GPU;
}
return CU_CTX_MARKER_METHODS_FLUSHED_TO_GPU;
}
// Called during qmdAllocateFailsafe where we realize we don't have enough
// QMD's in the free list to allocate. This iterates over pool's QMD's and
// frees up those QMD's that are no longer associated to an owner (i.e.,
// !allocated) and are complete.
static void
qmdFreeCompleted_underQmdLock(CUqmdPool *pool)
{
NvU32 i;
CU_TRACE_FUNCTION();
CU_ASSERT(pool);
for (i = 0; i < pool->poolSize; ++i) {
CUqmd *qmd = &pool->pool[i];
// completionQmd/completionQmdLaunchId were recorded by qmdDeallocate()
// when the owner detached; once that launch completes, this QMD's last
// work is known finished and it can be recycled.
// NOTE(review): this loop also visits QMDs that may already sit in the
// free stack (allocated == NV_FALSE with completionQmd possibly still
// set from a prior deallocation) — confirm qmdFree_underQmdLock cannot
// double-insert such entries into the free stack.
if (!qmd->allocated && qmdIsCompletedForLaunch(qmd->completionQmd, qmd->completionQmdLaunchId)) {
qmdFree_underQmdLock(qmd);
}
}
}
// Deallocates count qmds from the owner (e.g., a stream).
// NULL entries in the array are skipped.
void
qmdDeallocate(CUctx *ctx, CUqmd **qmds, NvU32 count)
{
    NvU32 idx;
    CU_ASSERT(ctx);
    CU_ASSERT(qmds);
    cuiMutexLock(&ctx->qmdPoolsMutex);
    for (idx = 0; idx < count; ++idx) {
        CUqmd *current = qmds[idx];
        if (!current) {
            continue;
        }
        // Clearing `allocated` lets qmdAllocateFailsafe reclaim this QMD
        // later. The QMD might not be finished when the owner deallocates it
        // (e.g. when a stream is detaching), so instead of waiting here we
        // give it more time and record which launch must finish before the
        // QMD can be reclaimed in qmdAllocateFailsafe.
        current->allocated = NV_FALSE;
        if (current->completionQmd == NULL) {
            current->completionQmdLaunchId = qmdGetLaunchId(current);
            current->completionQmd = current;
        }
    }
    cuiMutexUnlock(&ctx->qmdPoolsMutex);
}
// Allocate count number of QMD's for an owner (e.g., a stream).
// Allocate a new pool if not enough QMD's are available.
// Strategy (all under ctx->qmdPoolsMutex):
//   1. take from the existing pools' free stacks;
//   2. reclaim abandoned-but-completed QMDs, then retry;
//   3. create new pools until satisfied or pool creation fails.
CUresult qmdAllocateFailsafe(CUctx *ctx, NvU32 count, CUqmd **outQmds)
{
CUresult status = CUDA_SUCCESS;
NvU32 numQmdsAllocated = 0;
CU_TRACE_FUNCTION();
CU_ASSERT(ctx);
CU_ASSERT(outQmds);
cuiMutexLock(&ctx->qmdPoolsMutex);
CUqmdPool *p = ctx->qmdPool;
CUqmdPool *pool;
// 1. See if there are enough free QMD's in the existing QMD
// pools' free lists.
pool = p;
do {
numQmdsAllocated += qmdAllocateFromPool_underQmdLock(pool, (count - numQmdsAllocated), outQmds + numQmdsAllocated);
if (numQmdsAllocated == count) {
goto Done;
}
pool = pool->next;
} while (pool);
// 2. Not enough free QMD's in the pools' free lists. But let's
// take a closer look at the pools and recalim abandoned QMDs
// by destroyed owners.
pool = p;
do {
qmdFreeCompleted_underQmdLock(pool);
numQmdsAllocated += qmdAllocateFromPool_underQmdLock(pool, (count - numQmdsAllocated), outQmds + numQmdsAllocated);
if (numQmdsAllocated == count) {
goto Done;
}
pool = pool->next;
} while (pool);
// 3. Not enough abandoned QMD's. Let's allocate one/several
// new pool(s) of QMD's to allocate the remaining QMD's from.
// Note that resizing the existing QMD pool is another option,
// but a very difficult one, since we need to resize QMD
// device storage and its associated semaphore pool, both of
// which are largely in-use at this point.
do {
status = qmdPoolCreate(p->ctx, &pool);
if (CUDA_SUCCESS != status) {
// NOTE(review): on failure outQmds may hold a partial allocation;
// the caller is presumably responsible for deallocating — confirm.
goto Done;
}
// Add the new pool to the linked list of ctx's QMD pools
CUDA_LIST_INSERT_PREVNEXT(p->ctx->qmdPool, pool, prev, next);
// We should be able to allocate up to QMD_POOL_SIZE QMD's here
numQmdsAllocated += qmdAllocateFromPool_underQmdLock(pool, (count - numQmdsAllocated), outQmds + numQmdsAllocated);
if (numQmdsAllocated == count) {
goto Done;
}
} while (NV_TRUE);
Done:
cuiMutexUnlock(&ctx->qmdPoolsMutex);
return status;
}
// Invalidate SKED's QMD cache for the pool at qmdPoolVaddr, then synchronize
// the context so the invalidate has completed before returning.
CUresult qmdInvalidateSkedCache(CUctx *ctx, NvU64 qmdPoolVaddr)
{
    CUnvchannel *channel = NULL;
    CUnvCurrent *nvCurrent = NULL;
    CU_ASSERT(ctx);
    CU_ASSERT(qmdPoolVaddr);
    // Only GPUs supporting CNP have a cache in SKED that caches QMDs; for
    // the rest there's nothing to invalidate.
    if (!ctx->device->state.supportsCnp) {
        return CUDA_SUCCESS;
    }
    if (cuiGlobalsIsLegacyMpsServer()) {
        // The mps server may be invalidating the cache on a client's behalf.
        // Acquire a marker in the barrier stream so that the invalidate and
        // wait-for-idle pushed below execute only after all of the client's
        // work has finished — the server's barrier stream does not implicitly
        // synchronize with client work.
        mpsServerAcquirePendingWorkMarker(ctx);
    }
    streamBeginPush(ctx->streamManager, CU_CHANNEL_COMPUTE, ctx->barrierStream, &nvCurrent, &channel);
    nvCurrent = ctx->device->hal.invalSkedCache(nvCurrent, qmdPoolVaddr, ctx->qmdPool->poolSize);
    streamEndPush(ctx->barrierStream, nvCurrent, NULL);
    return cuiCtxSynchronize(ctx);
}
151

被折叠的 条评论
为什么被折叠?



