CUDA Event (CUSW_UNIT_EVENT) — architecture notes and code walkthrough
// Measure kernel execution time on the default stream using CUDA events.
// create two events
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// record start event on the default stream
cudaEventRecord(start);
// execute kernel
kernel<<<grid, block>>>(arguments);
// a kernel launch returns no error directly: catch launch-configuration
// errors here (execution errors surface at cudaEventSynchronize below)
if (cudaGetLastError() != cudaSuccess) {
    // handle the launch error (bad grid/block config, etc.)
}
// record stop event on the default stream
cudaEventRecord(stop);
// wait until the stop event completes
cudaEventSynchronize(stop);
// calculate the elapsed time (milliseconds) between the two events;
// named elapsedMs rather than `time` to avoid shadowing ::time()
float elapsedMs = 0.0f;
cudaEventElapsedTime(&elapsedMs, start, stop);
// clean up the two events
cudaEventDestroy(start);
cudaEventDestroy(stop);
Reference:
https://blog.csdn.net/k916631305/article/details/112802166
Static design
This section describes static aspects of the CUSW_UNIT_EVENT unit’s architecture.
Overview of software element architecture
CUSW_UNIT_EVENT implements CUDA event objects.
A CUDA event represents a position in a CUDA stream. Pointing the event at a particular position in a stream is known as recording the event. Inserting a wait into a stream on a previously recorded event is known as waiting for the event. An event cannot be waited upon before it has been recorded.
The record and wait can happen on two different streams. This allows dependencies to be created between streams.
CUDA events can also represent synchronization objects from other APIs. For such events the record does not happen on a CUDA stream. It happens when the other API sets up triggers for its corresponding synchronization object. The wait for such events can be inserted into CUDA streams as usual. These events thus allow efficient synchronization between CUDA and other APIs for interoperation.
CUDA events can do cross-process synchronization. In that case the record happens in one process while the wait happens in another process. This is achieved using inter-process communication (IPC).
The diagram below shows how CUSW_UNIT_EVENT relates to other units.

Abstractions
The CUevent struct represents a CUDA event. It wraps an underlying synchronization primitive along with flags and data to achieve record and wait operations.
A single-process event uses a marker object as its synchronization primitive.
When the event is recorded or created from a synchronization object provided by a non-CUDA API, a marker entry of appropriate type is added to the marker. See CUSW_UNIT_SYNC unit architecture for detailed description of marker and marker entries.
Interprocess events use shared semaphore memory instead of a marker object.
CUSW_UNIT_EVENT unit provides interface functions to * Create an event object. * Record an event into a stream. * Synchronize with the event i.e. wait till the event is marked complete and then return. * Query the event for its status — completed, not completed, error. * Destroy an event object and release resources allocated for the event.
Each function determines the type of underlying synchronization primitive and calls appropriate methods related to that primitive.
The following table captures the set of units that CUSW_UNIT_EVENT is dependent on, the functionality that it depends on those units for, and shared data (globals, configurations, etc.), if any.

Code
Definitions
/**
 * \brief Information about a synchronization point recorded by an event.
 * Not used for IPC events (those synchronize via shared semaphore memory).
 */
typedef struct CUIeventSyncPt_st {
//! protects all fields of this struct
CUImutex mutex;
//! context marker for the last recorded synchronization point
CUctxMarker* marker;
//! UVM DAG node for the last recorded synchronization point
CUIuvmDagNodeId uvmDagNodeId;
} CUIeventSyncPt;
/**
 * \brief Possible types of events.
 *
 * The EGL and NVN interop variants are only compiled in when the
 * corresponding support macro evaluates true.
 */
enum CUIeventType {
//! Standard CUDA event, recorded and consumed locally
CUI_EVENT_TYPE_LOCAL,
//! CUDA IPC event, created locally
CUI_EVENT_TYPE_IPC_LOCAL,
//! CUDA IPC event imported from another process
CUI_EVENT_TYPE_IPC_REMOTE
#if cuiIsEglSyncSupported()
,
//! CUDA event for EGL interop
CUI_EVENT_TYPE_EGL
#endif
#if cuiEventNvnSupported()
,
//! CUDA event for NVN interop
CUI_EVENT_TYPE_NVN
#endif
};
/**
 * Possible states for an event. Currently only used for
 * non-IPC, non-interop events (type CUI_EVENT_TYPE_LOCAL).
 */
enum CUIlocalEventState {
//! The event does not currently represent any synchronization point.
CUI_EVENT_EMPTY = 0,
//! The event contains a synchronization point recorded in a
//! non-capturing stream.
CUI_EVENT_RECORDED,
//! The event contains a synchronization point recorded in a
//! stream that is being captured into a cuda graph.
CUI_EVENT_CAPTURED,
//! The event is currently invalid and can only be rerecorded or
//! destroyed. This is used for captured events once the containing
//! graph goes away (e.g. due to ending capture). This is distinct
//! from EMPTY, which represents a valid event with an empty set of
//! things to synchronize.
CUI_EVENT_INVALID
};
//! A CUDA event object. Wraps the underlying synchronization primitive
//! (marker, IPC semaphore, EGLSync, or NVN sync data, depending on type)
//! along with flags and bookkeeping for record/wait operations.
struct CUevent_st {
//! owning context
CUctx *ctx;
//! linked list of all events belonging to the context
CUevent prev, next;
//! Id is unique per process for the process's lifetime
NvU64 eventID;
//! the type of this event; selects the active member of the union below
enum CUIeventType type;
//! type-specific data
union {
//! data for CUI_EVENT_TYPE_LOCAL
struct {
//! is this event capturing timing information?
NvBool recordTimingData;
//! does this event use blocking sync
NvBool useBlockingSync;
//! semaphore used for this event's timestamp (only allocated if recordTimingData is true)
CUsema *sema;
//! current status of this event
enum CUIlocalEventState state;
//! the last recorded synchronization point (valid once state is CUI_EVENT_RECORDED)
CUIeventSyncPt syncPt;
//! stream capture data
//! All fields except graph are protected as part of the capture using
//! cuiGraphLockForCapture(capture.graph).
struct {
//! the graph the event was recorded in
CUIgraph *graph;
//! the set of nodes in the graph this event waits on
cuiContigHashSet deps;
//! the previous event captured in the same graph
CUevent prev;
//! the next event captured in the same graph
CUevent next;
} capture;
} local;
//! data for CUI_EVENT_TYPE_IPC_LOCAL and CUI_EVENT_TYPE_IPC_REMOTE
struct {
//! the pool the semaphore is allocated from
CUIipcSemaphorePool *ipcPool;
//! information about the semaphore released by this event
volatile CUsemaDataFourWords *semaData;
// TODO do IPC events follow standard re-record semantics???
//! the payload of the last release
volatile NvU32 *lastIssuedPayload;
//! these fields are for CUI_EVENT_TYPE_IPC_LOCAL only
struct {
//! does this event use blocking sync
NvBool useBlockingSync;
//! the semaphore released by this event
CUsema *sema;
} local;
// IPC events are just always assumed to have been recorded, so no isRecorded field.
} ipc;
#if cuiIsEglSyncSupported()
//! data for CUI_EVENT_TYPE_EGL
struct {
//! does this event use blocking sync
NvBool useBlockingSync;
//! the EGLSync handle from which the event is created
EGLSyncKHR eglSync;
//! EGL state
EGLGlobalState *globalEglState;
} egl;
#endif
#if cuiEventNvnSupported()
//! data for CUI_EVENT_TYPE_NVN
struct {
//! does this event use blocking sync
NvBool useBlockingSync;
//! the NVN event to wait on
CUINVNSyncData *syncData;
//! the synchronization point represented by this event
CUIeventSyncPt syncPt;
} nvn;
#endif
} data;
};
Function
/**
 * \brief Init Check for Events
 *
 * \detailDescription cuiEventInitCheck function checks whether the hEvent is valid and
 * also checks if there are any errors on the context.
 *
 * \param[in] hEvent - the event on which to operate
 * \param[in] flags - optional flags include
 * CUI_EVENT_INIT_CHECK_ALLOW_CAPTURED,
 * CUI_EVENT_INIT_CHECK_ALLOW_INVALIDATED, and
 * CUI_EVENT_INIT_CHECK_DONT_INVALIDATE_IF_CAPTURED
 *
 * \return CUresult - the result of the function.
 *
 * \retval
 * CUDA_SUCCESS,
 * CUDA_ERROR_INVALID_HANDLE,
 * CUDA_ERROR_INVALID_CONTEXT,
 * CUDA_ERROR_CONTEXT_IS_DESTROYED
 *
 * \additionalNotes
 * cuiEventInitCheck function returns CUDA_ERROR_INVALID_HANDLE, if the hEvent is NULL.
 *
 * \implDetails
 * cuiEventInitCheck function checks whether the hEvent is valid. It then calls the function
 * cuiInitCheckCtx which returns an error if there are any sticky errors on the context.
 *
 * \endfn
 */
CUDA_TEST_EXPORT CUresult cuiEventInitCheck(CUevent hEvent, unsigned flags);
/**
 * \brief Create an Event
 *
 * \detailDescription cuiEventCreate function allocates and initializes the cuda
 * event based on the flags. This function expects that the ctx is valid.
 *
 *
 * \param[in] ctx - the context
 * \param[out] ppEvent - the event
 * \param[in] Flags - the event flag creation parameters
 *
 * \return CUresult - the result of the function.
 *
 * \retval
 * CUDA_SUCCESS,
 * CUDA_ERROR_OUT_OF_MEMORY
 *
 * \additionalNotes
 * cuiEventCreate function allocates pEvent->sema from pEvent->ipcPool, if the
 * pEvent->isIpc is true.
 * \additionalNotes
 * cuiEventCreate function sets the field pEvent->didRecord to true for ipc events.
 * \additionalNotes
 * For events which record timing data, cuiEventCreate function allocates the semaphore
 * pEvent->sema from ctx->semaphores and sets the payload with CU_EVENT_PAYLOAD_DELIVERED.
 *
 * \implDetails cuiEventCreate function allocates memory for the cuda event. It initializes
 * the event with the default values and other fields depending upon the Flags passed. For non
 * ipc events, it allocates a marker pEvent->syncPt.marker. For ipc events, it allocates
 * the sema and sets the last issued payload. It then
 * sets pEvent->eventID with a unique counter and adds the event to the event list.
 *
 * \endfn
 *
 */
CUDA_TEST_EXPORT CUresult cuiEventCreate(CUctx *ctx, CUevent *ppEvent, unsigned int Flags);
/**
 * \brief Create an ipc Event
 *
 * \detailDescription This function allocates memory for the ipc event and initializes it.
 *
 * \param[in] pool - the ipc semaphore pool
 * \param[in] eventHandle - the ipc event handle
 * \param[out] ppEvent - the event to be returned.
 *
 * \return CUresult - the result of the function
 *
 * \retval
 * CUDA_SUCCESS,
 * CUDA_ERROR_OUT_OF_MEMORY,
 *
 * \implDetails This function allocates the event and initializes it. It sets the
 * pEvent->didRecord to true. It calls the function cuiIpcGetLastIssuedPayloadPtr and sets
 * the pEvent->lastIssuedPayload. It also sets the event's semaData. It adds the event to
 * the context list.
 *
 * \endfn
 */
CUresult cuiEventIpcOpen(CUIipcSemaphorePool *pool, CUIipcEventHandle *eventHandle, CUevent *ppEvent);
/**
 * \brief Record the event
 *
 * \detailDescription cuiEventRecord function records the event on the stream. This
 * function expects the ctx to be valid.
 *
 * \param[in] ctx - the event's context
 * \param[in] pEvent - the event on which to operate
 * \param[in] pStream - the stream on which to record.
 *
 * \return CUresult - the result of the function
 *
 * \retval
 * CUDA_SUCCESS
 * CUDA_ERROR_OUT_OF_MEMORY, (cuiStreamGetDependencyMarkerOfNextWork)
 * CUDA_ERROR_LAUNCH_FAILED,
 * CUDA_ERROR_LAUNCH_TIMEOUT,
 * CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
 * CUDA_ERROR_ILLEGAL_ADDRESS,
 * CUDA_ERROR_ECC_UNCORRECTABLE,
 * CUDA_ERROR_MAP_FAILED,
 * CUDA_ERROR_UNMAP_FAILED,
 * CUDA_ERROR_NOT_INITIALIZED
 * CUDA_ERROR_DEINITIALIZED,
 *
 * \additionalNotes
 * For ipc events, cuiEventRecord function does a semaphore Acquire on the address
 * devVaddr got from cuiIpcGetDevVaddr and payload value event->lastIssuedPayload+1 before
 * doing a semaphore Release for this event
 *
 *
 * \implDetails cuiEventRecord function first gets the channel on which we will push
 * methods.
 *
 * For an mps client and ipc event, it calls semaphoreIncrementAndReleaseMPS.
 * For ipc events, it starts a new push on the channel, does a semaphore acquire on the
 * event->semaData and a release of the semaphore on compute or memcpy depending on the
 * channel type.
 * For blocking sync events, streamBeginPush is called, then channelPushBlockingSyncAwaken
 * which pushes a blocking sync, and the push is ended by calling streamEndPush.
 * For record timing data events, it does a streamBeginPush. It calls semaphoreReleaseAsyncMemcpy or semaphoreReleaseCompute depending on the channel type. It calls streamEndPush
 * to end the push.
 * For default events, it checks whether the stream is null and calls the streamManagerUpdateNullStreamBarrier function.
 *
 * For all non-ipc events, it pushes the event->syncPt node in the uvm graph.
 * It copies the marker to track the completion of work and sets the event->didRecord to true.
 *
 * \endfn
 */
CUDA_TEST_EXPORT CUresult cuiEventRecord(CUctx *ctx, CUevent pEvent, CUIstream *pStream);
/**
 * \brief Destroy the event
 *
 * \detailDescription cuiEventDestroy function destroys or frees all the internal
 * resources of the event, i.e. marker, ipc semaphore, semaphore for timing, nvrm sync
 * for egl sync, and then destroys the event.
 *
 * \param[in] pEvent - the event to be destroyed.
 * \additionalNotes
 * cuiEventDestroy function frees the memory for the pEvent.
 * \additionalNotes
 * Abandon the semaphore if the event is ipc and the isIpcRemote is false.
 *
 * \implDetails cuiEventDestroy function removes the event from the list of events in the
 * context. If the event is created from eglSync, then it calls the cuiEglStateDestroy. If the
 * event is non-ipc and not used for eglSync, then it destroys the event's marker
 * pEvent->syncPt.marker through ctxMarkerDestroy and abandons the semaphore pEvent->sema.
 * It then frees the pEvent.
 *
 * \endfn
 */
CUDA_TEST_EXPORT CUresult cuiEventDestroy(CUevent pEvent);
/**
 *
 * \brief Query the event
 *
 * \detailDescription cuiEventQuery_r function queries for the current status of the
 * event. It returns whether it is ready or not.
 *
 * \param[in] pEvent - The event to be queried.
 * \param[out] uvmNodeIdSnapshot_out - The uvm dag node id which shall be returned
 *
 * \return CUresult - the result of the function
 *
 * \retval
 * CUDA_SUCCESS,
 * CUDA_ERROR_NOT_READY
 *
 * \additionalNotes
 * cuiEventQuery_r function takes the lock syncPt.mutex before doing any read operation on the event->syncPt
 * except for events created from eglSync.
 * \additionalNotes
 * For events created from eglSync, the EGL export API is called to get the latest nvrmSync object.
 * This operation results in dynamic allocation which may impact the performance.
 * It would get fixed in future releases. It queries the nvrmSync object and closes it.
 * \additionalNotes
 * For ipc events, returns CUDA_ERROR_NOT_READY, if the event's lastIssuedPayload is not equal to the event's payload.
 * \additionalNotes
 * For non-ipc events, the function returns CUDA_ERROR_NOT_READY when the markerStatus is not
 * equal to CU_CTX_MARKER_COMPLETED_BY_GPU.
 *
 * \implDetails
 * cuiEventQuery_r function checks whether isIpc is true or not. For a non-ipc event not used for eglSync,
 * it updates the uvmNodeIdSnapshot_out with the event->syncPt.uvmDagNodeId.
 * According to the status of markerStatus, it returns CUDA_ERROR_NOT_READY or CUDA_SUCCESS.
 *
 * \endfn
 */
CUDA_TEST_EXPORT CUresult cuiEventQuery_r(CUevent pEvent, NvU64 *uvmNodeIdSnapshot_out);
/**
 * \brief Synchronize the event
 *
 * \detailDescription cuiEventSynchronize_r function blocks till the event has completed. If the event is ipc, then
 * it spins in a loop till the event's payload is greater than or equal to the last issued payload. If the event is non-ipc,
 * then it blocks till the event->syncPt.marker's marker status is CU_CTX_MARKER_COMPLETED_BY_GPU.
 *
 * \param[in] pEvent - The event to be synchronized
 * \param[out] uvmNodeIdSnapshot_out The uvm dag node id in uvm dag which shall be returned
 *
 * \return CUresult - the result of the function
 *
 * \retval
 * CUDA_SUCCESS
 * CUDA_ERROR_OUT_OF_MEMORY, (ctxMarkerCopy)
 * CUDA_ERROR_LAUNCH_FAILED, (ctxMarkerWait,ctxMarkerGetStatus)
 * CUDA_ERROR_LAUNCH_TIMEOUT,
 * CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
 * CUDA_ERROR_ILLEGAL_ADDRESS,
 * CUDA_ERROR_ECC_UNCORRECTABLE,
 * CUDA_ERROR_MAP_FAILED,
 * CUDA_ERROR_UNMAP_FAILED,
 * CUDA_ERROR_NOT_INITIALIZED
 * CUDA_ERROR_DEINITIALIZED,
 * CUDA_ERROR_UNKNOWN.
 *
 * \implDetails cuiEventSynchronize_r function first checks whether the event is
 * created from eglSync. Such events do not have a marker associated with them.
 * This function calls into EGL's export API to get the latest nvrmSync object.
 * It then performs a wait operation depending on the flag 'useBlockingSync' set at the
 * time of event creation and closes the nvrmSync object.
 * Next the function checks whether the event is interprocess.
 * For ipc events it loops till the event's payload
 * event->semaData->payload is greater than or equal to the lastIssuedPayload. For non-ipc
 * events not used for eglSync, it first allocates a local marker variable, then updates the
 * uvmNodeIdSnapshot_out with the uvmDagNodeId of the event's syncpoint and gets the
 * markerStatus; if the markerStatus is not equal to CU_CTX_MARKER_COMPLETED_BY_GPU, then
 * it updates the local marker with event->syncPt.marker and does a wait on the local
 * marker. It then de-initializes the marker and returns the status.
 *
 *
 * \endfn
 */
CUDA_TEST_EXPORT CUresult cuiEventSynchronize_r(CUevent pEvent, NvU64 *uvmNodeIdSnapshot_out);
/**
 * \brief Notify User Synchronized
 *
 * \detailDescription cuiEventNotifyUserSynchronized function notifies the uvm that the
 * event has completed. It also changes the memory object ownership.
 *
 * \param[in] pEvent - the event to operate on.
 * \param[in] uvmNodeId - the uvm node id which has to be updated.
 *
 * \return
 * CUresult - the result of the function.
 *
 * \retval
 * CUDA_SUCCESS
 *
 * \retval CUDA_ERROR_OUT_OF_MEMORY,(cuiUvmDagSynchronizeNode)
 * \retval CUDA_ERROR_OPERATING_SYSTEM,
 * \retval CUDA_ERROR_UNKNOWN
 *
 * \implDetails cuiEventNotifyUserSynchronized function calls the dmal function
 * deviceMemoryChangeExternalAccess which changes the memory object ownership and flushes
 * the cache. It then calls the function cuiUvmDagSynchronizeNode which updates the uvm
 * dag that the event has completed.
 *
 * \endfn
 */
CUresult cuiEventNotifyUserSynchronized(CUevent pEvent, CUIuvmDagNodeId uvmNodeId);
/**
 * \brief Invalidate a captured event
 *
 * \detailDescription cuiEventInvalidateCaptured marks a captured event invalid, clears
 * its capture state, and removes it from the capturing graph's event list. It is an
 * error to call this on an event that is not of type CUI_EVENT_TYPE_LOCAL with state
 * CUI_EVENT_CAPTURED.
 *
 * \param[in] pEvent - the event to invalidate
 *
 * \endfn
 */
void cuiEventInvalidateCaptured(CUevent pEvent);
cuiEventCreate — detailed implementation
/**
 * Allocate and initialize a CUDA event (see the cuiEventCreate declaration
 * for the full contract). A local event gets a context marker, a sync-point
 * mutex and — unless timing is disabled — a timing semaphore; an IPC event
 * gets a semaphore from the shared IPC pool plus host-visible payload storage.
 */
CUresult
cuiEventCreate(CUctx *ctx, CUevent *ppEvent, unsigned int Flags)
{
CUresult status = CUDA_SUCCESS;
CUevent pEvent = NULL;
NvBool mutexInitialized = NV_FALSE;
CU_ASSERT(ctx);
CU_TRACE_FUNCTION();
pEvent = (CUevent)malloc(sizeof(*pEvent));
if (!pEvent) {
CU_DEBUG_PRINT(("Memory allocation failed\n"));
return CUDA_ERROR_OUT_OF_MEMORY;
}
// zero-fill so the Error path below can tell which fields were allocated
cuosMemset(pEvent, 0, sizeof(*pEvent));
pEvent->ctx = ctx;
if (!(Flags & CU_EVENT_INTERPROCESS)) {
// local (non-IPC) event
pEvent->type = CUI_EVENT_TYPE_LOCAL;
pEvent->data.local.useBlockingSync = (Flags & CU_EVENT_BLOCKING_SYNC) != 0;
// timing is on by default and opted out of via CU_EVENT_DISABLE_TIMING
pEvent->data.local.recordTimingData = (Flags & CU_EVENT_DISABLE_TIMING) == 0;
status = ctxMarkerCreate(&pEvent->data.local.syncPt.marker, ctx->channelManager);
if (CUDA_SUCCESS != status) {
goto Error;
}
cuiMutexInitialize(&pEvent->data.local.syncPt.mutex, CUI_MUTEX_ORDER_CU_EVENT, CUI_MUTEX_DEFAULT);
mutexInitialized = NV_TRUE;
if (pEvent->data.local.recordTimingData) {
// timing semaphore comes from the context's pool
status = semaphoreAlloc(ctx->semaphores, &pEvent->data.local.sema);
if (CUDA_SUCCESS != status) {
goto Error;
}
// initialize as delivered so that the first get-fresh-storage call is a NOP
semaphoreSetPayload(pEvent->data.local.sema, CU_EVENT_PAYLOAD_DELIVERED);
}
// empty dependency set for stream capture
cuiContigHashSetInit(&pEvent->data.local.capture.deps, sizeof(CUIgraphNode *), 0);
}
else {
// interprocess event: semaphore lives in a pool shared across processes
pEvent->type = CUI_EVENT_TYPE_IPC_LOCAL;
pEvent->data.ipc.local.useBlockingSync = (Flags & CU_EVENT_BLOCKING_SYNC) != 0;
status = cuiIpcGetLocalPool(ctx, &pEvent->data.ipc.ipcPool);
if (CUDA_SUCCESS != status) {
CU_DEBUG_PRINT(("Failed to create/get local IPC semaphore pool\n"));
goto Error;
}
// Get a semaphore from our special pool
status = semaphoreAlloc(pEvent->data.ipc.ipcPool->semaphores, &pEvent->data.ipc.local.sema);
if (CUDA_SUCCESS != status) {
goto Error;
}
// Store the shared memory area for host side coordination
pEvent->data.ipc.lastIssuedPayload = cuiIpcGetLastIssuedPayloadPtr(
pEvent->data.ipc.ipcPool,
semaphoreGetIndex(pEvent->data.ipc.local.sema));
// we'll never need new storage, so store our device Vaddr for later use
pEvent->data.ipc.semaData = semaphoreGetSemaDataFourWords(pEvent->data.ipc.local.sema);
// Initialize our payload (this is also the last issued payload)
*pEvent->data.ipc.lastIssuedPayload = CU_EVENT_PAYLOAD_INITIALIZED;
pEvent->data.ipc.semaData->payload = CU_EVENT_PAYLOAD_INITIALIZED;
}
// eventID is unique per process for the process's lifetime
pEvent->eventID = cuosInterlockedIncrement64(&globals.eventIdCounter);
// add to the context list
CUDA_LIST_INSERT(ctx->pEventList, pEvent);
toolsNotifyEventCreated(pEvent, ctx);
*ppEvent = pEvent;
return CUDA_SUCCESS;
Error:
// Unwind any partial initialization; the memset above zeroed all fields,
// so non-NULL pointers identify exactly what was allocated.
if (pEvent) {
switch (pEvent->type) {
case CUI_EVENT_TYPE_LOCAL:
if (mutexInitialized) {
cuiMutexDeinitialize(&pEvent->data.local.syncPt.mutex);
}
if (pEvent->data.local.syncPt.marker) {
ctxMarkerDestroy(pEvent->data.local.syncPt.marker);
pEvent->data.local.syncPt.marker = NULL;
}
if (pEvent->data.local.sema) {
semaphoreFree(pEvent->data.local.sema);
pEvent->data.local.sema = NULL;
}
break;
case CUI_EVENT_TYPE_IPC_LOCAL:
if (pEvent->data.ipc.local.sema) {
semaphoreFree(pEvent->data.ipc.local.sema);
pEvent->data.ipc.local.sema = NULL;
}
break;
default:
// other types are never created by this function
CU_ASSERT(0);
break;
}
free(pEvent);
}
return status;
}
cuiEventRecord
The call stack is as follows:
cuiEventRecord
cuiStreamGetChannelOfLastWork
streamBeginPushOnChannel
streamEndPush
The detailed code is as follows:
/**
 * Record \p event at the current position of \p stream.
 * Handles, in order: stream capture (event becomes a set of graph deps),
 * detaching a previously captured event, legacy-MPS IPC (release forwarded
 * to the server), GPU method pushes for timing/IPC/blocking-sync/NVN events,
 * the NULL-stream barrier case, and finally the marker + UVM-DAG update for
 * all non-IPC events.
 */
CUresult
cuiEventRecord(CUctx *ctx, CUevent event, CUIstream *stream)
{
CUresult status = CUDA_SUCCESS;
CUnvchannel* channel = NULL;
// methods must be pushed on a channel only for timing, IPC, NVN, or
// blocking-sync-without-NvRmSync events
NvBool needsPush = cuiEventSupportsTiming(event) ||
cuiEventIsIpc(event) ||
(cuiEventUsesBlockingSync(event) &&
!ctx->channelManager->usesNvRmSyncForBlockingWait) ||
cuiEventIsNvn(event);
CU_TRACE_FUNCTION();
CU_ASSERT(ctx);
CU_ASSERT(!stream->isBarrierStream);
CU_ASSERT(cuiEventCanRecord(event));
// Recording into a capturing stream: the event becomes a set of
// dependencies on the capture graph's next-push nodes.
if (cuiStreamIsCapturing(stream)) {
CU_ASSERT(event->type == CUI_EVENT_TYPE_LOCAL);
if (event->data.local.state == CUI_EVENT_CAPTURED) {
// Note that this is potentially a different lock than the one taken
// immediately after the if, in the case the event is being rerecorded
// to a new graph.
cuiGraphLockForCapture(event->data.local.capture.graph);
CUDA_LIST_REMOVE_PREVNEXT(event->data.local.capture.graph->capture.eventsHead,
event, data.local.capture.prev, data.local.capture.next);
cuiGraphUnlockForCapture(event->data.local.capture.graph);
event->data.local.capture.graph = NULL;
}
cuiGraphLockForCapture(stream->capture.graph);
status = cuiContigHashSetClearAndInsertAll(&event->data.local.capture.deps, &stream->capture.nextPushDeps);
if (status == CUDA_SUCCESS) {
CUIgraph *graph = stream->capture.graph;
event->data.local.state = CUI_EVENT_CAPTURED;
event->data.local.capture.graph = graph;
CUDA_LIST_INSERT_PREVNEXT(graph->capture.eventsHead, event, data.local.capture.prev, data.local.capture.next);
}
else {
// failed to record the dependency set; the event can only be
// rerecorded or destroyed from here
event->data.local.state = CUI_EVENT_INVALID;
}
cuiGraphUnlockForCapture(stream->capture.graph);
return status;
}
// Re-recording a previously captured event into a non-capturing stream:
// detach it from the old graph first.
else if (event->type == CUI_EVENT_TYPE_LOCAL && event->data.local.state == CUI_EVENT_CAPTURED) {
cuiGraphLockForCapture(event->data.local.capture.graph);
CUDA_LIST_REMOVE_PREVNEXT(event->data.local.capture.graph->capture.eventsHead,
event, data.local.capture.prev, data.local.capture.next);
cuiGraphUnlockForCapture(event->data.local.capture.graph);
event->data.local.capture.graph = NULL;
cuiContigHashSetClear(&event->data.local.capture.deps);
event->data.local.state = CUI_EVENT_EMPTY;
}
if (needsPush) {
// Select the channel on which we'll push methods (if needed).
// Start with the channel of last work to avoid impacting compute/memcpy overlap.
channel = cuiStreamGetChannelOfLastWork(stream);
// If no work has been submitted in the stream yet, use the channel manager to
// select a compute channel.
if (!channel) {
channel = channelManagerSelectChannelForPush(stream->ctx->channelManager, CU_CHANNEL_COMPUTE, stream);
}
}
// IPC semaphore releases must be handled on the server
if (cuiGlobalsIsLegacyMpsClient() && cuiEventIsIpc(event)) {
CU_ASSERT(needsPush);
CUmemobj *memobj = event->data.ipc.ipcPool->memobj;
// pass offsets relative to the memblock's host base pointer
semaphoreIncrementAndReleaseMPS(
channel,
stream,
memobjGetMemblock(memobj),
PTR2UINT(event->data.ipc.semaData) - PTR2UINT(memobjGetBlockHostPtr(memobj)),
PTR2UINT(event->data.ipc.lastIssuedPayload) - PTR2UINT(memobjGetBlockHostPtr(memobj)));
return CUDA_SUCCESS;
}
// push the semaphore and/or interrupt if needed
if (needsPush) {
CUnvCurrent* nvCurrent = NULL;
CUmemTrackListNode emptyNode = memTrackListNodeCreate(NULL, 0);
CUmemTrackList emptyList = memTrackListCreate(&emptyNode);
// If timing is supported, this is also the timing sema
CUsema *eventSema = cuiEventGetSema(event);
// get a new backing offset and initialize its data
if (cuiEventSupportsTiming(event)) {
CU_ASSERT(eventSema);
semaphoreRefreshStorage(eventSema, CU_EVENT_PAYLOAD_DELIVERED);
semaphoreSetPayload(eventSema, CU_EVENT_PAYLOAD_INITIALIZED);
}
streamBeginPushOnChannel(channel, stream, &nvCurrent);
// push the semaphore release
if (cuiEventSupportsTiming(event) || cuiEventIsIpc(event)) {
NvU64 devVaddr = 0;
NvU32 payload = CU_EVENT_PAYLOAD_DELIVERED;
if (cuiEventIsIpc(event)) {
devVaddr = cuiIpcGetDevVaddr(ctx, event->data.ipc.ipcPool, PTR2UINT(event->data.ipc.semaData));
payload = (NvU32) cuosInterlockedIncrement((volatile unsigned int *) event->data.ipc.lastIssuedPayload);
// There is a race between getNextPayload and pushing the semaphoreRelease
// push a semaphoreAcquire so we don't outrun a release in a different process
nvCurrent = ctx->device->hal.semaphoreAcquire(
nvCurrent,
0, 0,
ctx,
devVaddr,
NvOffset_LO32(payload - 1),
CUI_SEMAPHORE_ACQUIRE_FLAGS_NONE);
}
else {
devVaddr = semaphoreGetOffset(eventSema);
}
// release on the engine matching the channel type
if (channelIsCompute(channel)) {
nvCurrent = ctx->device->hal.semaphoreReleaseCompute(
nvCurrent,
devVaddr,
payload,
0);
}
else {
nvCurrent = ctx->device->hal.semaphoreReleaseAsyncMemcpy(
nvCurrent,
devVaddr,
payload,
0);
}
}
#if cuiEventNvnSupported()
if (event->type == CUI_EVENT_TYPE_NVN) {
nvCurrent = cuiNVNSemaphoreRelease(channel,nvCurrent, event);
}
#endif
// push the interrupt generating method
if (cuiEventUsesBlockingSync(event) &&
!ctx->channelManager->usesNvRmSyncForBlockingWait)
{
channelPushBlockingSyncAwaken(channel, &nvCurrent);
}
streamEndPush(stream, nvCurrent, emptyList);
}
else {
// make sure that event records in the NULL stream act as barriers,
// even if the event record didn't do any work
if (stream->isNullStream) {
cuiStreamLock(stream);
status = streamManagerUpdateNullStreamBarrier(ctx->streamManager);
cuiStreamUnlock(stream);
if (status != CUDA_SUCCESS) {
return status;
}
}
}
// interprocess events don't need (and can't use) a marker
if (!cuiEventIsIpc(event)) {
CUIuvmDagNodeId oldNodeId;
CUIeventSyncPt *syncPt = cuiEventGetSyncPt(event);
CU_ASSERT(syncPt);
cuiMutexLock(&syncPt->mutex);
oldNodeId = syncPt->uvmDagNodeId;
// push a node in the UVM graph
status = cuiUvmDagEventRecord(stream, &syncPt->uvmDagNodeId);
if (status == CUDA_SUCCESS) {
// copy the marker to track completion of the work
status = cuiStreamGetDependencyMarkerOfNextWork(syncPt->marker, stream);
if (status != CUDA_SUCCESS) {
// keep sync point update transactional
syncPt->uvmDagNodeId = oldNodeId;
}
}
cuiMutexUnlock(&syncPt->mutex);
if (status != CUDA_SUCCESS) {
goto Error;
}
}
if (event->type == CUI_EVENT_TYPE_LOCAL) {
event->data.local.state = CUI_EVENT_RECORDED;
}
Error:
return status;
}
cuiEventSynchronize_r
/**
 * Block until \p event has completed. EGL events delegate to
 * cuiEglEventSynchronize; IPC events spin (optionally yielding) until the
 * shared payload catches up with the last issued payload; all other events
 * wait on a snapshot of the event's context marker so the event's own lock
 * is not held while waiting.
 */
CUresult
cuiEventSynchronize_r(CUevent event, NvU64 *uvmNodeIdSnapshot_out)
{
CUctx *ctx = event->ctx;
CUresult status = CUDA_SUCCESS;
CUctxMarker marker;
NvBool markerInitialized = NV_FALSE;
CU_TRACE_FUNCTION();
#if cuiIsEglSyncSupported()
if (event->type == CUI_EVENT_TYPE_EGL) {
status = cuiEglEventSynchronize(event);
goto Done;
}
#endif
if (cuiEventIsIpc(event)) {
// IPC events have no marker: spin until the shared payload reaches
// the last issued payload.
NvBool shouldYield = cuiCtxShouldYieldInSpinLoops(ctx);
NvU32 lastIssuedPayload = *event->data.ipc.lastIssuedPayload;
while (1) {
NvU32 payload = CUI_MODS_MEM_RD32(&event->data.ipc.semaData->payload);
readBarrierIfWeakilyConsistent();
if (lastIssuedPayload <= payload) {
break;
}
if (shouldYield) {
cuosThreadYield();
}
cuosPauseInstruction();
}
return CUDA_SUCCESS;
}
else {
CUctxMarkerStatus markerStatus = CU_CTX_MARKER_METHODS_NOT_PUSHED;
CUIeventSyncPt *syncPt = cuiEventGetSyncPt(event);
NvBool useBlockingSync = cuiEventUsesBlockingSync(event);
CUctxMarkerWaitBehavior markerWaitBehaviour;
CU_ASSERT(syncPt);
// local marker to hold a snapshot of the event's marker
ctxMarkerInitialize(&marker, ctx->channelManager);
markerInitialized = NV_TRUE;
#if cuiEventNvnSupported()
if (cuiEventIsNvn(event)) {
cuiNVNSyncDataUpdate(ctx, event);
}
#endif
cuiMutexLock(&syncPt->mutex);
{
if (uvmNodeIdSnapshot_out) {
*uvmNodeIdSnapshot_out = syncPt->uvmDagNodeId;
}
// query the event's marker
status = ctxMarkerGetStatus(&markerStatus, syncPt->marker, CU_CTX_MARKER_FLAGS_NONE);
if (CUDA_SUCCESS != status) {
CU_ERROR_PRINT(("Failed to get marker status!\n"));
}
// if the event isn't done yet, take a snapshot of its marker, so we can wait on the
// snapshot in this thread without holding the lock of the event (in case another thread
// wants to poll on the event while we sync on it)
if (CUDA_SUCCESS == status) {
if (CU_CTX_MARKER_COMPLETED_BY_GPU != markerStatus) {
status = ctxMarkerCopy(&marker, syncPt->marker);
if (CUDA_SUCCESS != status) {
CU_ERROR_PRINT(("Failed to copy the marker!\n"));
}
}
}
}
cuiMutexUnlock(&syncPt->mutex);
if (status != CUDA_SUCCESS) {
goto Done;
}
// wait on the snapshot
if (CU_CTX_MARKER_COMPLETED_BY_GPU != markerStatus) {
// For blocking sync we pushed AWAKEN during recording of event
if (useBlockingSync) {
markerWaitBehaviour = CU_CTX_MARKER_WAIT_BLOCKING_SYNC;
} else {
markerWaitBehaviour = ctxMarkerShouldYieldInSpinLoops(ctx, NULL);
}
status = ctxMarkerWait(&marker, markerWaitBehaviour);
}
// Give syscalls a chance to flush out any accumulated data
// while kernels can still be running asynchronously on the device.
cuiSyscallSwTrapHandler(ctx, CU_SYSCALL_SW_TRAP_ASYNCHRONOUS);
}
Done:
if (markerInitialized) {
ctxMarkerDeinitialize(&marker);
}
return status;
}
cuiEventNotifyUserSynchronized
/**
 * The user has synchronized on \p pEvent: notify dmal so it can change the
 * memory object ownership (and flush the cache), then mark the event's UVM
 * DAG node as synchronized.
 */
CUresult
cuiEventNotifyUserSynchronized(CUevent pEvent, CUIuvmDagNodeId uvmNodeId)
{
    CUctx *ctx = pEvent->ctx;
    CUresult status;

    CU_TRACE_FUNCTION();
    CU_ASSERT(ctx);

    // Since the event has completed, notify dmal first...
    status = ctx->device->dmal.deviceMemoryChangeExternalAccess(ctx, NV_TRUE);
    if (CUDA_SUCCESS != status) {
        return status;
    }
    // ...then notify UVM that the node has completed.
    return cuiUvmDagSynchronizeNode(uvmNodeId);
}
1528

Folded comments
Why were these comments folded?



