CUDA系列-Event-9

// Minimal GPU timing pattern using CUDA events on the default stream.
// NOTE(review): every call below returns cudaError_t; production code should
// check each return value (and call cudaGetLastError() after the launch).
// create two events
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// record start event on the default stream
cudaEventRecord(start);
// execute kernel (asynchronous; bracketed by the two events on the same stream)
kernel<<<grid, block>>>(arguments);
// record stop event on the default stream
cudaEventRecord(stop);
// wait until the stop event completes (blocks the host thread)
cudaEventSynchronize(stop);
// calculate the elapsed time between the two events, in milliseconds
float time;
cudaEventElapsedTime(&time, start, stop);
// clean up the two events
cudaEventDestroy(start);
cudaEventDestroy(stop);

参考:
https://blog.csdn.net/k916631305/article/details/112802166


Static design

This section describes static aspects of the CUSW_UNIT_EVENT unit’s architecture.

Overview of software element architecture

CUSW_UNIT_EVENT implements CUDA event objects.

A CUDA event represents a position in a CUDA stream. Pointing the event at a particular position in a stream is known as recording the event. Inserting a wait into a stream on a previously recorded event is known as waiting for the event. An event cannot be waited upon before it has been recorded.

The record and wait can happen on two different streams. This allows dependencies to be created between streams.

CUDA events can also represent synchronization objects from other APIs. For such events the record does not happen on a CUDA stream. It happens when the other API sets up triggers for its corresponding synchronization object. The wait for such events can be inserted into CUDA streams as usual. These events thus allow efficient synchronization between CUDA and other APIs for interoperation.

CUDA events can do cross-process synchronization. In that case the record happens in one process while the wait happens in another process. This is achieved using inter-process communication (IPC).

The diagram below shows how CUSW_UNIT_EVENT relates to other units.
(figure omitted in this extract: relationship of CUSW_UNIT_EVENT to other units)

Abstractions

The CUevent struct represents a CUDA event. It wraps an underlying synchronization primitive along with flags and data to achieve record and wait operations.

A single-process event uses a marker object as its synchronization primitive.

When the event is recorded or created from a synchronization object provided by a non-CUDA API, a marker entry of appropriate type is added to the marker. See CUSW_UNIT_SYNC unit architecture for detailed description of marker and marker entries.

Interprocess events use shared semaphore memory instead of a marker object.

CUSW_UNIT_EVENT unit provides interface functions to:
* Create an event object.
* Record an event into a stream.
* Synchronize with the event, i.e. wait until the event is marked complete and then return.
* Query the event for its status — completed, not completed, or error.
* Destroy an event object and release the resources allocated for the event.

Each function determines the type of underlying synchronization primitive and calls appropriate methods related to that primitive.

The following table captures the set of units that CUSW_UNIT_EVENT is dependent on, the functionality that it depends on those units for, and shared data (globals, configurations, etc), if any.
(table omitted in this extract: dependency units, functionality depended on, and shared data)

Code

定义

/**
 * \brief Information about a synchronization point recorded by an event.
 * Not used for IPC events (those use shared semaphore memory instead).
 */
typedef struct CUIeventSyncPt_st {
    //! protects all fields of this struct
    CUImutex mutex;

    //! marker tracking GPU completion of the last recorded synchronization point
    CUctxMarker* marker;

    //! UVM DAG node for the last recorded synchronization point
    CUIuvmDagNodeId uvmDagNodeId;
} CUIeventSyncPt;

/**
 * \brief Possible types of events
 */
/**
 * \brief Possible types of events.
 * The interop enumerators are conditionally compiled, so the set of valid
 * values (and the numeric value of each interop enumerator) depends on the
 * build configuration.
 */
enum CUIeventType {
    //! Standard CUDA event, recorded and consumed locally
    CUI_EVENT_TYPE_LOCAL,
    //! CUDA IPC event, created locally
    CUI_EVENT_TYPE_IPC_LOCAL,
    //! CUDA IPC event imported from another process
    CUI_EVENT_TYPE_IPC_REMOTE
#if cuiIsEglSyncSupported()
    ,
    //! CUDA event for EGL interop
    CUI_EVENT_TYPE_EGL
#endif
#if cuiEventNvnSupported()
    ,
    //! CUDA event for NVN interop
    CUI_EVENT_TYPE_NVN
#endif
};


/**
 * Possible states for an event. Currently only used for
 * non-IPC, non-interop events (CUI_EVENT_TYPE_LOCAL).
 */
enum CUIlocalEventState {
    //! The event does not currently represent any synchronization point.
    CUI_EVENT_EMPTY = 0,
    //! The event contains a synchronization point recorded in a
    //! non-capturing stream.
    CUI_EVENT_RECORDED,
    //! The event contains a synchronization point recorded in a
    //! stream that is being captured into a cuda graph.
    CUI_EVENT_CAPTURED,
    //! The event is currently invalid and can only be rerecorded or
    //! destroyed. This is used for captured events once the containing
    //! graph goes away (e.g. due to ending capture). This is distinct
    //! from EMPTY, which represents a valid event with an empty set of
    //! things to synchronize.
    CUI_EVENT_INVALID
};

//! A CUDA event object. The union member that is valid is selected by `type`.
struct CUevent_st {
    //! owning context
    CUctx *ctx;
    //! linked list of all events belonging to the context
    CUevent prev, next;

    //! Id is unique per process for the process's lifetime
    NvU64 eventID;

    //! the type of this event; selects the valid member of `data` below
    enum CUIeventType type;

    //! type-specific data
    union {
        //! data for CUI_EVENT_TYPE_LOCAL
        struct {
            //! is this event capturing timing information?
            NvBool recordTimingData;
            //! does this event use blocking sync
            NvBool useBlockingSync;
            //! semaphore used for this event's timestamp (only allocated if recordTimingData is true)
            CUsema *sema;
            //! current status of this event
            enum CUIlocalEventState state;
            //! the last recorded synchronization point
            //! (meaningful once state is CUI_EVENT_RECORDED — TODO confirm)
            CUIeventSyncPt syncPt;
            //! stream capture data
            //! All fields except graph are protected as part of the capture using
            //! cuiGraphLockForCapture(capture.graph).
            struct {
                //! the graph the event was recorded in
                CUIgraph *graph;
                //! the set of nodes in the graph this event waits on
                cuiContigHashSet deps;
                //! the previous event captured in the same graph
                CUevent prev;
                //! the next event captured in the same graph
                CUevent next;
            } capture;
        } local;
        //! data for CUI_EVENT_TYPE_IPC_LOCAL and CUI_EVENT_TYPE_IPC_REMOTE
        struct {
            //! the pool the semaphore is allocated from
            CUIipcSemaphorePool *ipcPool;
            //! information about the semaphore released by this event
            volatile CUsemaDataFourWords *semaData;
            // TODO do IPC events follow standard re-record semantics???
            //! the payload of the last release
            volatile NvU32 *lastIssuedPayload;
            //! these fields are for CUI_EVENT_TYPE_IPC_LOCAL only
            struct {
                //! does this event use blocking sync
                NvBool useBlockingSync;
                //! the semaphore released by this event
                CUsema *sema;
            } local;
            // IPC events are just always assumed to have been recorded, so no isRecorded field.
        } ipc;
#if cuiIsEglSyncSupported()
        //! data for CUI_EVENT_TYPE_EGL
        struct {
            //! does this event use blocking sync
            NvBool useBlockingSync;
            //! the EGLSync handle from which the event is created
            EGLSyncKHR eglSync;
            //! EGL state
            EGLGlobalState *globalEglState;
        } egl;
#endif
#if cuiEventNvnSupported()
        //! data for CUI_EVENT_TYPE_NVN
        struct {
            //! does this event use blocking sync
            NvBool useBlockingSync;
            //! the NVN event to wait on
            CUINVNSyncData *syncData;
            //! the synchronization point represented by this event
            CUIeventSyncPt syncPt;
        } nvn;
#endif
    } data;
};

Function

/**
 * \brief Init Check for Events
 *
 * \detailDescription cuiEventInitCheck function checks whether the hEvent is valid and
 * also checks if there are any errors on the context.
 *
 * \param[in] hEvent - the event on which to operate
 * \param[in] flags - optional flags include
 *                    CUI_EVENT_INIT_CHECK_ALLOW_CAPTURED,
 *                    CUI_EVENT_INIT_CHECK_ALLOW_INVALIDATED, and
 *                    CUI_EVENT_INIT_CHECK_DONT_INVALIDATE_IF_CAPTURED
 *
 * \return CUresult - the result of the function.
 *
 * \retval
 * CUDA_SUCCESS,
 * CUDA_ERROR_INVALID_HANDLE,
 * CUDA_ERROR_INVALID_CONTEXT,
 * CUDA_ERROR_CONTEXT_IS_DESTROYED
 *
 * \additionalNotes
 * cuiEventInitCheck function returns CUDA_ERROR_INVALID_HANDLE if the hEvent is NULL.
 *
 * \implDetails
 * cuiEventInitCheck function checks whether the hEvent is valid. It then calls the
 * function cuiInitCheckCtx which returns an error if there are any sticky errors on
 * the context.
 *
 * \endfn
 */

CUDA_TEST_EXPORT CUresult cuiEventInitCheck(CUevent hEvent, unsigned flags);
/**
 * \brief Create an Event
 *
 * \detailDescription cuiEventCreate function allocates and initializes the cuda
 * event based on the flags. This function expects that the ctx is valid.
 *
 *
 * \param[in] ctx - the context
 * \param[in] Flags - the event creation flags
 * \param[out] ppEvent - the event
 *
 * \return CUresult - the result of the function.
 *
 * \retval
 * CUDA_SUCCESS,
 * CUDA_ERROR_OUT_OF_MEMORY
 *
 * \additionalNotes
 * For interprocess events, cuiEventCreate function allocates the event's semaphore
 * from the local IPC semaphore pool.
 * NOTE(review): older notes here referenced fields pEvent->isIpc and
 * pEvent->didRecord, which do not appear in struct CUevent_st — verify against
 * the current implementation (the event type is tracked via pEvent->type).
 * \additionalNotes
 * For events which record timing data, cuiEventCreate function allocates the semaphore
 * pEvent->sema from ctx->semaphores and sets the payload with CU_EVENT_PAYLOAD_DELIVERED.
 *
 * \implDetails cuiEventCreate function allocates memory for the cuda event. It initializes
 * the event with default values and other fields depending upon the Flags passed. For non
 * ipc events, it allocates a marker pEvent->syncPt.marker. For ipc events, it allocates
 * the sema and sets the last issued payload. It then
 * sets pEvent->eventID with a unique counter and adds the event to the event list.
 *
 * \endfn
 *
 */
CUDA_TEST_EXPORT CUresult cuiEventCreate(CUctx *ctx, CUevent *ppEvent, unsigned int Flags);
/**
 * \brief Create an ipc Event
 *
 * \detailDescription This function allocates memory for the ipc event and initializes it.
 *
 * \param[in] pool - the ipc semaphore pool
 * \param[in] eventHandle - the ipc event handle
 * \param[out] ppEvent - the event to be returned.
 *
 * \return CUresult - the result of the function
 *
 * \retval
 * CUDA_SUCCESS,
 * CUDA_ERROR_OUT_OF_MEMORY,
 *
 * \implDetails This function allocates the event and initializes it. It marks the
 * event as recorded (IPC events are always considered recorded). It calls the function
 * cuiIpcGetLastIssuedPayloadPtr and sets the pEvent->lastIssuedPayload. It also sets
 * the event's semaData. It adds the event to the context list.
 *
 * \endfn
 */

CUresult cuiEventIpcOpen(CUIipcSemaphorePool *pool, CUIipcEventHandle *eventHandle, CUevent *ppEvent);
/**
 * \brief Record the event
 *
 * \detailDescription cuiEventRecord function records the event on the stream. This
 * function expects the ctx to be valid.
 *
 * \param[in] ctx - the event's context
 * \param[in] pEvent - the event on which to operate
 * \param[in] pStream - the stream on which to record.
 *
 * \return CUresult - the result of the function
 *
 * \retval
 * CUDA_SUCCESS
 * CUDA_ERROR_OUT_OF_MEMORY, (cuiStreamGetDependencyMarkerOfNextWork)
 * CUDA_ERROR_LAUNCH_FAILED,
 * CUDA_ERROR_LAUNCH_TIMEOUT,
 * CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
 * CUDA_ERROR_ILLEGAL_ADDRESS,
 * CUDA_ERROR_ECC_UNCORRECTABLE,
 * CUDA_ERROR_MAP_FAILED,
 * CUDA_ERROR_UNMAP_FAILED,
 * CUDA_ERROR_NOT_INITIALIZED
 * CUDA_ERROR_DEINITIALIZED,
 *
 * \additionalNotes
 * For an ipc event, cuiEventRecord function does a semaphore Acquire on the address
 * devVaddr obtained from cuiIpcGetDevVaddr with payload value
 * event->lastIssuedPayload+1 before doing a semaphore Release for this event.
 *
 *
 * \implDetails cuiEventRecord function first gets the channel on which we will push
 * methods.
 *
 * For an mps client recording an ipc event, it calls semaphoreIncrementAndReleaseMPS.
 * For ipc events, it starts a new push on the channel, does a semaphore acquire on the
 * event->semaData and releases the semaphore on compute or memcpy depending on the
 * channel type.
 * For blocking sync events, streamBeginPush is called, then channelPushBlockingSyncAwaken
 * which pushes a blocking sync, and it ends the push by calling streamEndPush.
 * For record timing data events, it does a streamBeginPush. It calls
 * semaphoreReleaseAsyncMemcpy or semaphoreReleaseCompute depending on the channel type.
 * It calls streamEndPush to end the push.
 * For default events, it checks whether the stream is null and calls the
 * streamManagerUpdateNullStreamBarrier function.
 *
 * For all non-ipc events, it pushes the event->syncPt node in the uvm graph.
 * It copies the marker to track the completion of work and sets the local event
 * state to CUI_EVENT_RECORDED.
 *
 * \endfn
 */
CUDA_TEST_EXPORT CUresult cuiEventRecord(CUctx *ctx, CUevent pEvent, CUIstream *pStream);
/**
 * \brief Destroy the event
 *
 * \detailDescription cuiEventDestroy function destroys or frees all the internals of the
 * event, i.e. marker, ipc semaphore, semaphore for timing, nvrm sync for egl sync, and
 * destroys the event.
 *
 * \param[in] pEvent - the event to be destroyed.
 * \additionalNotes
 * cuiEventDestroy function frees the memory for the pEvent.
 * \additionalNotes
 * Abandons the semaphore if the event is ipc and was created locally (not imported
 * from a remote process).
 *
 * \implDetails cuiEventDestroy function removes the event from the list of events in the
 *  context. If the event is created from eglSync, then it calls cuiEglStateDestroy. If the
 *  event is non-ipc and not used for eglSync, then it destroys the event's marker
 *  pEvent->syncPt.marker through ctxMarkerDestroy and abandons the semaphore pEvent->sema.
 *  It then frees the pEvent.
 *
 * \endfn
 */
CUDA_TEST_EXPORT CUresult cuiEventDestroy(CUevent pEvent);
/**
 *
 * \brief Query the event
 *
 * \detailDescription cuiEventQuery_r function queries for the current status of the
 * event. It returns whether it is ready or not.
 *
 * \param[in] pEvent - The event to be queried.
 * \param[out] uvmNodeIdSnapshot_out - The uvm dag node id which shall be returned
 *
 * \return CUresult - the result of the function
 *
 * \retval
 * CUDA_SUCCESS,
 * CUDA_ERROR_NOT_READY
 *
 * \additionalNotes
 * cuiEventQuery_r function takes the lock syncPt.mutex before doing any read operation
 * on the event->syncPt, except for events created from eglSync.
 * \additionalNotes
 * For events created from eglSync, the EGL export API is called to get the latest nvrmSync object.
 * This operation results in dynamic allocation which may impact the performance.
 * It would get fixed in future releases. It queries the nvrmSync object and closes it.
 * \additionalNotes
 * For ipc events, returns CUDA_ERROR_NOT_READY if the event's lastIssuedPayload is not equal to the event's payload.
 * \additionalNotes
 * For non-ipc events, the function returns CUDA_ERROR_NOT_READY when the markerStatus is not
 * equal to CU_CTX_MARKER_COMPLETED_BY_GPU.
 *
 * \implDetails
 * cuiEventQuery_r function checks whether the event is ipc or not. For a non-ipc event not
 * used for eglSync, it updates the uvmNodeIdSnapshot_out with the event->syncPt.uvmDagNodeId.
 * According to the status of markerStatus, it returns CUDA_ERROR_NOT_READY or CUDA_SUCCESS.
 *
 * \endfn
 */

CUDA_TEST_EXPORT CUresult cuiEventQuery_r(CUevent pEvent, NvU64 *uvmNodeIdSnapshot_out);
/**
 * \brief Synchronize the event
 *
 * \detailDescription cuiEventSynchronize_r function blocks till the event has completed. If the event is ipc, then
 *  it spin loops till the event's payload is greater than or equal to the last issued payload. If the event is non ipc,
 *  then it blocks till the event->syncPt.marker's marker status is CU_CTX_MARKER_COMPLETED_BY_GPU.
 *
 * \param[in] pEvent -  The event to be synchronized
 * \param[out] uvmNodeIdSnapshot_out  The uvm dag node id in uvm dag which shall be returned
 *
 * \return CUresult - the result of the function
 *
 * \retval
 * CUDA_SUCCESS
 * CUDA_ERROR_OUT_OF_MEMORY, (ctxMarkerCopy)
 * CUDA_ERROR_LAUNCH_FAILED, (ctxMarkerWait,ctxMarkerGetStatus)
 * CUDA_ERROR_LAUNCH_TIMEOUT,
 * CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
 * CUDA_ERROR_ILLEGAL_ADDRESS,
 * CUDA_ERROR_ECC_UNCORRECTABLE,
 * CUDA_ERROR_MAP_FAILED,
 * CUDA_ERROR_UNMAP_FAILED,
 * CUDA_ERROR_NOT_INITIALIZED
 * CUDA_ERROR_DEINITIALIZED,
 * CUDA_ERROR_UNKNOWN.
 *
 * \implDetails cuiEventSynchronize_r function first checks whether the event is
 * created from eglSync. Such events do not have a marker associated with them.
 * This function calls into EGL's export API to get the latest nvrmSync object.
 * It then performs a wait operation depending on the flag 'useBlockingSync' set at the
 * time of event creation and closes the nvrmSync object.
 * Next the function checks whether the event is interprocess.
 * For ipc events it loops till the event's payload
 * event->semaData->payload is greater than or equal to the lastIssuedPayload. For non ipc
 * events not used for eglsync, it first allocates a local marker variable, then updates the
 * uvmNodeIdSnapshot_out with the uvmDagNodeId of the event's syncpoint and gets the
 * markerStatus; if the markerStatus is not equal to CU_CTX_MARKER_COMPLETED_BY_GPU, then
 * it updates the local marker with event->syncPt.marker and does a wait on the local
 * marker. It then de-initializes the marker and returns the status.
 *
 *
 * \endfn
 */
CUDA_TEST_EXPORT CUresult cuiEventSynchronize_r(CUevent pEvent, NvU64 *uvmNodeIdSnapshot_out);
/**
 * \brief Notify User Synchronized
 *
 * \detailDescription cuiEventNotifyUserSynchronized function notifies the uvm that the
 * event has completed. It also changes the memory object ownership.
 *
 * \param[in] pEvent - the event to operate on.
 * \param[in] uvmNodeId - the uvm node id which has to be updated.
 *
 * \return
 * CUresult - the result of the function.
 *
 * \retval
 * CUDA_SUCCESS
 *
 * \retval CUDA_ERROR_OUT_OF_MEMORY,(cuiUvmDagSynchronizeNode)
 * \retval CUDA_ERROR_OPERATING_SYSTEM,
 * \retval CUDA_ERROR_UNKNOWN
 *
 * \implDetails cuiEventNotifyUserSynchronized function calls the dmal function
 * deviceMemoryChangeExternalAccess which changes the memory object ownership and flushes
 * the cache. It then calls the function cuiUvmDagSynchronizeNode which updates the uvm
 * dag that the event has completed.
 *
 * \endfn
 */
CUresult cuiEventNotifyUserSynchronized(CUevent pEvent, CUIuvmDagNodeId uvmNodeId);

/**
 * \brief Invalidate a captured event
 *
 * \detailDescription cuiEventInvalidateCaptured marks a captured event invalid, clears
 * its capture state, and removes it from the capturing graph's event list. It is an
 * error to call this on an event that is not of type CUI_EVENT_TYPE_LOCAL with state
 * CUI_EVENT_CAPTURED.
 *
 * \param[in] pEvent - the event to invalidate
 *
 * \endfn
 */
void cuiEventInvalidateCaptured(CUevent pEvent);

create详细

/**
 * Allocate and initialize a CUDA event for \p ctx according to \p Flags.
 * Returns the new event via \p ppEvent; on failure everything allocated so
 * far is released and an error is returned.
 */
CUresult
cuiEventCreate(CUctx *ctx, CUevent *ppEvent, unsigned int Flags)
{
    CUresult status = CUDA_SUCCESS;
    CUevent pEvent = NULL;
    // tracks whether syncPt.mutex was initialized, so the Error path only
    // deinitializes it if needed
    NvBool mutexInitialized = NV_FALSE;

    CU_ASSERT(ctx);
    CU_TRACE_FUNCTION();

    pEvent = (CUevent)malloc(sizeof(*pEvent));
    if (!pEvent) {
        CU_DEBUG_PRINT(("Memory allocation failed\n"));
        return CUDA_ERROR_OUT_OF_MEMORY;
    }
    // zero-fill so all pointers start NULL; the Error cleanup path relies on
    // this to decide what needs freeing
    cuosMemset(pEvent, 0, sizeof(*pEvent));
    pEvent->ctx = ctx;

    if (!(Flags & CU_EVENT_INTERPROCESS)) {
        pEvent->type = CUI_EVENT_TYPE_LOCAL;
        pEvent->data.local.useBlockingSync = (Flags & CU_EVENT_BLOCKING_SYNC) != 0;
        // timing is on by default; CU_EVENT_DISABLE_TIMING turns it off
        pEvent->data.local.recordTimingData = (Flags & CU_EVENT_DISABLE_TIMING) == 0;

        status = ctxMarkerCreate(&pEvent->data.local.syncPt.marker, ctx->channelManager);
        if (CUDA_SUCCESS != status) {
            goto Error;
        }

        cuiMutexInitialize(&pEvent->data.local.syncPt.mutex, CUI_MUTEX_ORDER_CU_EVENT, CUI_MUTEX_DEFAULT);
        mutexInitialized = NV_TRUE;

        // timing events additionally need a semaphore to hold the timestamp
        if (pEvent->data.local.recordTimingData) {
            status = semaphoreAlloc(ctx->semaphores, &pEvent->data.local.sema);
            if (CUDA_SUCCESS != status) {
                goto Error;
            }
            // initialize as delivered so that the first get-fresh-storage call is a NOP
            semaphoreSetPayload(pEvent->data.local.sema, CU_EVENT_PAYLOAD_DELIVERED);
        }

        cuiContigHashSetInit(&pEvent->data.local.capture.deps, sizeof(CUIgraphNode *), 0);
    }
    else {
        pEvent->type = CUI_EVENT_TYPE_IPC_LOCAL;
        pEvent->data.ipc.local.useBlockingSync = (Flags & CU_EVENT_BLOCKING_SYNC) != 0;

        status = cuiIpcGetLocalPool(ctx, &pEvent->data.ipc.ipcPool);
        if (CUDA_SUCCESS != status) {
            CU_DEBUG_PRINT(("Failed to create/get local IPC semaphore pool\n"));
            goto Error;
        }

        // Get a semaphore from our special pool
        status = semaphoreAlloc(pEvent->data.ipc.ipcPool->semaphores, &pEvent->data.ipc.local.sema);
        if (CUDA_SUCCESS != status) {
            goto Error;
        }

        // Store the shared memory area for host side coordination
        pEvent->data.ipc.lastIssuedPayload = cuiIpcGetLastIssuedPayloadPtr(
            pEvent->data.ipc.ipcPool,
            semaphoreGetIndex(pEvent->data.ipc.local.sema));

        // we'll never need new storage, so store our device Vaddr for later use
        pEvent->data.ipc.semaData = semaphoreGetSemaDataFourWords(pEvent->data.ipc.local.sema);

        // Initialize our payload (this is also the last issued payload)
        *pEvent->data.ipc.lastIssuedPayload = CU_EVENT_PAYLOAD_INITIALIZED;
        pEvent->data.ipc.semaData->payload = CU_EVENT_PAYLOAD_INITIALIZED;
    }

    // unique per process for the process's lifetime (see CUevent_st.eventID)
    pEvent->eventID = cuosInterlockedIncrement64(&globals.eventIdCounter);

    // add to the context list
    CUDA_LIST_INSERT(ctx->pEventList, pEvent);

    toolsNotifyEventCreated(pEvent, ctx);

    *ppEvent = pEvent;
    return CUDA_SUCCESS;

Error:

    // cleanup relies on the zero-fill above: pointers never allocated are NULL
    if (pEvent) {
        switch (pEvent->type) {
            case CUI_EVENT_TYPE_LOCAL:
                if (mutexInitialized) {
                    cuiMutexDeinitialize(&pEvent->data.local.syncPt.mutex);
                }
                if (pEvent->data.local.syncPt.marker) {
                    ctxMarkerDestroy(pEvent->data.local.syncPt.marker);
                    pEvent->data.local.syncPt.marker = NULL;
                }
                if (pEvent->data.local.sema) {
                    semaphoreFree(pEvent->data.local.sema);
                    pEvent->data.local.sema = NULL;
                }
                break;
            case CUI_EVENT_TYPE_IPC_LOCAL:
                if (pEvent->data.ipc.local.sema) {
                    semaphoreFree(pEvent->data.ipc.local.sema);
                    pEvent->data.ipc.local.sema = NULL;
                }
                break;
            default:
                // only LOCAL and IPC_LOCAL can be created by this function
                CU_ASSERT(0);
                break;
        }
        free(pEvent);
    }
    return status;
}

cuiEventRecord

调用栈如下:

cuiEventRecord
	cuiStreamGetChannelOfLastWork
	streamBeginPushOnChannel
	streamEndPush
	

详细代码如下:


/**
 * Record \p event at the current position of \p stream.
 *
 * Handles, in order:
 *  1. a capturing stream: the record becomes part of the captured graph;
 *  2. an event previously captured into a graph while the stream is NOT
 *     capturing: detach it from the old graph, then record normally;
 *  3. events that need GPU methods pushed (timing, IPC, NVN interop, or
 *     blocking-sync awaken): push them on a channel;
 *  4. non-IPC events: update the event's sync point (UVM DAG node + marker)
 *     under the sync point mutex.
 */
CUresult
cuiEventRecord(CUctx *ctx, CUevent event, CUIstream *stream)
{
    CUresult status = CUDA_SUCCESS;
    CUnvchannel* channel = NULL;
    // methods must be pushed for timing, IPC and NVN events, and for
    // blocking-sync events when the awaken interrupt is not handled via NvRmSync
    NvBool needsPush = cuiEventSupportsTiming(event) ||
                       cuiEventIsIpc(event) ||
                       (cuiEventUsesBlockingSync(event) &&
                        !ctx->channelManager->usesNvRmSyncForBlockingWait) ||
                       cuiEventIsNvn(event);

    CU_TRACE_FUNCTION();
    CU_ASSERT(ctx);
    CU_ASSERT(!stream->isBarrierStream);
    CU_ASSERT(cuiEventCanRecord(event));

    if (cuiStreamIsCapturing(stream)) {
        // only plain local events may be recorded during stream capture
        CU_ASSERT(event->type == CUI_EVENT_TYPE_LOCAL);
        if (event->data.local.state == CUI_EVENT_CAPTURED) {
            // Note that this is potentially a different lock than the one taken
            // immediately after the if, in the case the event is being rerecorded
            // to a new graph.
            cuiGraphLockForCapture(event->data.local.capture.graph);
            CUDA_LIST_REMOVE_PREVNEXT(event->data.local.capture.graph->capture.eventsHead,
                    event, data.local.capture.prev, data.local.capture.next);
            cuiGraphUnlockForCapture(event->data.local.capture.graph);
            event->data.local.capture.graph = NULL;
        }
        cuiGraphLockForCapture(stream->capture.graph);
        // the event's dependency set becomes the stream's pending dependencies
        status = cuiContigHashSetClearAndInsertAll(&event->data.local.capture.deps, &stream->capture.nextPushDeps);
        if (status == CUDA_SUCCESS) {
            CUIgraph *graph = stream->capture.graph;
            event->data.local.state = CUI_EVENT_CAPTURED;
            event->data.local.capture.graph = graph;
            CUDA_LIST_INSERT_PREVNEXT(graph->capture.eventsHead, event, data.local.capture.prev, data.local.capture.next);
        }
        else {
            // failed mid-update: the dependency set is unreliable, so poison the event
            event->data.local.state = CUI_EVENT_INVALID;
        }
        cuiGraphUnlockForCapture(stream->capture.graph);
        return status;
    }
    else if (event->type == CUI_EVENT_TYPE_LOCAL && event->data.local.state == CUI_EVENT_CAPTURED) {
        // re-recording a captured event outside capture: detach it from its graph
        cuiGraphLockForCapture(event->data.local.capture.graph);
        CUDA_LIST_REMOVE_PREVNEXT(event->data.local.capture.graph->capture.eventsHead,
                event, data.local.capture.prev, data.local.capture.next);
        cuiGraphUnlockForCapture(event->data.local.capture.graph);
        event->data.local.capture.graph = NULL;
        cuiContigHashSetClear(&event->data.local.capture.deps);
        event->data.local.state = CUI_EVENT_EMPTY;
    }

    if (needsPush) {
        // Select the channel on which we'll push methods (if needed).
        // Start with the channel of last work to avoid impacting compute/memcpy overlap.
        channel = cuiStreamGetChannelOfLastWork(stream);

        // If no work has been submitted in the stream yet, use the channel manager to
        // select a compute channel.
        if (!channel) {
            channel = channelManagerSelectChannelForPush(stream->ctx->channelManager, CU_CHANNEL_COMPUTE, stream);
        }
    }

    // IPC semaphore releases must be handled on the server
    if (cuiGlobalsIsLegacyMpsClient() && cuiEventIsIpc(event)) {
        CU_ASSERT(needsPush);
        CUmemobj *memobj = event->data.ipc.ipcPool->memobj;
        // pass offsets relative to the memblock's host base so the server can
        // locate the shared semaphore data
        semaphoreIncrementAndReleaseMPS(
            channel,
            stream,
            memobjGetMemblock(memobj),
            PTR2UINT(event->data.ipc.semaData) - PTR2UINT(memobjGetBlockHostPtr(memobj)),
            PTR2UINT(event->data.ipc.lastIssuedPayload) - PTR2UINT(memobjGetBlockHostPtr(memobj)));
        return CUDA_SUCCESS;
    }

    // push the semaphore and/or interrupt if needed
    if (needsPush) {
        CUnvCurrent* nvCurrent = NULL;
        CUmemTrackListNode emptyNode = memTrackListNodeCreate(NULL, 0);
        CUmemTrackList     emptyList = memTrackListCreate(&emptyNode);
        // If timing is supported, this is also the timing sema
        CUsema *eventSema = cuiEventGetSema(event);

        // get a new backing offset and initialize its data
        if (cuiEventSupportsTiming(event)) {
            CU_ASSERT(eventSema);
            semaphoreRefreshStorage(eventSema, CU_EVENT_PAYLOAD_DELIVERED);
            semaphoreSetPayload(eventSema, CU_EVENT_PAYLOAD_INITIALIZED);
        }

        streamBeginPushOnChannel(channel, stream, &nvCurrent);
        // push the semaphore release
        if (cuiEventSupportsTiming(event) || cuiEventIsIpc(event)) {
            NvU64 devVaddr = 0;
            NvU32 payload = CU_EVENT_PAYLOAD_DELIVERED;
            if (cuiEventIsIpc(event)) {
                devVaddr = cuiIpcGetDevVaddr(ctx, event->data.ipc.ipcPool, PTR2UINT(event->data.ipc.semaData));
                payload = (NvU32) cuosInterlockedIncrement((volatile unsigned int *) event->data.ipc.lastIssuedPayload);

                // There is a race between getNextPayload and pushing the semaphoreRelease
                // push a semaphoreAcquire so we don't outrun a release in a different process
                nvCurrent = ctx->device->hal.semaphoreAcquire(
                    nvCurrent,
                    0, 0,
                    ctx,
                    devVaddr,
                    NvOffset_LO32(payload - 1),
                    CUI_SEMAPHORE_ACQUIRE_FLAGS_NONE);
            }
            else {
                devVaddr = semaphoreGetOffset(eventSema);
            }
            // release on the engine matching the channel type
            if (channelIsCompute(channel)) {
                nvCurrent = ctx->device->hal.semaphoreReleaseCompute(
                    nvCurrent,
                    devVaddr,
                    payload,
                    0);
            }
            else {
                nvCurrent = ctx->device->hal.semaphoreReleaseAsyncMemcpy(
                    nvCurrent,
                    devVaddr,
                    payload,
                    0);
            }
        }
        #if cuiEventNvnSupported()
        if (event->type == CUI_EVENT_TYPE_NVN) {
           nvCurrent = cuiNVNSemaphoreRelease(channel,nvCurrent, event);
        }
        #endif
        // push the interrupt generating method
        if (cuiEventUsesBlockingSync(event) &&
            !ctx->channelManager->usesNvRmSyncForBlockingWait)
        {
            channelPushBlockingSyncAwaken(channel, &nvCurrent);
        }
        streamEndPush(stream, nvCurrent, emptyList);
    }
    else {
        // make sure that event records in the NULL stream act as barriers,
        // even if the event record didn't do any work
        if (stream->isNullStream) {
            cuiStreamLock(stream);
            status = streamManagerUpdateNullStreamBarrier(ctx->streamManager);
            cuiStreamUnlock(stream);
            if (status != CUDA_SUCCESS) {
                return status;
            }
        }

    }

    // interprocess events don't need (and can't use) a marker
    if (!cuiEventIsIpc(event)) {
        CUIuvmDagNodeId oldNodeId;
        CUIeventSyncPt *syncPt = cuiEventGetSyncPt(event);

        CU_ASSERT(syncPt);

        cuiMutexLock(&syncPt->mutex);
        oldNodeId = syncPt->uvmDagNodeId;
        // push a node in the UVM graph
        status = cuiUvmDagEventRecord(stream, &syncPt->uvmDagNodeId);
        if (status == CUDA_SUCCESS) {
            // copy the marker to track completion of the work
            status = cuiStreamGetDependencyMarkerOfNextWork(syncPt->marker, stream);
            if (status != CUDA_SUCCESS) {
                // keep sync point update transactional
                syncPt->uvmDagNodeId = oldNodeId;
            }
        }
        cuiMutexUnlock(&syncPt->mutex);

        if (status != CUDA_SUCCESS) {
            goto Error;
        }
    }

    // the event now holds a valid synchronization point
    if (event->type == CUI_EVENT_TYPE_LOCAL) {
        event->data.local.state = CUI_EVENT_RECORDED;
    }

Error:

    return status;
}

cuiEventSynchronize_r


/**
 * Block the calling host thread until \p event has completed.
 *
 * EGL events delegate to cuiEglEventSynchronize. IPC events spin on the
 * shared semaphore payload. All other events wait on a snapshot of the
 * event's marker, taken under the sync point mutex so other threads can
 * still poll the event while this thread waits.
 *
 * If \p uvmNodeIdSnapshot_out is non-NULL it receives the event's UVM DAG
 * node id (non-IPC, non-EGL events only).
 */
CUresult
cuiEventSynchronize_r(CUevent event, NvU64 *uvmNodeIdSnapshot_out)
{
    CUctx *ctx = event->ctx;
    CUresult status = CUDA_SUCCESS;
    // local snapshot of the event's marker, waited on without holding the lock
    CUctxMarker marker;
    NvBool markerInitialized = NV_FALSE;

    CU_TRACE_FUNCTION();

#if cuiIsEglSyncSupported()
    if (event->type == CUI_EVENT_TYPE_EGL) {
        status = cuiEglEventSynchronize(event);
        goto Done;
    }
#endif

    if (cuiEventIsIpc(event)) {
        NvBool shouldYield = cuiCtxShouldYieldInSpinLoops(ctx);
        NvU32 lastIssuedPayload = *event->data.ipc.lastIssuedPayload;
        // spin until the released payload catches up with the last issued one
        while (1) {
            NvU32 payload = CUI_MODS_MEM_RD32(&event->data.ipc.semaData->payload);
            readBarrierIfWeakilyConsistent();
            if (lastIssuedPayload <= payload) {
                break;
            }
            if (shouldYield) {
                cuosThreadYield();
            }
            cuosPauseInstruction();
        }
        return CUDA_SUCCESS;
    }
    else {
        CUctxMarkerStatus markerStatus = CU_CTX_MARKER_METHODS_NOT_PUSHED;
        CUIeventSyncPt *syncPt = cuiEventGetSyncPt(event);
        NvBool useBlockingSync = cuiEventUsesBlockingSync(event);
        CUctxMarkerWaitBehavior markerWaitBehaviour;

        CU_ASSERT(syncPt);

        ctxMarkerInitialize(&marker, ctx->channelManager);
        markerInitialized = NV_TRUE;

        #if cuiEventNvnSupported()
        if (cuiEventIsNvn(event)) {
            cuiNVNSyncDataUpdate(ctx, event);
        }
        #endif

        cuiMutexLock(&syncPt->mutex);
        {
            if (uvmNodeIdSnapshot_out) {
                *uvmNodeIdSnapshot_out = syncPt->uvmDagNodeId;
            }

            // query the event's marker
            status = ctxMarkerGetStatus(&markerStatus, syncPt->marker, CU_CTX_MARKER_FLAGS_NONE);
            if (CUDA_SUCCESS != status) {
                CU_ERROR_PRINT(("Failed to get marker status!\n"));
            }

            // if the event isn't done yet, take a snapshot of its marker, so we can wait on the
            // snapshot in this thread without holding the lock of the event (in case another thread
            // wants to poll on the event while we sync on it)
            if (CUDA_SUCCESS == status) {
                if (CU_CTX_MARKER_COMPLETED_BY_GPU != markerStatus) {
                    status = ctxMarkerCopy(&marker, syncPt->marker);
                    if (CUDA_SUCCESS != status) {
                        CU_ERROR_PRINT(("Failed to copy the marker!\n"));
                    }
                }
            }
        }
        cuiMutexUnlock(&syncPt->mutex);
        if (status != CUDA_SUCCESS) {
            goto Done;
        }

        // wait on the snapshot
        if (CU_CTX_MARKER_COMPLETED_BY_GPU != markerStatus) {
            // For blocking sync we pushed AWAKEN during recording of event
            if (useBlockingSync) {
                markerWaitBehaviour = CU_CTX_MARKER_WAIT_BLOCKING_SYNC;
            } else {
                markerWaitBehaviour = ctxMarkerShouldYieldInSpinLoops(ctx, NULL);
            }
            status = ctxMarkerWait(&marker, markerWaitBehaviour);
        }

        // Give syscalls a chance to flush out any accumulated data
        // while kernels can still be running asynchronously on the device.
        cuiSyscallSwTrapHandler(ctx, CU_SYSCALL_SW_TRAP_ASYNCHRONOUS);
    }

Done:
    if (markerInitialized) {
        ctxMarkerDeinitialize(&marker);
    }
    return status;
}

cuiEventNotifyUserSynchronized

/**
 * Notify interested subsystems that the user has synchronized with \p pEvent.
 *
 * First asks the device's dmal layer to change external memory access
 * (deviceMemoryChangeExternalAccess), then marks \p uvmNodeId complete in
 * the UVM DAG (cuiUvmDagSynchronizeNode). The second step is skipped when
 * the first fails, and the first failing status is returned.
 */
CUresult
cuiEventNotifyUserSynchronized(CUevent pEvent, CUIuvmDagNodeId uvmNodeId)
{
    CUctx *eventCtx = pEvent->ctx;

    CU_TRACE_FUNCTION();
    CU_ASSERT(eventCtx);

    // Since the event has completed, notify dmal first.
    CUresult result = eventCtx->device->dmal.deviceMemoryChangeExternalAccess(eventCtx, NV_TRUE);
    if (CUDA_SUCCESS == result) {
        // Then tell UVM the corresponding DAG node is complete.
        result = cuiUvmDagSynchronizeNode(uvmNodeId);
    }
    return result;
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值