这里写目录标题
// create two events
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
// record start event on the default stream
cudaEventRecord(start);
// execute kernel
kernel<<<grid, block>>>(arguments);
// record stop event on the default stream
cudaEventRecord(stop);
// wait until the stop event completes
cudaEventSynchronize(stop);
// calculate the elapsed time between two events
float time;
cudaEventElapsedTime(&time, start, stop);
// clean up the two events
cudaEventDestroy(start);
cudaEventDestroy(stop);
参考:
https://blog.csdn.net/k916631305/article/details/112802166
Static design
This section describes static aspects of the CUSW_UNIT_EVENT unit’s architecture.
Overview of software element architecture
CUSW_UNIT_EVENT implements CUDA event objects.
A CUDA event represents a position in a CUDA stream. Pointing at a particular position in stream is known as recording the event. Inserting a wait into a stream on a previously recorded event is known as waiting for the event. An event cannot be waited upon before it has been recorded.
The record and wait can happen on two different streams. This allows dependencies to be created between streams.
CUDA events can also represent synchronization objects from other APIs. For such events the record does not happen on a CUDA stream. It happens when the other API sets up triggers for its corresponding synchronization object. The wait for such events can be inserted into CUDA streams as usual. These events thus allow efficient synchronization between CUDA and other APIs for interoperation.
CUDA events can do cross-process synchronization. In that case the record happens in one process while the wait happens in another process. This is achieved using inter-process communication (IPC).
The diagram below shows how CUSW_UNIT_EVENT relates to other units.
Abstractions
The CUevent struct represents a CUDA event. It wraps an underlying synchronization primitive along with flags and data to achieve record and wait operations.
A single-process event uses a marker object as its synchronization primitive.
When the event is recorded or created from a synchronization object provided by a non-CUDA API, a marker entry of appropriate type is added to the marker. See CUSW_UNIT_SYNC unit architecture for detailed description of marker and marker entries.
Interprocess events use shared semaphore memory instead of a marker object.
CUSW_UNIT_EVENT unit provides interface functions to * Create an event object. * Record an event into a stream. * Synchronize with the event i.e. wait till the event is marked complete and then return. * Query the event for its status — completed, not completed, error. * Destroy an event object and release resources allocated for the event.
Each function determines the type of underlying synchronization primitive and calls appropriate methods related to that primitive.
The following table captures the set of units that CUISYNC is dependent on, the functionality that it depends on those units for, and shared data (globals, configurations, etc), if any.
Code
定义
/**
* \brief Information about a synchronization point recorded by an event.
* Not used for IPC events.
*/
typedef struct CUIeventSyncPt_st {
//! protects all fields
CUImutex mutex;
//! the last recorded synchronization point
CUctxMarker* marker;
//! UVM DAG node for the last recorded synchronization point
CUIuvmDagNodeId uvmDagNodeId;
} CUIeventSyncPt;
/**
* \brief Possible types of events
*/
enum CUIeventType {
//! Standard CUDA event, recorded and consumed locally
CUI_EVENT_TYPE_LOCAL,
//! CUDA IPC event, created locally
CUI_EVENT_TYPE_IPC_LOCAL,
//! CUDA IPC event imported from another process
CUI_EVENT_TYPE_IPC_REMOTE
#if cuiIsEglSyncSupported()
,
//! CUDA event for EGL interop
CUI_EVENT_TYPE_EGL
#endif
#if cuiEventNvnSupported()
,
//! CUDA event for NVN interop
CUI_EVENT_TYPE_NVN
#endif
};
/**
* Possible states for an event. Currently only used for
* non-IPC, non-interop events.
*/
enum CUIlocalEventState {
//! The event does not currently represent any synchronization point.
CUI_EVENT_EMPTY = 0,
//! The event contains a synchronization point recorded in a
//! non-capturing stream.
CUI_EVENT_RECORDED,
//! The event contains a synchronization point recorded in a
//! stream that is being captured into a cuda graph.
CUI_EVENT_CAPTURED,
//! The event is currently invalid and can only be rerecorded or
//! destroyed. This is used for captured events once the containing
//! graph goes away (e.g. due to ending capture). This is distinct
//! from EMPTY, which represents a valid event with an empty set of
//! things to synchronize.
CUI_EVENT_INVALID
};
struct CUevent_st {
//! owning context
CUctx *ctx;
//! linked list of all events belonging to the context
CUevent prev, next;
//! Id is unique per process for the process's lifetime
NvU64 eventID;
//! the type of this event
enum CUIeventType type;
//! type-specific data
union {
//! data for CUI_EVENT_TYPE_LOCAL
struct {
//! is this event capturing timing information?
NvBool recordTimingData;
//! does this event use blocking sync
NvBool useBlockingSync;
//! semaphore used for this event's timestamp (only allocated if recordTimingData is true)
CUsema *sema;
//! current status of this event
enum CUIlocalEventState state;
//! the last recorded synchronization point, if isRecorded is true
CUIeventSyncPt syncPt;
//! stream capture data
//! All fields except graph are protected as part of the capture using
//! cuiGraphLockForCapture(capture.graph).
struct {
//! the graph the event was recorded in
CUIgraph *graph;
//! the set of nodes in the graph this event waits on
cuiContigHashSet deps;
//! the previous event captured in the same graph
CUevent prev;
//! the next event captured in the same graph
CUevent next;
} capture;
} local;
//! data for CUI_EVENT_TYPE_IPC_LOCAL and CUI_EVENT_TYPE_IPC_REMOTE
struct {
//! the pool the semaphore is allocated from
CUIipcSemaphorePool *ipcPool;
//! information about the semaphore released by this event
volatile CUsemaDataFourWords *semaData;
// TODO do IPC events follow standard re-record semantics???
//! the payload of the last release
volatile NvU32 *lastIssuedPayload;
//! these fields are for CUI_EVENT_TYPE_IPC_LOCAL only
struct {
//! does this event use blocking sync
NvBool useBlockingSync;
//! the semaphore released by this event
CUsema *sema;
} local;
// IPC events are just always assumed to have been recorded, so no isRecorded field.
} ipc;
#if cuiIsEglSyncSupported()
//! data for CUI_EVENT_TYPE_EGL
struct {
//! does this event use blocking sync
NvBool useBlockingSync;
//! the EGLSync handle from which the event is created
EGLSyncKHR eglSync;
//! EGL state
EGLGlobalState *globalEglState;
} egl;
#endif
#if cuiEventNvnSupported()
//! data for CUI_EVENT_TYPE_NVN
struct {
//! does this event use blocking sync
NvBool useBlockingSync;
//! the NVN event to wait on
CUINVNSyncData *syncData;
//! the synchronization point represented by this event
CUIeventSyncPt syncPt;
} nvn;
#endif
} data;
};
Function
/**
* \brief Init Check for Events
*
* \detailDescription cuiEventInitCheck function checks whether the hEvent is valid and
* also checks if there are any errors on the context.
*
* \param[in] hEvent - the event on which to operate
* \param[in] flags - optional flags include
* CUI_EVENT_INIT_CHECK_ALLOW_CAPTURED,
* CUI_EVENT_INIT_CHECK_ALLOW_INVALIDATED, and
* CUI_EVENT_INIT_CHECK_DONT_INVALIDATE_IF_CAPTURED
*
* \return CUresult - the result of the function.
*
* \retval
* CUDA_SUCCESS,
* CUDA_ERROR_INVALID_HANDLE,
* CUDA_ERROR_INVALID_CONTEXT,
* CUDA_ERROR_CONTEXT_IS_DESTROYED
*
* \additionalNotes
* cuiEventInitCheck function returns CUDA_ERROR_INVALID_HANDLE, if the hEvent is NULL.
*
* \implDetails
* cuiEventCheck function checks whether the hEvent is valid. It then calls the function
* cuiInitCheckCtx which returns error if there are any sticky errors on the context.
*
* \endfn
*/
CUDA_TEST_EXPORT CUresult cuiEventInitCheck(CUevent hEvent, unsigned flags);
/**
* \brief Create an Event
*
* \detailDescription cuiEventCreate function allocates and initializes the cuda
* event based on the flags. This function expects that the ctx is valid.
*
*
* \param[in] ctx - the context
* \param[in] Flags - the event flag creation parameters
* \param[out] ppEvent - the event
*
* \return CUresult - the result of the function.
*
* \retval
* CUDA_SUCCESS,
* CUDA_ERROR_OUT_OF_MEMORY
*
* \additionalNotes
* cuiEventCreate function allocates pEvent->sema from pEvent->ipcPool, if the
* pEvent->isIpc is true.
* \additionalNotes
* cuiEventCreate function sets the field pEvent->didRecord to true for ipc events.
* \additionalNotes
* For events which record timing data, cuiEventCreate function allocates the semaphore
* pEvent->sema from ctx->semaphores and sets the payload with CU_EVENT_PAYLOAD_DELIVERED.
*
* \implDetails cuiEventCreate function allocates memory for the cuda event. It initializes
* event with the default values and other fields depending upon the Flags passed. For non
* ipc events, it allocates a marker pEvent->syncPt.marker. For ipc events, it allocates
* the sema and sets the last issued payload. It then
* sets pEvent->eventID with a unique counter and adds the event to the event list.
*
* \endfn
*
*/
CUDA_TEST_EXPORT CUresult cuiEventCreate(CUctx *ctx, CUevent *ppEvent, unsigned int Flags);
/**
* \brief Create an ipc Event
*
* \detailDescription This function allocates memory for t