This is a ring buffer (the GPU command pushbuffer).
/**
* Pushbuffer ring buffer functions.
*
* Usage proceeds as follows:
*
* (1) Wait for the pushbuffer to have space, by calling pushbufferHasSpace() with the maximum
* possible size of the pushbuffer until it returns true.
* (2) Get the current offset (address of the new pushbuffer) - pushbufferStartPush(), indicating
* the maximum possible size of the new pushbuffer.
* (3) Fill out the pushbuffer and keep track of the used size.
* (4) Once the pushbuffer has been filled out, call pushbufferEndPush() to indicate how much
* space in the current allocated pushbuffer has been used.
* (5) Submit pushbuffer.
* (6) Once the pushbuffer has been consumed, call pushbufferFreePush() to free the pushbuffer.
* Pushbuffers must be freed in order, as this is just a ring buffer.
*
* while (!pushbufferHasSpace()) { channelFlush(); Stall(); }
* nvCurrent = nvBase = pushbufferStartPush(pb, MAX_PUSH_SIZE);
* *nvCurrent++ = NV_METHOD_BS;
* length = ((uintptr_t)nvCurrent - (uintptr_t)nvBase);
* pushbufferEndPush(length);
*/
Overview: the struct below tells the whole story — a single block of memory
plus one write pointer (put) and one read pointer (get).
/**
 * Pushbuffer placement flags: which memory aperture backs the ring buffer.
 */
typedef enum {
    // Default to sysmem
    CU_PUSHBUFFER_IN_SYSMEM = 0,
    // Create in video memory and map over bar1
    CU_PUSHBUFFER_IN_VIDMEM = 1,
} CUpushbufferFlags;

// Opaque handle; definition follows below.
typedef struct CUpushbuffer_st CUpushbuffer;
#include "cuda_types.h"
/**
 * Pushbuffer memory and ring buffer management contained within.
 *
 * Ring-buffer invariant (see pushbufferHasSpace/pushbufferSetGet): put == get
 * means the buffer is empty; offsets are bytes from the start of the memobj.
 */
struct CUpushbuffer_st
{
    // Pushbuffer memory (backing allocation; host- and/or device-mapped).
    CUmemobj *memobj;
    // Pushbuffer size, in bytes.
    size_t size;
    // Memory up to (but not including) the put offset has been allocated and used for pushes.
    NvU32 put;
    // Memory up to (and including) the get offset is memory which is no longer in use.
    NvU32 get;
    // Alignment of pushbuffer segments (if applicable; 0 means no alignment
    // requirement — only set for the HW bug 1669031 workaround below).
    NvU32 align;
};
/**
 * Create/Destroy a pushbuffer with given memory preferences.
 */
CUresult pushbufferCreate(CUctx *ctx, size_t size, CUpushbufferFlags flags, CUpushbuffer **outPushbuffer);
void pushbufferDestroy(CUpushbuffer *pushbuffer);

// Shrunken per-channel pushbuffer allocation sizes used under MPS, where many
// channels exist but most pushbuffers see little use (see
// pushbufferCalculateSizeForChannelType for the rationale).
#define CU_GPFIFO_PUSHBUF_ALLOC_SIZE_MPS_CLIENT (1024*1024)
#define CU_GPFIFO_PUSHBUF_ALLOC_SIZE_MPS_SERVER (256*1024)
/**
 * Allocate and initialize a pushbuffer ring buffer.
 *
 * \param[in]  ctx            Context whose memory manager backs the allocation.
 * \param[in]  size           Total ring-buffer size in bytes.
 * \param[in]  flags          CU_PUSHBUFFER_IN_SYSMEM / CU_PUSHBUFFER_IN_VIDMEM placement.
 * \param[out] outPushbuffer  Receives the new pushbuffer on success; untouched on failure.
 *
 * \retval CUDA_SUCCESS, CUDA_ERROR_OUT_OF_MEMORY, or the status from memobjAlloc().
 */
CUresult pushbufferCreate(CUctx *ctx, size_t size, CUpushbufferFlags flags, CUpushbuffer **outPushbuffer)
{
    CUresult status;
    CUmemdesc memdesc;
    CUpushbuffer *pushbuffer;
    NvBool isPushbufferVidmem = (flags & CU_PUSHBUFFER_IN_VIDMEM) != 0;
    CU_TRACE_FUNCTION();
    CU_ASSERT(outPushbuffer && ctx);
    cuiPerformanceBegin(__FUNCTION__, CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);
    pushbuffer = (CUpushbuffer*)malloc(sizeof(CUpushbuffer));
    if (!pushbuffer) {
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }
    memset(pushbuffer, 0, sizeof(CUpushbuffer));
    // Allocate pushbuffer memory in the requested aperture.
    memset(&memdesc, 0x0, sizeof(memdesc));
    if (isPushbufferVidmem) {
        memdesc.flags.location = CU_MEM_LOCATION_DEVICE;
        memdesc.flags.cacheHost = CU_MEM_CACHE_HOST_DISABLED;
    }
    else {
        memdesc.flags.location = CU_MEM_LOCATION_HOST;
        memdesc.flags.cacheHost = CU_GPFIFO_AND_PUSHBUFFER_HOST_CACHE_TYPE;
    }
    memdesc.flags.owner = CU_MEM_OWNER_DRIVER;
    memdesc.flags.type = CU_MEM_TYPE_PUSHBUFFER;
    memdesc.flags.mapHost = CU_MEM_MAP_HOST_VA;
    // Pushbuffer must use 40-bit VA.
    memdesc.flags.mapDevice = CU_MEM_MAP_DEVICE_VA_FORCE_40_BIT;
    status = memobjAlloc(
        ctx->memmgr,
        &memdesc,
        size,
        &pushbuffer->memobj);
    if (status != CUDA_SUCCESS) {
        CU_DEBUG_PRINT(("Unable to allocate pushbuffer\n"));
        goto Error;
    }
    // Initialize ring buffer pointers: put == get == 0 means empty.
    pushbuffer->size = size;
    pushbuffer->get = 0;
    pushbuffer->put = 0;
    // __WAR__ HW bug 1669031
    // All chips have a 96 GP_FIFO entry limit in their LB (Latency Buffer)
    // At risk:
    // * gm200/gm204 - 1136 PB RAM entries
    // * gm206 - 848 PB RAM entries.
    // * gk104/gk106/gk110/gk180 - 848 RAM entries
    // * gm107/gm108 - 704 PB RAM entries
    // Not at risk: (mobile chips + the tiniest desktop chip known to man)
    // * gk20a/gm20b/gm21b - 104 PB RAM entries
    // * gk208 - 656 PB RAM entries (unfortunately no way to distinguish so WAR anyway)
    if ((cuiDeviceArchIsKepler(ctx->memmgr->device) || cuiDeviceArchIsMaxwell(ctx->memmgr->device)) &&
        !cuiDeviceIsSoC(ctx->memmgr->device)) {
        // Host (HW) reads the pushbuffer data into ram slots in chunks aligned to 128.
        // Each chunk is 1 'request'. Each ram slot can hold 16 bytes.
        // We need to keep Host's latency buffer: <= 96 GpFifo Entries, < 256 requests, < ram slots on chip
        // If we hit the request limit, we encounter a HW bug - so we:
        // - Align the pushbuffer segments to 128 bytes, which can be proven to not go over 255 requests
        pushbuffer->align = (NvU32)ctx->memmgr->device->hal.memblockGetHeapAlignment(ctx->memmgr, &memdesc);
    }
    *outPushbuffer = pushbuffer;
    cuiPerformanceEnd(__FUNCTION__, CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);
    return CUDA_SUCCESS;
Error:
    if (pushbuffer) {
        pushbufferDestroy(pushbuffer);
    }
    // Fix: balance cuiPerformanceBegin() on the failure path too — previously
    // the perf-trace scope opened above was leaked on any error return.
    cuiPerformanceEnd(__FUNCTION__, CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);
    return status;
}
/**
 * Tear down a pushbuffer: release the backing memobj (if the allocation got
 * that far) and then the struct itself. Safe to call on a partially
 * constructed pushbuffer from pushbufferCreate's error path.
 */
void pushbufferDestroy(CUpushbuffer *pushbuffer)
{
    CU_TRACE_FUNCTION();
    CU_ASSERT(pushbuffer);
    // The memobj may be NULL if memobjAlloc never ran or failed.
    if (pushbuffer->memobj != NULL) {
        memobjFree(&pushbuffer->memobj);
    }
    free(pushbuffer);
}
Operational functions.
Size calculation.
/*
 * \brief Returns the required size of pushbuffer for a channel type based on DMAL needs
 *
 * \param[in] ctx CUctx handle.
 * \param[in] channelType the type of channel.
 *
 * \retval NvU32 - the size of pushbuffer, in bytes.
 *
 */
NvU32
pushbufferCalculateSizeForChannelType(CUctx *ctx, CUchannelType channelType);
/*
 * \brief calculates total required size for pushbuffer
 *
 * \detailDescription This function does the following 3 things: (i) it retrieves
 * the number of compute and copy channels we will have, (ii) calculates the size
 * of pushbuffer per compute/copy channel, (iii) sums up the sizes of pushbuffer
 * for all channels and returns it.
 *
 * \param[in] memmgr Memmgr structure.
 * \param[in] memdesc The memory descriptor (must be of type CU_MEM_TYPE_PUSHBUFFER).
 *
 * \retval NvU64 - the total calculated size for pushbuffer, in bytes
 *
 */
NvU64 pushbufferCalculateTotalSize(CUmemmgr* memmgr, CUmemdesc* memdesc);
/**
 * Pick the per-channel pushbuffer allocation size for a channel type,
 * then apply the MPS server/client overrides (which win over the base size).
 */
NvU32
pushbufferCalculateSizeForChannelType(CUctx *ctx, CUchannelType channelType)
{
    NvU32 allocSize;

    // Base size: SoC parts use a fixed allocation, discrete parts select a
    // globally-configured size by channel type.
    if (cuiDeviceIsSoC(ctx->device)) {
        allocSize = CU_GPFIFO_PUSHBUF_ALLOC_SIZE_SOC;
    }
    else if (channelTypeIsAsyncMemcpy(channelType)) {
        allocSize = globals.pushbufferSizeCE;
    }
    else {
        allocSize = globals.pushbufferSizeCompute;
    }

    // Mps servers create 96 channels, and then those PB allocations are
    // basically never used (rather, the clients' PB allocations are used),
    // so make the allocation as small as possible.
    // - this way we burn 27 MB of wasted PBs on the server, instead of 384 MB
    if (cuiGlobalsIsLegacyMpsServer()) {
        allocSize = CU_GPFIFO_PUSHBUF_ALLOC_SIZE_MPS_SERVER;
    }

    // Mps clients will each allocate 6 channels, and we assume about
    // 16 channels, so shrink the pushbuffer size a bit even on the clients.
    if (cuiGlobalsIsLegacyMpsClient()) {
        allocSize = CU_GPFIFO_PUSHBUF_ALLOC_SIZE_MPS_CLIENT;
    }

    return allocSize;
}
/**
 * Calculate the total pushbuffer footprint across all channels:
 * (per-compute-channel size rounded to heap alignment) * compute channels +
 * (per-async-channel size rounded to heap alignment) * async channels.
 *
 * \param[in] memmgr  Memory manager (provides ctx, device HAL).
 * \param[in] memdesc Memory descriptor; must be CU_MEM_TYPE_PUSHBUFFER.
 *
 * \retval NvU64 total required pushbuffer size in bytes.
 */
NvU64
pushbufferCalculateTotalSize(CUmemmgr* memmgr, CUmemdesc* memdesc)
{
    NvU32 numComputeChannels;
    NvU32 numAsyncChannels;
    NvU64 pushbufferSizeCompute;
    NvU64 pushbufferSizeAsync;
    NvU64 heapAlignment;

    CU_ASSERT(memdesc->flags.type == CU_MEM_TYPE_PUSHBUFFER);

    // Determine the number of compute and async channels
    numComputeChannels = channelManagerCalculateComputeChannelCount(memmgr->ctx);
    numAsyncChannels = memmgr->ctx->device->state.asyncEngineCount * channelManagerCalculateAsyncChannelCount(memmgr->ctx);

    // Get the pushbuffer size per compute/async channel
    pushbufferSizeCompute = pushbufferCalculateSizeForChannelType(memmgr->ctx, CU_CHANNEL_TYPE_COMPUTE);
    pushbufferSizeAsync = pushbufferCalculateSizeForChannelType(memmgr->ctx, CU_CHANNEL_TYPE_ASYNC_MEMCPY_0);

    // Round up the sizes based on the alignment that the CUDA memory allocator
    // will use for pushbuffers. The HAL query is invariant across both uses,
    // so call it once instead of twice.
    heapAlignment = memmgr->device->hal.memblockGetHeapAlignment(memmgr, memdesc);
    pushbufferSizeCompute = ROUND_UP(pushbufferSizeCompute, heapAlignment);
    pushbufferSizeAsync = ROUND_UP(pushbufferSizeAsync, heapAlignment);

    return pushbufferSizeCompute * numComputeChannels + pushbufferSizeAsync * numAsyncChannels;
}
Accessor functions.
/**
 * Get the base host address of the pushbuffer allocation.
 */
void *pushbufferGetHostVaddr(const CUpushbuffer *pushbuffer);
/**
 * Get the device address for the pushbuffer allocation.
 */
NvU64 pushbufferGetDeviceVaddr(const CUpushbuffer *pushbuffer);
/**
 * Get the memobj for the pushbuffer block.
 */
CUmemobj *pushbufferGetMemobj(const CUpushbuffer *pushbuffer);
/**
 * Get the total pushbuffer size, in bytes.
 */
size_t pushbufferGetSize(const CUpushbuffer *pushbuffer);
/**
 * Is the pushbuffer in host memory.
 * NOTE(review): no definition is visible in this chunk — presumably defined
 * elsewhere in the file or project.
 */
NvBool pushbufferIsHostMemory(const CUpushbuffer *pushbuffer);
/* Return the CPU-visible base address of the pushbuffer's backing memory. */
void *pushbufferGetHostVaddr(const CUpushbuffer *pushbuffer)
{
    uintptr_t hostAddr = (uintptr_t)memobjGetHostPtr(pushbuffer->memobj);
    return (void*)hostAddr;
}
/* Return the GPU virtual address of the pushbuffer's backing memory. */
NvU64 pushbufferGetDeviceVaddr(const CUpushbuffer *pushbuffer)
{
    return memobjGetDeviceVaddr(pushbuffer->memobj);
}
/* Return the memobj backing this pushbuffer (owned by the pushbuffer). */
CUmemobj *pushbufferGetMemobj(const CUpushbuffer *pushbuffer)
{
    return pushbuffer->memobj;
}
/* Return the total ring-buffer size in bytes. */
size_t pushbufferGetSize(const CUpushbuffer *pushbuffer)
{
    size_t totalSize = pushbuffer->size;
    return totalSize;
}
Push (fill) functions.
/*
 * Is there enough free space to write methods (of length up to size) at the pushbuffer put pointer?
 */
NvBool pushbufferHasSpace(CUpushbuffer *pushbuffer, NvU32 size);
/*
 * Is there enough space to continue the current push to a new total size of size?
 * (A continued push never wraps; it must fit contiguously from the current put.)
 */
NvBool pushbufferHasSpaceToContinueToTotalSize(CUpushbuffer *pushbuffer, NvU32 size);
/**
 * Get the offset of the ring buffer's put pointer. New chunks of pushbuffer are requested here.
 * May realign and/or wrap the put pointer; callers must have confirmed space
 * via pushbufferHasSpace() first.
 */
NvU32 pushbufferStartPush(CUpushbuffer *pushbuffer, NvU32 spaceRequested);
/*
 * After writing methods to the put pointer, indicate that pushbuffer space has been consumed by
 * advancing the put pointer by the length of the methods written.
 */
void pushbufferEndPush(CUpushbuffer *pushbuffer, NvU32 size);
/*
 * Once a pushbuffer has been consumed by the gpu, set the get pointer to indicate the pushbuffer is free.
 * Pushbuffers must be freed in order (ring-buffer semantics).
 */
void pushbufferSetGet(CUpushbuffer *pushbuffer, NvU32 newGetPosition);
/*
 * Check whether a contiguous region of 'size' bytes can be allocated, either
 * at the (alignment-adjusted) put pointer or by wrapping put back to zero.
 *
 * NOTE(review): pushbufferSetGet() treats put == get as "empty", so a push
 * that advanced put exactly onto get would make a full buffer look empty —
 * presumably callers always query with a max size that keeps a gap; confirm.
 */
NvBool pushbufferHasSpace(CUpushbuffer *pushbuffer, NvU32 size)
{
    NvU32 put;
    CU_ASSERT(pushbuffer);
    // When asking if the pushbuffer has space, we might not have started a new
    // pushbuffer segment yet, so the 'put' pointer won't be aligned
    put = pushbuffer->put;
    if (pushbuffer->align) {
        put = ROUND_UP(pushbuffer->put, pushbuffer->align);
    }
    if (pushbuffer->get > put) {
        // Free region is [put, get): is there enough space before get?
        return (pushbuffer->get - put) >= size;
    }
    else {
        // Free region is [put, size) plus [0, get) after a wrap.
        // Is there enough space before the end of the buffer?
        if (pushbuffer->size - put >= size) {
            return NV_TRUE;
        }
        // Wrap put around to zero, and see if there is enough space.
        else {
            return pushbuffer->get >= size;
        }
    }
}
/*
 * Check whether the push currently in progress at 'put' can grow to a total
 * of 'size' bytes. A continued push cannot wrap, so the available room is
 * bounded either by the consumer's get pointer or by the physical end of
 * the buffer — whichever applies.
 */
NvBool pushbufferHasSpaceToContinueToTotalSize(CUpushbuffer *pushbuffer, NvU32 size)
{
    size_t limit;

    if (pushbuffer->get > pushbuffer->put) {
        // Free region ends at get.
        limit = (size_t)pushbuffer->get;
    }
    else {
        // Free region ends at the end of the buffer.
        limit = pushbuffer->size;
    }
    return (limit - (size_t)pushbuffer->put) >= (size_t)size;
}
/*
 * Begin a new pushbuffer segment of up to 'spaceRequested' bytes: align the
 * put pointer if segment alignment is in effect, wrap it to zero when the
 * tail of the buffer is too small, and return the resulting offset. The
 * caller must already have ensured space via pushbufferHasSpace().
 */
NvU32 pushbufferStartPush(CUpushbuffer *pushbuffer, NvU32 spaceRequested)
{
    NvU32 segmentStart;

    CU_ASSERT(pushbuffer);
    CU_ASSERT(pushbuffer->memobj);

    segmentStart = pushbuffer->put;
    // Align the segment if applicable (HW bug 1669031 WAR).
    if (pushbuffer->align) {
        segmentStart = ROUND_UP(segmentStart, pushbuffer->align);
    }
    // Not enough room before the end of the buffer: wrap to offset 0.
    if (segmentStart + spaceRequested > pushbuffer->size) {
        segmentStart = 0;
    }
    pushbuffer->put = segmentStart;

    CU_ASSERT(pushbufferHasSpace(pushbuffer, spaceRequested));
    return pushbuffer->put;
}
/*
 * Record that the GPU has consumed the pushbuffer up to 'newGetPosition'.
 * When the buffer becomes empty (get catches up with put), both pointers are
 * rewound to 0 to reduce fragmentation; the GPFIFO overrun test relies on
 * this reset.
 */
void pushbufferSetGet(CUpushbuffer *pushbuffer, NvU32 newGetPosition)
{
    CU_ASSERT(pushbuffer);
    // Method data is dword-granular, so get must be 4-byte aligned.
    CU_ASSERT(newGetPosition % 4 == 0);
    CU_ASSERT((newGetPosition) <= pushbuffer->size);

    pushbuffer->get = newGetPosition;

    if (pushbuffer->put == pushbuffer->get) {
        pushbuffer->put = 0;
        pushbuffer->get = 0;
    }
}
5903

被折叠的 条评论
为什么被折叠?



