CUDA系列-pushbuffer-6

这是一个环形队列

/**
 * Pushbuffer ring buffer functions.
 *
 * Usage proceeds as follows:
 *
 * (1) Wait for the pushbuffer to have space, by calling pushbufferHasSpace() with the maximum
 *     possible size of the pushbuffer until it returns true.
 * (2) Get the current offset (address of the new pushbuffer) - pushbufferStartPush(), indicating
 *     the maximum possible size of the new pushbuffer.
 * (3) Fill out the pushbuffer and keep track of the used size.
 * (4) Once the pushbuffer has been filled out, call pushbufferEndPush() to indicate how much
 *     space in the current allocated pushbuffer has been used.
 * (5) Submit pushbuffer.
 * (6) Once the pushbuffer has been consumed, call pushbufferSetGet() to advance the get
 *     pointer and release the pushbuffer. Pushbuffers must be freed in order, as this is
 *     just a ring buffer.
 *
 * while (!pushbufferHasSpace(pb, MAX_PUSH_SIZE)) { channelFlush(); Stall(); }
 * nvCurrent = nvBase = pushbufferStartPush(pb, MAX_PUSH_SIZE);
 * *nvCurrent++ = NV_METHOD_BS;
 * length = ((uintptr_t)nvCurrent - (uintptr_t)nvBase);
 * pushbufferEndPush(pb, length);
 */

overview

下面的结构体说明了一切:一块内存,加上各一个读指针和写指针。

/**
 * Pushbuffer placement flags: select where the pushbuffer's backing
 * memory lives (see pushbufferCreate()).
 */
typedef enum {
    // Default: allocate the pushbuffer in system memory.
    CU_PUSHBUFFER_IN_SYSMEM = 0,

    // Allocate in video memory and map it to the CPU over BAR1.
    CU_PUSHBUFFER_IN_VIDMEM = 1,
} CUpushbufferFlags;

typedef struct CUpushbuffer_st CUpushbuffer;

#include "cuda_types.h"

/**
 * Pushbuffer memory and ring buffer management contained within.
 *
 * Ring-buffer invariant: bytes in [get, put) (modulo wrap) are in flight on
 * the GPU; everything else is free. get == put means the ring is empty
 * (see pushbufferSetGet(), which rewinds both offsets to 0 in that case).
 */
struct CUpushbuffer_st
{
    // Backing memory allocation for the ring (host or vidmem; see CUpushbufferFlags).
    CUmemobj *memobj;

    // Total pushbuffer size in bytes.
    size_t size;

    // Memory up to (but not including) the put offset has been allocated and used for pushes.
    NvU32 put;

    // Memory up to (and including) the get offset is memory which is no longer in use.
    NvU32 get;

    // Alignment of pushbuffer segments, or 0 if no alignment is required.
    // Non-zero only on parts affected by HW bug 1669031 (see pushbufferCreate()).
    NvU32 align;
};

/**
 * Create/Destroy a pushbuffer with the given memory preferences.
 * pushbufferCreate returns CUDA_SUCCESS and writes *outPushbuffer on success;
 * pushbufferDestroy also frees any backing memory object still attached.
 */
CUresult pushbufferCreate(CUctx *ctx, size_t size, CUpushbufferFlags flags, CUpushbuffer **outPushbuffer);
void pushbufferDestroy(CUpushbuffer *pushbuffer);


#define CU_GPFIFO_PUSHBUF_ALLOC_SIZE_MPS_CLIENT (1024*1024)
#define CU_GPFIFO_PUSHBUF_ALLOC_SIZE_MPS_SERVER  (256*1024)

/**
 * Allocate and initialize a pushbuffer ring of 'size' bytes.
 *
 * \param[in]  ctx            Context whose memory manager backs the allocation.
 * \param[in]  size           Total ring size in bytes.
 * \param[in]  flags          Placement flags (sysmem vs. vidmem-over-BAR1).
 * \param[out] outPushbuffer  Receives the new pushbuffer on success.
 *
 * \retval CUDA_SUCCESS or an allocation error; on failure nothing is leaked
 *         and *outPushbuffer is left unwritten.
 */
CUresult pushbufferCreate(CUctx *ctx, size_t size, CUpushbufferFlags flags, CUpushbuffer **outPushbuffer)
{
    CUresult status;
    CUmemdesc memdesc;
    CUpushbuffer *pushbuffer;
    NvBool isPushbufferVidmem = (flags & CU_PUSHBUFFER_IN_VIDMEM) != 0;

    CU_TRACE_FUNCTION();
    CU_ASSERT(outPushbuffer && ctx);

    cuiPerformanceBegin(__FUNCTION__, CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);

    // calloc zero-initializes the struct, replacing the malloc+memset pair.
    pushbuffer = (CUpushbuffer*)calloc(1, sizeof(CUpushbuffer));
    if (!pushbuffer) {
        status = CUDA_ERROR_OUT_OF_MEMORY;
        goto Error;
    }

    // Allocate pushbuffer memory
    memset(&memdesc, 0x0, sizeof(memdesc));
    if (isPushbufferVidmem) {
        memdesc.flags.location        = CU_MEM_LOCATION_DEVICE;
        memdesc.flags.cacheHost       = CU_MEM_CACHE_HOST_DISABLED;
    }
    else {
        memdesc.flags.location        = CU_MEM_LOCATION_HOST;
        memdesc.flags.cacheHost       = CU_GPFIFO_AND_PUSHBUFFER_HOST_CACHE_TYPE;
    }
    memdesc.flags.owner           = CU_MEM_OWNER_DRIVER;
    memdesc.flags.type            = CU_MEM_TYPE_PUSHBUFFER;
    memdesc.flags.mapHost         = CU_MEM_MAP_HOST_VA;
    // Pushbuffer must use 40-bit VA.
    memdesc.flags.mapDevice       = CU_MEM_MAP_DEVICE_VA_FORCE_40_BIT;

    status = memobjAlloc(
        ctx->memmgr,
        &memdesc,
        size,
        &pushbuffer->memobj);
    if (status != CUDA_SUCCESS) {
        CU_DEBUG_PRINT(("Unable to allocate pushbuffer\n"));
        goto Error;
    }

    // Initialize ring buffer pointers (empty ring: get == put == 0).
    pushbuffer->size = size;
    pushbuffer->get = 0;
    pushbuffer->put = 0;

    // __WAR__ HW bug 1669031
    // All chips have a 96 GP_FIFO entry limit in their LB (Latency Buffer)
    // At risk:
    // * gm200/gm204 - 1136 PB RAM entries
    // * gm206 - 848 PB RAM entries.
    // * gk104/gk106/gk110/gk180 - 848 RAM entries
    // * gm107/gm108 - 704 PB RAM entries
    // Not at risk: (mobile chips + the tiniest desktop chip known to man)
    // * gk20a/gm20b/gm21b - 104 PB RAM entries
    // * gk208 - 656 PB RAM entries (unfortunately no way to distinguish so WAR anyway)
    if ((cuiDeviceArchIsKepler(ctx->memmgr->device) || cuiDeviceArchIsMaxwell(ctx->memmgr->device)) &&
        !cuiDeviceIsSoC(ctx->memmgr->device)) {
        // Host (HW) reads the pushbuffer data into ram slots in chunks aligned to 128.
        // Each chunk is 1 'request'. Each ram slot can hold 16 bytes.
        // We need to keep Host's latency buffer: <= 96 GpFifo Entries, < 256 requests, < ram slots on chip
        // If we hit the request limit, we encounter a HW bug - so we:
        //  - Align the pushbuffer segments to 128 bytes, which can be proven to not go over 255 requests
        pushbuffer->align = (NvU32)ctx->memmgr->device->hal.memblockGetHeapAlignment(ctx->memmgr, &memdesc);
    }

    *outPushbuffer = pushbuffer;
    cuiPerformanceEnd(__FUNCTION__, CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);
    return CUDA_SUCCESS;

Error:
    if (pushbuffer) {
        pushbufferDestroy(pushbuffer);
    }
    // Bug fix: the original returned here without ending the performance
    // scope opened above, leaving cuiPerformanceBegin/End unbalanced on
    // every failure path.
    cuiPerformanceEnd(__FUNCTION__, CUDA_PERF_GROUP_CTX_CREATE, CUDA_PERF_SUBGROUP_ALL);
    return status;
}

/**
 * Tear down a pushbuffer: release the backing memory object (if one was
 * successfully allocated) and then the bookkeeping structure itself.
 */
void pushbufferDestroy(CUpushbuffer *pushbuffer)
{
    CU_TRACE_FUNCTION();
    CU_ASSERT(pushbuffer);

    // The memobj may be absent if creation failed partway through.
    if (pushbuffer->memobj != NULL) {
        memobjFree(&pushbuffer->memobj);
    }

    free(pushbuffer);
}

操作函数

计算大小

/*
 * \brief Returns the required size of pushbuffer for a channel type based on DMAL needs
 *
 * \param[in] ctx CUctx handle.
 * \param[in] channelType the type of channel.
 *
 * \retval NvU32 - the size of pushbuffer in bytes.
 *
 */
NvU32
pushbufferCalculateSizeForChannelType(CUctx *ctx, CUchannelType channelType);
/*
 * \brief Calculates the total required size for all pushbuffers.
 *
 * \details This function does the following 3 things: (i) it retrieves
 * the number of compute and copy channels we will have, (ii) calculates the size
 * of pushbuffer per compute/copy channel, (iii) sums up the sizes of pushbuffer
 * for all channels and returns it.
 *
 * \param[in] memmgr Memmgr structure.
 * \param[in] memdesc The memory descriptor (must be of type CU_MEM_TYPE_PUSHBUFFER).
 *
 * \retval NvU64 - the total calculated size for pushbuffer
 *
 */
NvU64 pushbufferCalculateTotalSize(CUmemmgr* memmgr, CUmemdesc* memdesc);



/*
 * Return the pushbuffer size, in bytes, to allocate for a channel of the
 * given type, taking device kind and MPS client/server mode into account.
 */
NvU32
pushbufferCalculateSizeForChannelType(CUctx *ctx, CUchannelType channelType)
{
    NvU32 result;

    // Base size depends on the device kind and channel type.
    if (cuiDeviceIsSoC(ctx->device)) {
        result = CU_GPFIFO_PUSHBUF_ALLOC_SIZE_SOC;
    } else if (channelTypeIsAsyncMemcpy(channelType)) {
        result = globals.pushbufferSizeCE;
    } else {
        result = globals.pushbufferSizeCompute;
    }

    // Mps servers create 96 channels, and then those PB allocations are
    // basically never used (rather, the clients' PB allocations are used),
    // so make the allocation as small as possible.
    // - this way we burn 27 MB of wasted PBs on the server, instead of 384 MB
    if (cuiGlobalsIsLegacyMpsServer()) {
        result = CU_GPFIFO_PUSHBUF_ALLOC_SIZE_MPS_SERVER;
    }

    // Mps clients will each allocate 6 channels, and we assume about
    // 16 channels, so shrink the pushbuffer size a bit even on the clients.
    if (cuiGlobalsIsLegacyMpsClient()) {
        result = CU_GPFIFO_PUSHBUF_ALLOC_SIZE_MPS_CLIENT;
    }

    return result;
}

/*
 * Sum the (alignment-rounded) per-channel pushbuffer sizes over all compute
 * and async-copy channels the context will create.
 */
NvU64
pushbufferCalculateTotalSize(CUmemmgr* memmgr, CUmemdesc* memdesc)
{
    CU_ASSERT(memdesc->flags.type == CU_MEM_TYPE_PUSHBUFFER);

    CUctx *ctx = memmgr->ctx;

    // How many channels of each kind will exist.
    NvU32 computeChannels = channelManagerCalculateComputeChannelCount(ctx);
    NvU32 asyncChannels =
        ctx->device->state.asyncEngineCount * channelManagerCalculateAsyncChannelCount(ctx);

    // Per-channel pushbuffer sizes, widened to 64 bits before multiplying.
    NvU64 computeSize = pushbufferCalculateSizeForChannelType(ctx, CU_CHANNEL_TYPE_COMPUTE);
    NvU64 asyncSize = pushbufferCalculateSizeForChannelType(ctx, CU_CHANNEL_TYPE_ASYNC_MEMCPY_0);

    // Round each size up to the alignment the CUDA memory allocator will use
    // for pushbuffer blocks.
    computeSize = ROUND_UP(computeSize, memmgr->device->hal.memblockGetHeapAlignment(memmgr, memdesc));
    asyncSize = ROUND_UP(asyncSize, memmgr->device->hal.memblockGetHeapAlignment(memmgr, memdesc));

    return computeSize * computeChannels + asyncSize * asyncChannels;
}

属性函数

/**
 * Get the base host (CPU-visible) address of the pushbuffer allocation.
 */
void *pushbufferGetHostVaddr(const CUpushbuffer *pushbuffer);

/**
 * Get the device (GPU) virtual address of the pushbuffer allocation.
 */
NvU64 pushbufferGetDeviceVaddr(const CUpushbuffer *pushbuffer);

/**
 * Get the memobj backing the pushbuffer block.
 */
CUmemobj *pushbufferGetMemobj(const CUpushbuffer *pushbuffer);

/**
 * Get the total pushbuffer size in bytes.
 */
size_t pushbufferGetSize(const CUpushbuffer *pushbuffer);

/**
 * Is the pushbuffer in host memory (as opposed to vidmem-over-BAR1)?
 */
NvBool pushbufferIsHostMemory(const CUpushbuffer *pushbuffer);


/* Return the CPU-visible base address of the pushbuffer's backing memory. */
void *pushbufferGetHostVaddr(const CUpushbuffer *pushbuffer)
{
    uintptr_t hostAddr = (uintptr_t)memobjGetHostPtr(pushbuffer->memobj);
    return (void*)hostAddr;
}

/* Return the GPU virtual address of the pushbuffer's backing memory. */
NvU64 pushbufferGetDeviceVaddr(const CUpushbuffer *pushbuffer)
{
    NvU64 deviceAddr = memobjGetDeviceVaddr(pushbuffer->memobj);
    return deviceAddr;
}

/* Accessor for the memory object backing this pushbuffer. */
CUmemobj *pushbufferGetMemobj(const CUpushbuffer *pushbuffer)
{
    CUmemobj *memobj = pushbuffer->memobj;
    return memobj;
}

/* Total capacity of the pushbuffer ring, in bytes. */
size_t pushbufferGetSize(const CUpushbuffer *pushbuffer)
{
    size_t totalSize = pushbuffer->size;
    return totalSize;
}

具体填充函数

/*
 * Is there enough free space to write methods (of length up to size bytes)
 * at the pushbuffer put pointer?
 */
NvBool pushbufferHasSpace(CUpushbuffer *pushbuffer, NvU32 size);


/*
 * Is there enough space to continue the current push (without wrapping) to a
 * new total size of 'size' bytes?
 */
NvBool pushbufferHasSpaceToContinueToTotalSize(CUpushbuffer *pushbuffer, NvU32 size);


/**
 * Get the offset of the ring buffer's put pointer. New chunks of pushbuffer are requested here.
 * May align the put pointer and/or wrap it to 0 to satisfy the request.
 */
NvU32 pushbufferStartPush(CUpushbuffer *pushbuffer, NvU32 spaceRequested);

/*
 * After writing methods to the put pointer, indicate that pushbuffer space has been consumed by
 * advancing the put pointer by the length (in bytes) of the methods written.
 */
void pushbufferEndPush(CUpushbuffer *pushbuffer, NvU32 size);

/*
 * Once a pushbuffer has been consumed by the gpu, set the get pointer to indicate the pushbuffer is free.
 * Positions must be released in ring order.
 */
void pushbufferSetGet(CUpushbuffer *pushbuffer, NvU32 newGetPosition);

/*
 * Report whether a contiguous region of at least 'size' bytes is available at
 * the (segment-aligned) put pointer: either before 'get', before the end of
 * the buffer, or at offset 0 after a wrap.
 */
NvBool pushbufferHasSpace(CUpushbuffer *pushbuffer, NvU32 size)
{
    NvU32 put;

    CU_ASSERT(pushbuffer);
    // When asking if the pushbuffer has space, we might not have started a new
    // pushbuffer segment yet, so the 'put' pointer won't be aligned
    put = pushbuffer->put;
    if (pushbuffer->align) {
        put = ROUND_UP(pushbuffer->put, pushbuffer->align);
    }
    if (pushbuffer->get > put) {
        // Is there enough space before get?
        // NOTE(review): '>=' allows put to advance exactly onto get, which
        // pushbufferSetGet() treats as "ring empty" — presumably callers never
        // fully fill the ring before submitting; confirm against
        // pushbufferEndPush() and the submit path (not visible here).
        return (pushbuffer->get - put) >= size;
    }
    else {
        // Is there enough space before the end of the buffer?
        if (pushbuffer->size - put >= size) {
            return NV_TRUE;
        }
        // Wrap put around to zero, and see if there is enough space.
        else {
            return pushbuffer->get >= size;
        }
    }
}

/*
 * Can the push currently in progress grow (without wrapping) to a total of
 * 'size' bytes? Growth is bounded by 'get' when it lies ahead of 'put',
 * otherwise by the end of the buffer.
 */
NvBool pushbufferHasSpaceToContinueToTotalSize(CUpushbuffer *pushbuffer, NvU32 size)
{
    size_t available;

    if (pushbuffer->get > pushbuffer->put) {
        // Growth is limited by the get pointer ahead of us.
        available = pushbuffer->get - pushbuffer->put;
    }
    else {
        // Growth is limited by the end of the buffer.
        available = pushbuffer->size - pushbuffer->put;
    }

    return available >= size;
}

/*
 * Begin a new pushbuffer segment of up to 'spaceRequested' bytes: align the
 * put pointer (if the ring requires segment alignment), wrap it to 0 when the
 * request would run past the end of the buffer, and return the offset at
 * which the caller may write.
 */
NvU32 pushbufferStartPush(CUpushbuffer *pushbuffer, NvU32 spaceRequested)
{
    NvU32 put;

    CU_ASSERT(pushbuffer);
    CU_ASSERT(pushbuffer->memobj);

    put = pushbuffer->put;

    // Align the segment if applicable.
    if (pushbuffer->align) {
        put = ROUND_UP(put, pushbuffer->align);
    }

    // Wrap to the start when the request would run past the buffer end.
    if (put + spaceRequested > pushbuffer->size) {
        put = 0;
    }

    pushbuffer->put = put;
    CU_ASSERT(pushbufferHasSpace(pushbuffer, spaceRequested));
    return put;
}


/*
 * Consumer-side update: the GPU has finished everything up to
 * newGetPosition, so advance the get pointer to return that memory to the
 * ring. newGetPosition must be 4-byte aligned and within the buffer.
 */
void pushbufferSetGet(CUpushbuffer *pushbuffer, NvU32 newGetPosition)
{
    CU_ASSERT(pushbuffer);
    CU_ASSERT((newGetPosition % 4) == 0);
    CU_ASSERT(newGetPosition <= pushbuffer->size);

    pushbuffer->get = newGetPosition;

    // An empty ring is rewound to offset zero. This reduces fragmentation
    // and is relied upon by the GPFIFO overrun test.
    if (pushbuffer->put == pushbuffer->get) {
        pushbuffer->put = 0;
        pushbuffer->get = 0;
    }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值