Oerlap即边计算边传输,会节约时间。例子是:
// OpenCL Kernel Function Naive Implementation for hyptenuse
__kernel void VectorHypot(__global float4* fg4A, __global float4* fg4B, __global float4* fg4Hypot, unsigned int uiOffset, int iInnerLoopCount, unsigned int uiNumElements)
{
// get index into global data array
size_t szGlobalOffset = get_global_id(0) + uiOffset;
// bound check
if (szGlobalOffset >= uiNumElements)
{
return;
}
// Processing 4 elements per work item, so read fgA and fgB source values from GMEM
float4 f4A = fg4A[szGlobalOffset];
float4 f4B = fg4B[szGlobalOffset];
float4 f4H = (float4)0.0f;
// Get the hypotenuses the vectors of 'legs', but exaggerate the time needed with loop
for (int i = 0; i < iInnerLoopCount; i++)
{
// compute the 4 hypotenuses using built-in function
f4H.x = hypot (f4A.x, f4B.x);
f4H.y = hypot (f4A.y, f4B.y);
f4H.z = hypot (f4A.z, f4B.z);
f4H.w = hypot (f4A.w, f4B.w);
}
// Write 4 result values back out to GMEM
fg4Hypot[szGlobalOffset] = f4H;
}
main部分:
#include "oclUtils.h"
#include "shrQATest.h"
// Best possible and Min ratio of compute/copy overlap timing benefit to pass the test
// values greater than 0.0f represent a speed-up relative to non-overlapped
#define EXPECTED_OVERLAP 30.0f
#define EXPECTED_OVERLAP_FERMI 45.0f
#define PASS_FACTOR 0.60f
#define RETRIES_ON_FAILURE 1
// Base sizes for parameters manipulated dynamically or on the command line
#define BASE_WORK_ITEMS 64
#define BASE_ARRAY_LENGTH 40000
#define BASE_LOOP_COUNT 32
// Vars
// *********************************************************************
cl_platform_id cpPlatform; // OpenCL platform
cl_context cxGPUContext; // OpenCL context
cl_command_queue cqCommandQueue[2]; // OpenCL command queues
cl_device_id* cdDevices; // OpenCL device list
cl_program cpProgram; // OpenCL program
cl_kernel ckKernel[2]; // OpenCL kernel, 1 per queue
cl_mem cmPinnedSrcA; // OpenCL pinned host source buffer A
cl_mem cmPinnedSrcB; // OpenCL pinned host source buffer B
cl_mem cmPinnedResult; // OpenCL pinned host result buffer
float* fSourceA = NULL; // Mapped pointer for pinned Host source A buffer
float* fSourceB = NULL; // Mapped pointer for pinned Host source B buffer
float* fResult = NULL; // Mapped pointer for pinned Host result buffer
cl_mem cmDevSrcA; // OpenCL device source buffer A
cl_mem cmDevSrcB; // OpenCL device source buffer B
cl_mem cmDevResult; // OpenCL device result buffer
size_t szBuffBytes; // Size of main buffers
size_t szGlobalWorkSize; // 1D var for Total # of work items in the launched ND range
size_t szLocalWorkSize = BASE_WORK_ITEMS; // initial # of work items in the work group
cl_int ciErrNum; // Error code var
char* cPathAndName = NULL; // Var for full paths to data, src, etc.
char* cSourceCL = NULL; // Buffer to hold source for compilation
const char* cExecutableName = NULL;
// demo config vars
const char* cSourceFile = "VectorHypot.cl"; // OpenCL computation kernel source code
float* Golden = NULL; // temp buffer to hold golden results for cross check
bool bNoPrompt = false; // Command line switch to skip exit prompt
bool bQATest = false; // Command line switch to test
// Forward Declarations
// *********************************************************************
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig);
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitialLoopCount, int iCycles);
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount);
void Cleanup (int iExitCode);
void (*pCleanup)(int) = &Cleanup;
int *gp_argc = 0;
const char *** gp_argv = NULL;
// Main function
// *********************************************************************
int main(int argc, const char **argv)
{
//Locals
size_t szKernelLength; // Byte size of kernel code
double dBuildTime; // Compile time
cl_uint uiTargetDevice = 0; // Default Device to compute on
cl_uint uiNumDevsUsed = 1; // Number of devices used in this sample
cl_uint uiNumDevices; // Number of devices available
int iDevCap = -1; // Capability of device
int iInnerLoopCount = BASE_LOOP_COUNT; // Varies "compute intensity" per data within the kernel
const int iTestCycles = 10; // How many times to run the external test loop
const int iWarmupCycles = 8; // How many times to run the warmup sequence
cl_uint uiWorkGroupMultiple = 4; // Command line var (using "workgroupmult=<n>") to optionally increase workgroup size
cl_uint uiNumElements = BASE_ARRAY_LENGTH; // initial # of elements per array to process (note: procesing 4 per work item)
cl_uint uiSizeMultiple = 4; // Command line var (using "sizemult=<n>") to optionally increase vector sizes
bool bPassFlag = false; // Var to accumulate test pass/fail
shrBOOL bMatch = shrFALSE; // Cross check result
shrBOOL bTestOverlap = shrFALSE;
double dAvgGPUTime[2] = {0.0, 0.0}; // Average time of iTestCycles calls for 2-Queue and 1-Queue test
double dHostTime[2] = {0.0, 0.0}; // Host computation time (2nd test is redundant but a good stability indicator)
float fMinPassCriteria[2] = {0.0f, 0.0f}; // Test pass cireria, adjusted dependant on GPU arch
gp_argc = &argc;
gp_argv = &argv;
shrQAStart(argc, (char **)argv);
// start logs
cExecutableName = argv[0];
shrSetLogFileName ("oclCopyComputeOverlap.txt");
shrLog("%s Starting...\n\n", argv[0]);
// get basic command line args
bNoPrompt = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "noprompt"));
bQATest = (shrTRUE == shrCheckCmdLineFlag(argc, argv, "qatest"));
shrGetCmdLineArgumentu(argc, argv, "device", &uiTargetDevice);
// Optional Command-line multiplier for vector size
// Default val of 4 gives 10.24 million float elements per vector
// Range of 3 - 16 (7.68 to 40.96 million floats) is reasonable range (if system and GPU have enough memory)
shrGetCmdLineArgumentu(argc, argv, "sizemult", &uiSizeMultiple);
uiSizeMultiple = CLAMP(uiSizeMultiple, 1, 50);
uiNumElements = uiSizeMultiple * BASE_ARRAY_LENGTH * BASE_WORK_ITEMS;
shrLog("Array sizes = %u float elements\n", uiNumElements);
// Optional Command-line multiplier for workgroup size (x 64 work items)
// Default val of 4 gives szLocalWorkSize of 256.
// Range of 1 - 8 (resulting in workgroup sizes of 64 to 512) is reasonable range
shrGetCmdLineArgumentu(argc, argv, "workgroupmult", &uiWorkGroupMultiple);
uiWorkGroupMultiple = CLAMP(uiWorkGroupMultiple, 1, 10);
szLocalWorkSize = uiWorkGroupMultiple * BASE_WORK_ITEMS;
shrLog("Workgroup Size = %u\n\n", szLocalWorkSize);
// Get the NVIDIA platform if available, otherwise use default
shrLog("Get the Platform ID...\n\n");
ciErrNum = oclGetPlatformID(&cpPlatform);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Get OpenCL platform name and version
char cBuffer[256];
ciErrNum = clGetPlatformInfo (cpPlatform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("Platform Name = %s\n\n", cBuffer);
// Get all the devices
shrLog("Get the Device info and select Device...\n");
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cdDevices = (cl_device_id*)malloc(uiNumDevices * sizeof(cl_device_id));
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Set target device and check capabilities
shrLog(" # of Devices Available = %u\n", uiNumDevices);
uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1));
shrLog(" Using Device %u, ", uiTargetDevice);
oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]);
iDevCap = oclGetDevCap(cdDevices[uiTargetDevice]);
if (iDevCap > 0) {
shrLog(", Capability = %d.%d\n\n", iDevCap/10, iDevCap%10);
} else {
shrLog("\n\n", iDevCap);
}
if (strstr(cBuffer, "NVIDIA") != NULL)
{
if (iDevCap < 12)
{
shrLog("Device doesn't have overlap capability. Skipping test...\n");
Cleanup (EXIT_SUCCESS);
}
// Device and Platform eligible for overlap testing
bTestOverlap = shrTRUE;
// If device has overlap capability, proceed
fMinPassCriteria[0] = PASS_FACTOR * EXPECTED_OVERLAP; // 1st cycle overlap is same for 1 or 2 copy engines
if (iDevCap != 20)
{
// Single copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // avg of many cycles
}
else
{
char cDevName[1024];
clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_NAME, sizeof(cDevName), &cDevName, NULL);
if(strstr(cDevName, "Quadro")!=0 || strstr(cDevName, "Tesla")!=0)
{
// Tesla or Quadro (arch = 2.0) ... Dual copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP_FERMI; // average of many cycles
}
else
{
// Geforce ... Single copy engine
fMinPassCriteria[1] = PASS_FACTOR * EXPECTED_OVERLAP; // average of many cycles
}
}
}
// Create the context
shrLog("clCreateContext...\n");
cxGPUContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Create 2 command-queues
cqCommandQueue[0] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateCommandQueue [0]...\n");
cqCommandQueue[1] = clCreateCommandQueue(cxGPUContext, cdDevices[uiTargetDevice], 0, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateCommandQueue [1]...\n");
// Allocate the OpenCL source and result buffer memory objects on GPU device GMEM
szBuffBytes = sizeof(cl_float) * uiNumElements;
cmDevSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmDevResult = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x %u floats) ...\n", uiNumElements);
// Allocate pinned source and result host buffers:
// Note: Pinned (Page Locked) memory is needed for async host<->GPU memory copy operations ***
cmPinnedSrcA = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmPinnedSrcB = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
cmPinnedResult = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, szBuffBytes, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x %u floats)...\n\n", uiNumElements);
// Get mapped pointers to pinned input host buffers
// Note: This allows general (non-OpenCL) host functions to access pinned buffers using standard pointers
fSourceA = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcA, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
fSourceB = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedSrcB, CL_TRUE, CL_MAP_WRITE, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
fResult = (cl_float*)clEnqueueMapBuffer(cqCommandQueue[0], cmPinnedResult, CL_TRUE, CL_MAP_READ, 0, szBuffBytes, 0, NULL, NULL, &ciErrNum);
oclCheckErrorEX (ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...\n");
// Alloc temp golden buffer for cross checks
Golden = (float*)malloc(szBuffBytes);
oclCheckErrorEX(Golden != NULL, shrTRUE, pCleanup);
// Read the OpenCL kernel in from source file
cPathAndName = shrFindFilePath(cSourceFile, argv[0]);
oclCheckError(cPathAndName != NULL, shrTRUE);
cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("oclLoadProgSource (%s)...\n", cSourceFile);
// Create the program object
cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateProgramWithSource...\n");
// Build the program for the target device
clFinish(cqCommandQueue[0]);
shrDeltaT(0);
ciErrNum = clBuildProgram(cpProgram, uiNumDevsUsed, &cdDevices[uiTargetDevice], "-cl-fast-relaxed-math", NULL, NULL);
shrLog("clBuildProgram...");
if (ciErrNum != CL_SUCCESS)
{
// write out standard error, Build Log and PTX, then cleanup and exit
shrLogEx(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR);
oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext));
oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "VectorHypot.ptx");
Cleanup(EXIT_FAILURE);
}
dBuildTime = shrDeltaT(0);
// Create the kernel
ckKernel[0] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
ckKernel[1] = clCreateKernel(cpProgram, "VectorHypot", &ciErrNum);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clCreateKernel (ckKernel[2])...\n");
// Offsets for 2 queues
cl_uint uiOffset[2] = {0, uiNumElements / (2 * 4)};
// Set the Argument values for the 1st kernel instance (queue 0)
ciErrNum = clSetKernelArg(ckKernel[0], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel[0], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel[0], 2, sizeof(cl_mem), (void*)&cmDevResult);
ciErrNum |= clSetKernelArg(ckKernel[0], 3, sizeof(cl_uint), (void*)&uiOffset[0]);
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[0], 5, sizeof(cl_uint), (void*)&uiNumElements);
shrLog("clSetKernelArg ckKernel[0] args 0 - 5...\n");
// Set the Argument values for the 2d kernel instance (queue 1)
ciErrNum |= clSetKernelArg(ckKernel[1], 0, sizeof(cl_mem), (void*)&cmDevSrcA);
ciErrNum |= clSetKernelArg(ckKernel[1], 1, sizeof(cl_mem), (void*)&cmDevSrcB);
ciErrNum |= clSetKernelArg(ckKernel[1], 2, sizeof(cl_mem), (void*)&cmDevResult);
ciErrNum |= clSetKernelArg(ckKernel[1], 3, sizeof(cl_uint), (void*)&uiOffset[1]);
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iInnerLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[1], 5, sizeof(cl_uint), (void*)&uiNumElements);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
shrLog("clSetKernelArg ckKernel[1] args 0 - 5...\n\n");
//*******************************************
// Warmup the driver with dual queue sequence
//*******************************************
// Warmup with dual queue sequence for iTestCycles
shrLog("Warmup with 2-Queue sequence, %d cycles...\n", iWarmupCycles);
DualQueueSequence(iWarmupCycles, uiNumElements, false);
// Use single queue config to adjust compute intensity
shrLog("Adjust compute for GPU / system...\n");
iInnerLoopCount = AdjustCompute(cdDevices[uiTargetDevice], uiNumElements, iInnerLoopCount, iTestCycles);
shrLog(" Kernel inner loop count = %d\n", iInnerLoopCount);
//*******************************************
// Run and time with 2 command-queues
//*******************************************
for( int iRun =0; iRun <= RETRIES_ON_FAILURE; ++iRun ) {
// Run the sequence iTestCycles times
dAvgGPUTime[0] = DualQueueSequence(iTestCycles, uiNumElements, false);
// Warmup then Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
shrLog(" Device vs Host Result Comparison\t: ");
VectorHypotHost(fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
shrDeltaT(0);
for (int i = 0; i < iTestCycles; i++)
{
VectorHypotHost (fSourceA, fSourceB, Golden, uiNumElements, iInnerLoopCount);
}
dHostTime[0] = shrDeltaT(0)/iTestCycles;
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
bPassFlag = (bMatch == shrTRUE);
//*******************************************
// Run and time with 1 command queue
//*******************************************
// Run the sequence iTestCycles times
dAvgGPUTime[1] = OneQueueSequence(iTestCycles, uiNumElements, false);
// Compute on host iTestCycles times (using mapped standard pointer to pinned host cl_mem buffer)
shrLog(" Device vs Host Result Comparison\t: ");
shrDeltaT(0);
for (int i = 0; i < iTestCycles; i++)
{
VectorHypotHost(fSourceA, fSourceB, Golden, (int)uiNumElements, iInnerLoopCount);
}
dHostTime[1] = shrDeltaT(0)/iTestCycles;
// Compare host and GPU results (using mapped standard pointer to pinned host cl_mem buffer)
bMatch = shrComparefet(Golden, fResult, uiNumElements, 0.0f, 0);
shrLog("gpu %s cpu\n", (bMatch == shrTRUE) ? "MATCHES" : "DOESN'T MATCH");
bPassFlag &= (bMatch == shrTRUE);
//*******************************************
// Compare Single and Dual queue timing
shrLog("\nResult Summary:\n");
// Log GPU and CPU Time for 2-queue scenario
shrLog(" Avg GPU Elapsed Time for 2-Queues\t= %.5f s\n", dAvgGPUTime[0]);
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[0]);
// Log GPU and CPU Time for 1-queue scenario
shrLog(" Avg GPU Elapsed Time for 1-Queue\t= %.5f s\n", dAvgGPUTime[1]);
shrLog(" Avg Host Elapsed Time\t\t\t= %.5f s\n\n", dHostTime[1]);
// Log overlap % for GPU (comparison of 2-queue and 1 queue scenarios) and status
double dAvgOverlap = 100.0 * (1.0 - dAvgGPUTime[0]/dAvgGPUTime[1]);
if( bTestOverlap ) {
bool bAvgOverlapOK = (dAvgOverlap >= fMinPassCriteria[1]);
if( iRun == RETRIES_ON_FAILURE || bAvgOverlapOK ) {
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Measured Overlap is %s\n\n", dAvgOverlap, fMinPassCriteria[1], bAvgOverlapOK ? "Acceptable" : "NOT Acceptable");
// Log info to master log in standard format
shrLogEx(LOGBOTH | MASTER, 0, "oclCopyComputeOverlap-Avg, Throughput = %.4f OverlapPercent, Time = %.5f s, Size = %u Elements, NumDevsUsed = %u, Workgroup = %u\n",
dAvgOverlap, dAvgGPUTime[0], uiNumElements, uiNumDevsUsed, szLocalWorkSize);
bPassFlag &= bAvgOverlapOK;
break;
}
}
shrLog(" Measured and (Acceptable) Avg Overlap\t= %.1f %% (%.1f %%) -> Retry %d more time(s)...\n\n", dAvgOverlap, fMinPassCriteria[1], RETRIES_ON_FAILURE - iRun);
}
//*******************************************
// Report pass/fail, cleanup and exit
Cleanup (bPassFlag ? EXIT_SUCCESS : EXIT_FAILURE);
}
// Run 1 queue sequence for n cycles
// *********************************************************************
double OneQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
shrFillArray(fSourceA, (int)uiNumElements);
shrFillArray(fSourceB, (int)uiNumElements);
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
// *** Make sure queues are empty and then start timer
double dAvgTime = 0.0;
clFinish(cqCommandQueue[0]);
clFinish(cqCommandQueue[1]);
shrDeltaT(0);
// Run the sequence iCycles times
for (int i = 0; i < iCycles; i++)
{
// Nonblocking Write of all of input data from host to device in command-queue 0
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Launch kernel computation, command-queue 0
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Non Blocking Read of output data from device to host, command-queue 0
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szBuffBytes, (void*)&fResult[0], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Flush sequence to device (may not be necessary on Linux or WinXP or when using the NVIDIA Tesla Computing Cluster driver)
clFlush(cqCommandQueue[0]);
}
// *** Assure sync to host and return average sequence time
clFinish(cqCommandQueue[0]);
dAvgTime = shrDeltaT(0)/(double)iCycles;
// Log config if asked for
if (bShowConfig)
{
shrLog("\n1-Queue sequence Configuration:\n");
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 1\n",
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
}
return dAvgTime;
}
// Run 2 queue sequence for n cycles
// *********************************************************************
double DualQueueSequence(int iCycles, unsigned int uiNumElements, bool bShowConfig)
{
// Locals
size_t szHalfBuffer = szBuffBytes / 2;
size_t szHalfOffset = szHalfBuffer / sizeof(float);
double dAvgTime = 0.0;
// Use fresh source Data: (re)initialize pinned host array buffers (using mapped standard pointer to pinned host cl_mem buffer)
shrFillArray(fSourceA, (int)uiNumElements);
shrFillArray(fSourceB, (int)uiNumElements);
// Set Global work size for 2 command-queues, and log work sizes & dimensions
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/(2 * 4)));
// Make sure queues are empty and then start timer
clFinish(cqCommandQueue[0]);
clFinish(cqCommandQueue[1]);
shrDeltaT(0);
for (int i = 0; i < iCycles; i++)
{
// Mid Phase 0
// Nonblocking Write of 1st half of input data from host to device in command-queue 0
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceA[0], 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szHalfBuffer, (void*)&fSourceB[0], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Push out the write for queue 0 (and prior read from queue 1 at end of loop) to the driver
// (not necessary on Linux, Mac OSX or WinXP)
clFlush(cqCommandQueue[0]);
clFlush(cqCommandQueue[1]);
// Start Phase 1 ***********************************
// Launch kernel computation, command-queue 0
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Nonblocking Write of 2nd half of input data from host to device in command-queue 1
// (Note: The order MATTERS here on Fermi ! THE KERNEL IN THIS PHASE SHOULD BE LAUNCHED BEFORE THE WRITE)
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcA, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceA[szHalfOffset], 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[1], cmDevSrcB, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fSourceB[szHalfOffset], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Push out the compute for queue 0 and write for queue 1 to the driver
// (not necessary on Linux, Mac OSX or WinXP)
clFlush(cqCommandQueue[0]);
clFlush(cqCommandQueue[1]);
// Start Phase 2 ***********************************
// Launch kernel computation, command-queue 1
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[1], ckKernel[1], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
// Non Blocking Read of 1st half of output data from device to host, command-queue 0
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[0], cmDevResult, CL_FALSE, 0, szHalfBuffer, (void*)&fResult[0], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
// Push out the compute for queue 1 and the read for queue 0 to the driver
// (not necessary on Linux, Mac OSX or WinXP)
clFlush(cqCommandQueue[0]);
clFlush(cqCommandQueue[1]);
// Start Phase 0 (Rolls over) ***********************************
// Non Blocking Read of 2nd half of output data from device to host, command-queue 1
ciErrNum = clEnqueueReadBuffer(cqCommandQueue[1], cmDevResult, CL_FALSE, szHalfBuffer, szHalfBuffer, (void*)&fResult[szHalfOffset], 0, NULL, NULL);
shrCheckError(ciErrNum, CL_SUCCESS);
}
// *** Sync to host and get average sequence time
clFinish(cqCommandQueue[0]);
clFinish(cqCommandQueue[1]);
dAvgTime = shrDeltaT(0)/(double)iCycles;
// Log config if asked for
if (bShowConfig)
{
shrLog("\n2-Queue sequence Configuration:\n");
shrLog(" Global Work Size (per command-queue)\t= %u\n Local Work Size \t\t\t= %u\n # of Work Groups (per command-queue)\t= %u\n # of command-queues\t\t\t= 2\n",
szGlobalWorkSize, szLocalWorkSize, szGlobalWorkSize/szLocalWorkSize);
}
return dAvgTime;
}
// Function to adjust compute task according to device capability
// This allows a consistent overlap % across a wide variety of GPU's for test purposes
// It also implitly illustrates the relationship between compute capability and overlap at fixed work size
// *********************************************************************
int AdjustCompute(cl_device_id cdTargetDevice, unsigned int uiNumElements, int iInitLoopCount, int iCycles)
{
// Locals
double dCopyTime, dComputeTime;
int iComputedLoopCount;
// Change Source Data
shrFillArray(fSourceA, (int)uiNumElements);
shrFillArray(fSourceB, (int)uiNumElements);
// Reset Global work size for 1 command-queue, and log work sizes & dimensions
szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, (int)(uiNumElements/4));
// *** Make sure queues are empty and then start timer
clFinish(cqCommandQueue[0]);
clFinish(cqCommandQueue[1]);
shrDeltaT(0);
// Run the copy iCycles times and measure copy time on this system
for (int i = 0; i < iCycles; i++)
{
// Nonblocking Write of all of input data from host to device in command-queue 0
ciErrNum = clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcA, CL_FALSE, 0, szBuffBytes, (void*)&fSourceA[0], 0, NULL, NULL);
ciErrNum |= clEnqueueWriteBuffer(cqCommandQueue[0], cmDevSrcB, CL_FALSE, 0, szBuffBytes, (void*)&fSourceB[0], 0, NULL, NULL);
ciErrNum |= clFlush(cqCommandQueue[0]);
shrCheckError(ciErrNum, CL_SUCCESS);
}
clFinish(cqCommandQueue[0]);
dCopyTime = shrDeltaT(0);
// Run the compute iCycles times and measure compute time on this system
for (int i = 0; i < iCycles; i++)
{
// Launch kernel computation, command-queue 0
ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue[0], ckKernel[0], 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, NULL);
ciErrNum |= clFlush(cqCommandQueue[0]);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
}
clFinish(cqCommandQueue[0]);
dComputeTime = shrDeltaT(0);
// Determine number of core loop cycles proportional to copy/compute time ratio
dComputeTime = MAX(dComputeTime, 1.0e-6);
iComputedLoopCount = CLAMP(2, (int)((dCopyTime/dComputeTime) * (double)iInitLoopCount), (iInitLoopCount * 4));
ciErrNum |= clSetKernelArg(ckKernel[0], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
ciErrNum |= clSetKernelArg(ckKernel[1], 4, sizeof(cl_int), (void*)&iComputedLoopCount);
oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
return (iComputedLoopCount);
}
// Cleanup/Exit function
// *********************************************************************
void Cleanup (int iExitCode)
{
// Cleanup allocated objects
shrLog("Starting Cleanup...\n\n");
if(cPathAndName)free(cPathAndName);
if(cSourceCL)free(cSourceCL);
if(Golden)free(Golden);
if(ckKernel[0])clReleaseKernel(ckKernel[0]);
if(ckKernel[1])clReleaseKernel(ckKernel[1]);
if(cpProgram)clReleaseProgram(cpProgram);
if(fSourceA)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcA, (void*)fSourceA, 0, NULL, NULL);
if(fSourceB)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedSrcB, (void*)fSourceB, 0, NULL, NULL);
if(fResult)clEnqueueUnmapMemObject(cqCommandQueue[0], cmPinnedResult, (void*)fResult, 0, NULL, NULL);
if(cmDevSrcA)clReleaseMemObject(cmDevSrcA);
if(cmDevSrcB)clReleaseMemObject(cmDevSrcB);
if(cmDevResult)clReleaseMemObject(cmDevResult);
if(cmPinnedSrcA)clReleaseMemObject(cmPinnedSrcA);
if(cmPinnedSrcB)clReleaseMemObject(cmPinnedSrcB);
if(cmPinnedResult)clReleaseMemObject(cmPinnedResult);
if(cqCommandQueue[0])clReleaseCommandQueue(cqCommandQueue[0]);
if(cqCommandQueue[1])clReleaseCommandQueue(cqCommandQueue[1]);
if(cxGPUContext)clReleaseContext(cxGPUContext);
if(cdDevices)free(cdDevices);
// Master status Pass/Fail (all tests)
shrQAFinishExit( *gp_argc, (const char **)*gp_argv, (iExitCode == EXIT_SUCCESS) ? QA_PASSED : QA_FAILED );
}
// "Golden" Host processing vector hyptenuse function for comparison purposes
// *********************************************************************
void VectorHypotHost(const float* pfData1, const float* pfData2, float* pfResult, unsigned int uiNumElements, int iInnerLoopCount)
{
for (unsigned int i = 0; i < uiNumElements; i++)
{
float fA = pfData1[i];
float fB = pfData2[i];
float fC = sqrtf(fA * fA + fB * fB);
pfResult[i] = fC;
}
}
结果是:
Array sizes = 10240000 float elements
Workgroup Size = 256
Get the Platform ID...
Platform Name = NVIDIA CUDA
Get the Device info and select Device...
# of Devices Available = 1
Using Device 0, GeForce GTX 750 Ti, Capability = 5.0
clCreateContext...
clCreateCommandQueue [0]...
clCreateCommandQueue [1]...
clCreateBuffer (Src A, Src B and Result GPU Device GMEM, 3 x 10240000 floats) ...
clCreateBuffer (Src A, Src B and Result Pinned Host buffers, 3 x 10240000 floats)...
clEnqueueMapBuffer (Pointers to 3 pinned host buffers)...
oclLoadProgSource (VectorHypot.cl)...
clCreateProgramWithSource...
clBuildProgram...clCreateKernel (ckKernel[2])...
clSetKernelArg ckKernel[0] args 0 - 5...
clSetKernelArg ckKernel[1] args 0 - 5...
Warmup with 2-Queue sequence, 8 cycles...
Adjust compute for GPU / system...
Kernel inner loop count = 2
Device vs Host Result Comparison : gpu MATCHES cpu
Device vs Host Result Comparison : gpu MATCHES cpu
Result Summary:
Avg GPU Elapsed Time for 2-Queues = 0.00000 s
Avg Host Elapsed Time = 0.00000 s
Avg GPU Elapsed Time for 1-Queue = 0.00000 s
Avg Host Elapsed Time = 0.00000 s
Measured and (Acceptable) Avg Overlap = 0.0 % (18.0 %) -> Retry 1 more time(s)...
Device vs Host Result Comparison : gpu MATCHES cpu
Device vs Host Result Comparison : gpu MATCHES cpu
Result Summary:
Avg GPU Elapsed Time for 2-Queues = 0.00000 s
Avg Host Elapsed Time = 0.00000 s
Avg GPU Elapsed Time for 1-Queue = 0.00000 s
Avg Host Elapsed Time = 0.00000 s
Measured and (Acceptable) Avg Overlap = 0.0 % (18.0 %) -> Measured Overlap is NOT Acceptable
oclCopyComputeOverlap-Avg, Throughput = 0.0000 OverlapPercent, Time = 0.00000 s, Size = 10240000 Elements, NumDevsUsed = 1, Workgroup = 256
Starting Cleanup...
但我的计算时间这里总是无法正确计算?之前学过的所有例子里这种时间计算方式计算出来总是0,所以我自己时不用这种时间计算方式的!
乌龙:
本来应该是这样 然后我写成了下面这样:居然编译通过
即使加上void 后运行结果也不正确 原因在cl_int作为索引!!!!!!要用int 而不是cl_int 不然这个就是实质是没有执行 但运行也不会报错!而是结果就是错的!!!找了半天!