AMD-SDK的学习[6]---BlackScholes与BlackScholesDP

最新推荐文章于 2024-03-16 18:07:20 发布

元气少女缘结神

最新推荐文章于 2024-03-16 18:07:20 发布

阅读量553

点赞数

分类专栏： Improve Performance 文章标签： OpenCL

本文链接：https://blog.csdn.net/wd1603926823/article/details/78435298

版权

Improve Performance 专栏收录该内容

39 篇文章 9 订阅

订阅专栏

这个是关于什么期货公式的，因为我看了下没有什么依赖性，的确可以改成并行。AMD还真是兴趣广泛，无论是学术上的知识还是商业上的公式都搜罗来写成OpenCL形式，不错。

依旧改成我习惯看的形式：

#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <stdlib.h>
#include <string.h>
#include <malloc.h>
#include <stdio.h>
#include "n_needed_headers/oclUtils.h"
#include "a_needed_headers/SDKCommon.hpp"
using namespace std;
#define GROUP_SIZE 256
//  Constants
#define S_LOWER_LIMIT 10.0f
#define S_UPPER_LIMIT 100.0f
#define K_LOWER_LIMIT 10.0f
#define K_UPPER_LIMIT 100.0f
#define T_LOWER_LIMIT 1.0f
#define T_UPPER_LIMIT 10.0f
#define R_LOWER_LIMIT 0.01f
#define R_UPPER_LIMIT 0.05f
#define SIGMA_LOWER_LIMIT 0.01f
#define SIGMA_UPPER_LIMIT 0.10f
float phi(float X);

int main()
{
	//set up OpenCL...
	cl_uint platformNum;
	cl_int status;
	status=clGetPlatformIDs(0,NULL,&platformNum);
	if(status!=CL_SUCCESS){
		printf("cannot get platforms number.\n");
		return -1;
	}
	cl_platform_id* platforms;
	platforms=(cl_platform_id*)alloca(sizeof(cl_platform_id)*platformNum);
	status=clGetPlatformIDs(platformNum,platforms,NULL);
	if(status!=CL_SUCCESS){
		printf("cannot get platforms addresses.\n");
		return -1;
	}
	cl_platform_id platformInUse=platforms[0];
	cl_device_id device;
	status=clGetDeviceIDs(platformInUse,CL_DEVICE_TYPE_DEFAULT,1,&device,NULL);
	cl_context context=clCreateContext(NULL,1,&device,NULL,NULL,&status);
	cl_command_queue_properties prop=0; //CL_QUEUE_PROFILING_ENABLE;
	cl_command_queue_properties *propers;
	propers=∝
	cl_command_queue commandQueue=clCreateCommandQueueWithProperties(context,device,propers, &status);
	std::ifstream srcFile("/home/jumper/OpenCL_projects/AMD-Sample-BlackScholes/BlackScholes_Kernels.cl");
	std::string srcProg(std::istreambuf_iterator<char>(srcFile),(std::istreambuf_iterator<char>()));
	const char * src = srcProg.c_str();
	size_t srclength = srcProg.length();
	cl_program program=clCreateProgramWithSource(context,1,&src,&srclength,&status);
	status=clBuildProgram(program,1,&device,NULL,NULL,&status);
	if (status != CL_SUCCESS)
	 {
		 cout<<"error:Build BasicDebug_Kernel()..."<<endl;
		 shrLogEx(LOGBOTH | ERRORMSG, status, STDERROR);
		 oclLogBuildInfo(program, oclGetFirstDev(context));
		 oclLogPtx(program, oclGetFirstDev(context), "ocldiary.ptx");
		 return(EXIT_FAILURE);
	 }

	//prepare host data
	cl_int samples(256 * 256 * 4);
	size_t blockSizeX(1), blockSizeY(1);
	cl_int width = 64,height = 64;
	// Calculate width and height from samples
	samples = samples / 4;
	samples = (samples / GROUP_SIZE)? (samples / GROUP_SIZE) * GROUP_SIZE:GROUP_SIZE;
	unsigned int tempVar1 = (unsigned int)sqrt((double)samples);
	tempVar1 = (tempVar1 / GROUP_SIZE)? (tempVar1 / GROUP_SIZE) * GROUP_SIZE:GROUP_SIZE;
	samples = tempVar1 * tempVar1;
	width = tempVar1;
	height = width;
	cl_float* randArray = (cl_float*)memalign(16, width * height * sizeof(cl_float4));//can exchange between cl_float4 and float freely....
	for(int i = 0; i < width * height * 4; i++)
	{
		randArray[i] = (float)rand() / (float)RAND_MAX;
	}
	cl_float* deviceCallPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));
	memset(deviceCallPrice, 0, width * height * sizeof(cl_float4));
	cl_float* devicePutPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));
	memset(devicePutPrice, 0, width * height * sizeof(cl_float4));

	//prepare gpu data
	cl_mem randBuf = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_USE_PERSISTENT_MEM_AMD,sizeof(cl_float4) * width  * height,NULL, &status);
	CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (randBuf)");
	cl_mem callPriceBuf = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(cl_float4) * width * height,NULL,&status);
	CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (callPriceBuf)");
	cl_mem putPriceBuf = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(cl_float4) * width * height,NULL, &status);
	CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (putPriceBuf)");

	size_t globalThreads[2] = {width, height};
	size_t localThreads[2] = {blockSizeX, blockSizeY};
	cl_kernel kernel;
	bool useScalarKernel=false; //or true...
	if (useScalarKernel)
	{
		kernel = clCreateKernel(program, "blackScholes_scalar", &status);
		globalThreads[0]*=4;
	}
	else
	{
		kernel = clCreateKernel(program, "blackScholes", &status);
	}
	CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(kernel)");


	cl_event inMapEvt;
	void* mapPtr = clEnqueueMapBuffer(commandQueue,randBuf,CL_FALSE,CL_MAP_WRITE,0, sizeof(cl_float4) * width  * height,0,NULL,&inMapEvt,&status);
	CHECK_OPENCL_ERROR(status, "clEnqueueMapBuffer failed. (mapPtr)");
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush(commandQueue) failed.");
	status=clWaitForEvents(1,&inMapEvt);
	status = clReleaseEvent(inMapEvt);
	CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(inMapEvt) Failed");
	memcpy(mapPtr, randArray, sizeof(cl_float4) * width  * height);

	cl_event unmapEvent;
	status = clEnqueueUnmapMemObject(commandQueue,randBuf,mapPtr,0,NULL, &unmapEvent);
	CHECK_OPENCL_ERROR(status, "clEnqueueUnmapMemObject failed. (randBuf)");
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush failed. (randBuf)");
	status=clWaitForEvents(1,&unmapEvent);
	status = clReleaseEvent(unmapEvent);
	CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(unmapEvent) Failed");

	// whether sort is to be in increasing order. CL_TRUE implies increasing
	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&randBuf);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (randBuf)");
	int rowStride = useScalarKernel?width*4:width;
	status = clSetKernelArg(kernel, 1, sizeof(rowStride), (const void *)&rowStride);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (width)");
	status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&callPriceBuf);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (callPriceBuf)");
	status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&putPriceBuf);
	CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (putPriceBuf)");
	// Enqueue a kernel run call.
	status = clEnqueueNDRangeKernel(commandQueue,kernel,2,NULL,globalThreads,localThreads,0,NULL,NULL);
	CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.");
	// wait for the kernel call to finish execution
	status = clFinish(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFinish failed.");

	cl_event callEvent;
	cl_event putEvent;
	// Enqueue the results to application pointer
	status = clEnqueueReadBuffer(commandQueue,callPriceBuf,CL_FALSE,0, width * height * sizeof(cl_float4), deviceCallPrice,0,NULL,&callEvent);
	CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");
	// Enqueue the results to application pointer
	status = clEnqueueReadBuffer(commandQueue,putPriceBuf,CL_FALSE,0,width * height * sizeof(cl_float4),devicePutPrice,0, NULL,&putEvent);
	CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed.");
	status = clFlush(commandQueue);
	CHECK_OPENCL_ERROR(status, "clFlush(commanQueue) failed.");
	status=clWaitForEvents(1,&callEvent);
	status = clReleaseEvent(callEvent);
	CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(callEvent) Failed");
	status=clWaitForEvents(1,&putEvent);
	status = clReleaseEvent(putEvent);
	CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(putEvent) Failed");

	/CPU result....
	cl_float* hostCallPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));
	memset(hostCallPrice, 0, width * height * sizeof(cl_float4));
	cl_float* hostPutPrice = (cl_float*)malloc(width * height * sizeof(cl_float4));
	memset(hostPutPrice, 0, width * height * sizeof(cl_float4));
	int y;
	for (y = 0; y < width * height * 4; ++y)
	{
		float d1, d2;
		float sigmaSqrtT;
		float KexpMinusRT;
		float s = S_LOWER_LIMIT * randArray[y] + S_UPPER_LIMIT * (1.0f - randArray[y]);
		float k = K_LOWER_LIMIT * randArray[y] + K_UPPER_LIMIT * (1.0f - randArray[y]);
		float t = T_LOWER_LIMIT * randArray[y] + T_UPPER_LIMIT * (1.0f - randArray[y]);
		float r = R_LOWER_LIMIT * randArray[y] + R_UPPER_LIMIT * (1.0f - randArray[y]);
		float sigma = SIGMA_LOWER_LIMIT * randArray[y] + SIGMA_UPPER_LIMIT *(1.0f - randArray[y]);
		sigmaSqrtT = sigma * sqrt(t);
		d1 = (log(s / k) + (r + sigma * sigma / 2.0f) * t) / sigmaSqrtT;
		d2 = d1 - sigmaSqrtT;
		KexpMinusRT = k * exp(-r * t);
		hostCallPrice[y] = s * phi(d1) - KexpMinusRT * phi(d2);
		hostPutPrice[y]  = KexpMinusRT * phi(-d2) - s * phi(-d1);
	}
	//print CPU & GPU result...
	for (y = 0; y < width * height * 4; ++y)
	{
			printf("CallPrice:CPU-result---%f  GPU-result---%f  PutPrice:CPU-result---%f  GPU-result---%f\n",
					hostCallPrice[y],deviceCallPrice[y],hostPutPrice[y],devicePutPrice[y]);
	}

	status = clReleaseKernel(kernel);
	CHECK_OPENCL_ERROR(status, "clReleaseKernel failed.");
	status = clReleaseProgram(program);
	CHECK_OPENCL_ERROR(status, "clReleaseProgram failed.");
	status = clReleaseMemObject(randBuf);
	CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");
	status = clReleaseMemObject(callPriceBuf);
	CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");
	status = clReleaseMemObject(putPriceBuf);
	CHECK_OPENCL_ERROR(status, "clReleaseMemObject failed.");
	status = clReleaseCommandQueue(commandQueue);
	CHECK_OPENCL_ERROR(status, "clReleaseCommandQueue failed.");
	status = clReleaseContext(context);
	CHECK_OPENCL_ERROR(status, "clReleaseContext failed.");
	status=clReleaseDevice(device);
	FREE(randArray);
	FREE(deviceCallPrice);
	FREE(devicePutPrice);
	FREE(hostCallPrice);
	FREE(hostPutPrice);

	return 0;
}

float phi(float X)
{
    float y, absX, t;
    // the coeffs
    const float c1 =  0.319381530f;
    const float c2 = -0.356563782f;
    const float c3 =  1.781477937f;
    const float c4 = -1.821255978f;
    const float c5 =  1.330274429f;
    const float oneBySqrt2pi = 0.398942280f;
    absX = fabs(X);
    t = 1.0f / (1.0f + 0.2316419f * absX);
    y = 1.0f - oneBySqrt2pi * exp(-X * X / 2.0f) *t * (c1 + t * (c2 +t * (c3 +t * (c4 + t * c5))));
    return (X < 0) ? (1.0f - y) : y;
}

cl部分我没有改动：

#define S_LOWER_LIMIT 10.0f
#define S_UPPER_LIMIT 100.0f
#define K_LOWER_LIMIT 10.0f
#define K_UPPER_LIMIT 100.0f
#define T_LOWER_LIMIT 1.0f
#define T_UPPER_LIMIT 10.0f
#define R_LOWER_LIMIT 0.01f
#define R_UPPER_LIMIT 0.05f
#define SIGMA_LOWER_LIMIT 0.01f
#define SIGMA_UPPER_LIMIT 0.10f

/**
 * @brief   Abromowitz Stegun approxmimation for PHI (Cumulative Normal Distribution Function)
 * @param   X input value
 * @param   phi pointer to store calculated CND of X
 */
void phi(float4 X, float4* phi)
{
    float4 y;
    float4 absX;
    float4 t;
    float4 result;

    const float4 c1 = (float4)0.319381530f;
    const float4 c2 = (float4)-0.356563782f;
    const float4 c3 = (float4)1.781477937f;
    const float4 c4 = (float4)-1.821255978f;
    const float4 c5 = (float4)1.330274429f;
    const float4 zero = (float4)0.0f;
    const float4 one = (float4)1.0f;
    const float4 two = (float4)2.0f;
    const float4 temp4 = (float4)0.2316419f;
    const float4 oneBySqrt2pi = (float4)0.398942280f;
    absX = fabs(X);
    t = one/(one + temp4 * absX);
    y = one - oneBySqrt2pi * exp(-X*X/two) * t* (c1 + t* (c2 + t* (c3 + t* (c4 + t * c5))));
    result = (X < zero)? (one - y) : y;

    *phi = result;
}

/*
 * @brief   Calculates the call and put prices by using Black Scholes model
 * @param   s       Array of random values of current option price
 * @param   sigma   Array of random values sigma
 * @param   k       Array of random values strike price
 * @param   t       Array of random values of expiration time
 * @param   r       Array of random values of risk free interest rate
 * @param   width   Width of call price or put price array
 * @param   call    Array of calculated call price values
 * @param   put     Array of calculated put price values
 */
__kernel 
void
blackScholes(const __global float4 *randArray,
             int width,
             __global float4 *call,
             __global float4 *put)
{
    float4 d1, d2;
    float4 phiD1, phiD2;
    float4 sigmaSqrtT;
    float4 KexpMinusRT;
    
    size_t xPos = get_global_id(0);
    size_t yPos = get_global_id(1);
    float4 two = (float4)2.0f;
    float4 inRand = randArray[yPos * width + xPos];
    float4 S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand);
    float4 K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand);
    float4 T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand);
    float4 R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand);
    float4 sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand);

    sigmaSqrtT = sigmaVal * sqrt(T);

    d1 = (log(S/K) + (R + sigmaVal * sigmaVal / two)* T)/ sigmaSqrtT;
    d2 = d1 - sigmaSqrtT;

    KexpMinusRT = K * exp(-R * T);
    phi(d1, &phiD1);
    phi(d2, &phiD2);
    call[yPos * width + xPos] = S * phiD1 - KexpMinusRT * phiD2;
    phi(-d1, &phiD1);
    phi(-d2, &phiD2);
    put[yPos * width + xPos]  = KexpMinusRT * phiD2 - S * phiD1;
}

void phi_scalar(float X, float* phi)
{
    float y;
    float absX;
    float t;
    float result;

    const float c1 = (float)0.319381530f;
    const float c2 = (float)-0.356563782f;
    const float c3 = (float)1.781477937f;
    const float c4 = (float)-1.821255978f;
    const float c5 = (float)1.330274429f;
    const float zero = (float)0.0f;
    const float one = (float)1.0f;
    const float two = (float)2.0f;
    const float temp4 = (float)0.2316419f;
    const float oneBySqrt2pi = (float)0.398942280f;
    absX = fabs(X);
    t = one/(one + temp4 * absX);
    y = one - oneBySqrt2pi * exp(-X*X/two) * t * (c1 + t* (c2 + t* (c3 + t* (c4 + t * c5))));
    result = (X < zero)? (one - y) : y;

    *phi = result;
}

/*
 * @brief   Calculates the call and put prices by using Black Scholes model
 * @param   s       Array of random values of current option price
 * @param   sigma   Array of random values sigma
 * @param   k       Array of random values strike price
 * @param   t       Array of random values of expiration time
 * @param   r       Array of random values of risk free interest rate
 * @param   width   Width of call price or put price array
 * @param   call    Array of calculated call price values
 * @param   put     Array of calculated put price values
 */
__kernel 
void
blackScholes_scalar(const __global float *randArray,
             int width,
             __global float *call,
             __global float *put)
{
    float d1, d2;
    float phiD1, phiD2;
    float sigmaSqrtT;
    float KexpMinusRT;
    
    size_t xPos = get_global_id(0);
    size_t yPos = get_global_id(1);
    float two = (float)2.0f;
    float inRand = randArray[yPos * width + xPos];
    float S = S_LOWER_LIMIT * inRand + S_UPPER_LIMIT * (1.0f - inRand);
    float K = K_LOWER_LIMIT * inRand + K_UPPER_LIMIT * (1.0f - inRand);
    float T = T_LOWER_LIMIT * inRand + T_UPPER_LIMIT * (1.0f - inRand);
    float R = R_LOWER_LIMIT * inRand + R_UPPER_LIMIT * (1.0f - inRand);
    float sigmaVal = SIGMA_LOWER_LIMIT * inRand + SIGMA_UPPER_LIMIT * (1.0f - inRand);

    sigmaSqrtT = sigmaVal * sqrt(T);

    d1 = (log(S/K) + (R + sigmaVal * sigmaVal / two)* T)/ sigmaSqrtT;
    d2 = d1 - sigmaSqrtT;

    KexpMinusRT = K * exp(-R * T);
    phi_scalar(d1, &phiD1);
    phi_scalar(d2, &phiD2);
    call[yPos * width + xPos] = S * phiD1 - KexpMinusRT * phiD2;
    phi_scalar(-d1, &phiD1);
    phi_scalar(-d2, &phiD2);
    put[yPos * width + xPos]  = KexpMinusRT * phiD2 - S * phiD1;
}

两种情况都运行了一下，CPU与GPU结果都是一致。至于这种期货什么的公式我不想说什么，这两个kernel的差别就是一个用矢量float4；一个用标量float；我用CodeXL看了事实证明同样条件下，矢量更快！！！

看标量的kernel运行时间是12.3526ms；但矢量的只需要10.9099ms！！！我习惯用标量的习惯看来得改过来。。。
这个例子很简单，不多说什么。

我需要学习的部分：

1、依旧是使用memalign()分配内存，可以在矢量和对应标量之间随意切换，并且是对齐的！

2、CL_MEM_USE_PERSISTENT_MEM_AMD的巨大优点我依旧还不太明白，因为显然还是需要那么大的randArray（host端）和randBuf（gpu端）啊？

于是我按照例子写上CL_MEM_USE_PERSISTENT_MEM_AMD 和去掉CL_MEM_USE_PERSISTENT_MEM_AMD 进行了一次对比：

看这是例子中有CL_MEM_USE_PERSISTENT_MEM_AMD 标志的，DataTransfer这一栏的紫色很少，而且Device Time没有；而下面当我去掉CL_MEM_USE_PERSISTENT_MEM_AMD标志时，Data Transfer的紫色多了，而且出现了Device Time为0.6416ms！！！

这就是这个flag的优点吧：即GPU上没有真正发生dataTransfer ？？？！

大神说：依然会传输的, 但如同zero-copy, 内存后备的buffer, 总是能让GPU的指令执行和回传overlap一样,此种显存, 能让你的CPU在执行指令的时候同时传输.这样前者分别能进行的同时计算和回传并行 + 同时减少一次无辜的显存写入和一次无辜的显存读取; 后者能让CPU的运算和正向传输并行 + 同时减少内存的写入和读取,所以这两个会快.
需要说明的是, 前者的第二部分因素往往被人无视, 而只说前者的第一部分(GPU的指令运算和回传并行overlap), 这是因为前者的第二部分往往不会造成瓶颈(相比PCI-E和内存, 显存吞吐率太高了) 但在CPU, 你应当同时考虑这两个因素.

3、注意到例子将数据传回host显示是用的ReadBuffer（肯定有device Time）而不是MapBuffer，我以为MapBuffer不会有device Time，于是传回时我也改成MapBuffer，时间竟然比ReadBuffer多很多？！看来之前memory flag的那9个flags的区别联系我还没真正弄懂？！

****************************************************************************************************

至于工程BlackScholesDP与上面类似，只是将float换成了double而已，但差别大大的：

57ms！！！当然毕竟是2MB咯...