IMX8M系列 OpenCL FFT 示例编译测试及其他demo测试(MYD-JX8MX)
上一篇文章已经将如何编译镜像,如何用官方的方式FslBuild.py 脚本编译demo。不知道有没有成功的朋友,如果你成功了,那么可以依然用这种方式,如果你没有成功,那么你可以参考接下来的方式来编译你的demo
导出文件系统
上一篇文章有描述如何导出文件系统,这里的目的是为了提供编译所需的头文件和库文件等
具体方法如下:
#进入源码root目录
cd fsl-release-yocto
#执行设置环境变量的脚本
. ./setup-environment build-xwayland
#编译工具链
bitbake meta-toolchain
#安装工具链
cd ~/fsl-release-yocto/build-xwayland/tmp/deploy/sdk
./fsl-imx-xwayland-glibc-x86_64-meta-toolchain-aarch64-toolchain-4.9.88-2.0.0.sh
./opt/fsl-imx-xwayland/4.19-warrior/environment-setup-aarch64-poky-linux
#提取文件系统
runqemu-extract-sdk ~/fsl-release-yocto/build-xwayland/tmp/deploy/images/imx8mqevk/fsl-image-qt5-validation-imx-imx8mqevk-20210809020904.rootfs.tar.bz2 ~/imx8mqevk-rootfs
提取好的文件系统在~/imx8mqevk-rootfs 目录下
具体内容如下:
该文件系统时利用交叉编译方式编译出来的,有我们所需的头文件和库等。
编译FFT
这里我利用gtec-demo-frameworkNXP提供的示例来单独编译FFT用于简单的测试其GPU的性能
main.cpp
#include "clutil.h"
int
main(
const int argc,
const char* argv[]
)
{
if (argc < 2)
{
printf("Usage: %s fftlen \n", argv[0]);
return -1;
}
const unsigned len = atoi(argv[1]);
if (len > FFT_MAX)
{
printf("FFT length cannot be greater than %d.\n", FFT_MAX);
return -1;
}
if (len < 16)
{
printf("FFT length has to at least be 16.\n");
return -1;
}
if ((len != 1) && (len & (len - 1)))
{
printf("FFT length (%d) must be a power-of-2.\n", len);
return -1;
}
printf("Block size: %d \n", blockSize);
printf("Print result: %s \n", print ? "yes" : "no");
int result = 0;
result = runFFT(len);
if (result == 0)
{
printf("Successful.\n");
if (print) printResult(len);
}
else
{
printf("Failed.\n");
}
cleanup();
}
fft.cpp
#include "clutil.h"
#include <time.h>
static unsigned workOffset;
static unsigned workSize;
static int p[FFT_MAX_LOG2N] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768};
static int twop[FFT_MAX_LOG2N] = {2*1, 2*2, 2*4, 2*8, 2*16, 2*32, 2*64, 2*128, 2*256, 2*512, 2*1024, 2*2048, 2*4096, 2*8192, 2*16384, 2*32768};
static int threep[FFT_MAX_LOG2N] = {3*1, 3*2 ,3*4, 3*8, 3*16, 3*32, 3*64, 3*128, 3*256, 3*512, 3*1024, 3*2048, 3*4096, 3*8192, 3*16384, 3*32768};
static int pminus1[FFT_MAX_LOG2N] = {1-1, 2-1, 4-1, 8-1, 16-1, 32-1, 64-1, 128-1, 256-1, 512-1, 1024-1, 2048-1, 4096-1, 8192-1, 16384-1, 32768-1};
#ifndef M_PI
#define M_PI 3.14159265358979f
#endif
static cl_float minusPIoverp[FFT_MAX_LOG2N] = { -M_PI, -M_PI/2.f, -M_PI/4.f, -M_PI/ 8.f, -M_PI/16.f, -M_PI/32.f, -M_PI/ 64.f, -M_PI/128.f, -M_PI/256.f, -M_PI/ 512.f, -M_PI/1024.f, -M_PI/2048.f, -M_PI/4096.f, -M_PI/ 8192.f, -M_PI/16384.f, -M_PI/32768.f};
static cl_float minusPIover2p[FFT_MAX_LOG2N] = { -M_PI/2.f, -M_PI/4.f, -M_PI/8.f, -M_PI/16.f, -M_PI/32.f, -M_PI/64.f, -M_PI/128.f, -M_PI/256.f, -M_PI/512.f, -M_PI/1024.f, -M_PI/2048.f, -M_PI/4096.f, -M_PI/8192.f, -M_PI/16384.f, -M_PI/32768.f, -M_PI/65536.f};
static cl_float minusPIover2p_2x[FFT_MAX_LOG2N] = {-2.f*M_PI/2.f, -2.f*M_PI/4.f, -2.f*M_PI/8.f, -2.f*M_PI/16.f, -2.f*M_PI/32.f, -2.f*M_PI/64.f, -2.f*M_PI/128.f, -2.f*M_PI/256.f, -2.f*M_PI/512.f, -2.f*M_PI/1024.f, -2.f*M_PI/2048.f, -2.f*M_PI/4096.f, -2.f*M_PI/8192.f, -2.f*M_PI/16384.f, -2.f*M_PI/32768.f, -2.f*M_PI/65536.f};
static cl_float minusPIover2p_3x[FFT_MAX_LOG2N] = {-3.f*M_PI/2.f, -3.f*M_PI/4.f, -3.f*M_PI/8.f, -3.f*M_PI/16.f, -3.f*M_PI/32.f, -3.f*M_PI/64.f, -3.f*M_PI/128.f, -3.f*M_PI/256.f, -3.f*M_PI/512.f, -3.f*M_PI/1024.f, -3.f*M_PI/2048.f, -3.f*M_PI/4096.f, -3.f*M_PI/8192.f, -3.f*M_PI/16384.f, -3.f*M_PI/32768.f, -3.f*M_PI/65536.f};
static int
radix(
int N
)
{
int i = 0, j = 0;
for (; i <= 31; i++)
{
if ((N & (1 << i)) == 0)
{
j++;
}
else
{
break;
}
}
return (0 == (j%2)) ? 4 : 2;
}
static unsigned int
log2NFFT(
unsigned int size
)
{
unsigned int v = size;
unsigned int log2n = 0;
while (v >>= 1)
{
log2n++;
}
return log2n;
}
#define RADIX2_FFT_KERNEL "fft_radix2"
#define RADIX4_FFT_KERNEL "fft_radix4"
static void
FFTGpu(
const unsigned len
)
{
if (len == 0)
{
return;
}
// figure out if we can use a radix-4 FFT : otherwise radix-2
int rad = radix(len);
if (4==rad && ((16==len) || (256==len) || (4096==len) || (65536==len) || (1048576 == len) ))
rad = 2;
// log2(n) is the # of kernels that will be invoked (for a radix-2 FFT)
unsigned int log2n = log2NFFT(len);
printf("log2(fft size) = log2(%d)=%d\n", len, log2n);
printf("Compiling radix-%d FFT Program for GPU...\n", rad);
compileProgram("fft.cl");
printf("creating radix-%d kernels...\n", rad);
if (2 == rad)
{
for (unsigned kk = 0; kk < log2n; kk++)
{
printf("Creating kernel %s %d (p=%d)...\n", RADIX2_FFT_KERNEL, kk, p[kk]);
createFFTKernel(RADIX2_FFT_KERNEL, kk);
}
}
else
{ // radix-4
for (unsigned kk = 0; kk < log2n; kk+=2)
{
printf("Creating kernel %s %d...\n", RADIX4_FFT_KERNEL, kk>>1);
createFFTKernel(RADIX4_FFT_KERNEL, kk>>1);
}
}
workSize = len;
allocateDeviceMemory(workSize, workOffset);
if (2 == rad)
{
// FFT kernel invoked for p=1, p=2, ..., p=n/2
// input and output swapped each time
for (unsigned kk = 0; kk < log2n; kk++)
{
void *in = (0 == (kk&1)) ? &d_intime : &d_outfft;
void *out = (0 == (kk&1)) ? &d_outfft : &d_intime;
printf("Setting kernel args for kernel %d (p=%d)...\n", kk, p[kk]);
clSetKernelArg(kernels[kk], 0, sizeof(cl_mem), in);
clSetKernelArg(kernels[kk], 1, sizeof(cl_mem), out);
clSetKernelArg(kernels[kk], 2, sizeof(unsigned), &p[kk]);
clSetKernelArg(kernels[kk], 3, sizeof(unsigned), &pminus1[kk]);
clSetKernelArg(kernels[kk], 4, sizeof(cl_float), &minusPIoverp[kk]);
} // end (for 1,2,4,8,...N/2)
}
else
{
// radix-4, FFT kernel invoked for p=1, p=4, ..., p=n/4
for (unsigned kk = 0; kk < log2n; kk+=2)
{
int idx = kk>>1;
void *in = (0 == (idx&1)) ? &d_intime : &d_outfft;
void *out = (0 == (idx&1)) ? &d_outfft : &d_intime;
printf("Setting kernel args for kernel %d (p=%d)...\n", idx, p[kk]);
clSetKernelArg(kernels[idx], 0, sizeof(cl_mem), in);
clSetKernelArg(kernels[idx], 1, sizeof(cl_mem), out);
clSetKernelArg(kernels[idx], 2, sizeof(unsigned), &p[kk]);
clSetKernelArg(kernels[idx], 3, sizeof(unsigned), &pminus1[kk]);
clSetKernelArg(kernels[idx], 4, sizeof(unsigned), &twop[kk]);
clSetKernelArg(kernels[idx], 5, sizeof(unsigned), &threep[kk]);
clSetKernelArg(kernels[idx], 6, sizeof(cl_float), &minusPIover2p[kk]);
clSetKernelArg(kernels[idx], 7, sizeof(cl_float), &minusPIover2p_2x[kk]);
clSetKernelArg(kernels[idx], 8, sizeof(cl_float), &minusPIover2p_3x[kk]);
} // end (for 1,4,16,...,N/4)
} // end (if radix-2 or radix-4)
size_t globalWorkSize[] = { (2==rad) ? (1<<(log2n-1)) : (len>>2) };
size_t localWorkSize[] = { (blockSize <= globalWorkSize[0]) ? blockSize : globalWorkSize[0] };
cl_int ciErrNum = 0;
cl_mem d_result;
clock_t start,end1,end2,end3;
start = clock();
Cl_finish();
if (2==rad)
{
for (unsigned kk = 0; kk < log2n; kk++)
{
// note to self: up to 8 it works, beyond that it does not
printf("running kernel %d (p=%d)...\n", kk, p[kk]);
runKernelFFT(localWorkSize, globalWorkSize, kk);
d_result = (0 == (kk&1)) ? d_outfft : d_intime;
}
}
else
{
// radix-4
for (unsigned kk = 0; kk < log2n; kk+=2)
{
int idx = kk>>1;
printf("running kernel %d (p=%d)...\n", idx, p[kk]);
runKernelFFT(localWorkSize, globalWorkSize, idx);
d_result = (0 == (kk&1)) ? d_outfft : d_intime;
}
}
Cl_finish();
end1 = clock();
//printf("time_1: %f s\n",double(end-start)/CLOCKS_PER_SEC);
copyFromDevice(d_result, h_outfft + workOffset, 2*workSize);
end2 = clock();
//printf("time_2: %f s\n",double(end-start)/CLOCKS_PER_SEC);
printGpuTime((2==rad)?log2n:(log2n>>1));
end3 = clock();
printf("time_1: %f s\ntime_2: %f s\ntime_3: %f s\n",\
double(end1-start)/CLOCKS_PER_SEC,\
double(end2-start)/CLOCKS_PER_SEC,\
double(end3-start)/CLOCKS_PER_SEC);
}
int
runFFT(
const unsigned len
)
{
cl_int err;
err = initExecution(len);
if (err)
{
return err;
}
FFTGpu(len);
return 0;
}
clutil.cpp
#include "clutil.h"
#ifdef UNDER_CE
#include <windows.h>
#endif
// global variables
cl_context cxContext = 0;
cl_program cpProgram = 0;
cl_device_id cdDeviceID[2];
cl_kernel kernels[FFT_MAX_LOG2N];
cl_command_queue commandQueue;
cl_event gpuExecution[FFT_MAX_LOG2N];
#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
// default configs
unsigned blockSize = 16;
unsigned print = 1;
// h_Freal and h_Fimag represent the input signal to be transformed.
// h_Rreal and h_Rimag represent the transformed output.
float* h_Freal = 0;
float* h_Fimag = 0;
float* h_Rreal = 0;
float* h_Rimag = 0;
// real & imag interleaved
float* h_intime = 0; // time-domain input samples
float* h_outfft = 0; // freq-domain output samples
// d_Freal and d_Fimag represent the input signal to be transformed.
// d_Rreal and d_Rimag represent the transformed output.
cl_mem d_Freal;
cl_mem d_Fimag;
cl_mem d_Rreal;
cl_mem d_Rimag;
// real & imag interleaved
cl_mem d_intime; // time-domain input samples
cl_mem d_outfft; // freq-domain output samples
int
initExecution(
const unsigned len
)
{
// Allocate host memory (and initialize input signal)
allocateHostMemory(len);
printf("Initializing device(s)...\n");
// create the OpenCL context on available GPU devices
init_cl_context(CL_DEVICE_TYPE_GPU);
const cl_uint ciDeviceCount = getDeviceCount();
printf("ciDeviceCount:%d \n",ciDeviceCount);
if (!ciDeviceCount)
{
printf("No opencl specific devices!\n");
return -1;
}
const cl_uint ciComputeUnitsCount = getNumComputeUnits();
printf("# compute units = %d\n", ciComputeUnitsCount);
printf("Creating Command Queue...\n");
// create a command queue on device 0
createCommandQueue();
return 0;
}
void
printGpuTime(
const unsigned int kernelCount
)
{
double t, total = 0;
for (unsigned k = 0; k<kernelCount; ++k)
{
t = executionTime(gpuExecution[k]);
printf("Kernel execution time on GPU (kernel %d) : %10.6f seconds\n", k, t);
total += t;
}
printf("Total Kernel execution time on GPU : %10.6f seconds\n",total);
}
void
printResult(
const unsigned size
)
{
FILE *fp;
#ifdef UNDER_CE
wchar_t moduleName[MAX_PATH];
char path[MAX_PATH], * p;
GetModuleFileName(NULL, moduleName, MAX_PATH);
wcstombs(path, moduleName, MAX_PATH);
p = strrchr(path, '\\');
strcpy(p + 1, "fft_output.csv");
fp = fopen(path, "w+");
#else
fp = fopen("fft_output.csv", "w+");
#endif
if (fp == NULL) return;
for (unsigned i = 0; i < size; ++i)
{
fprintf(fp, "%f,%f\n", h_outfft[2*i], h_outfft[2*i+1]);
}
fclose(fp);
}
double
executionTime(
const cl_event event
)
{
cl_ulong start, end;
cl_int err;
err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
if (err)
{
return 0;
}
printf("start:%llu, end:%llu \n", start, end);
return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds
}
void
allocateHostMemory(
const unsigned len
)
{
h_Freal = (float *) malloc(sizeof(float) * len);
checkError((h_Freal != NULL), CL_TRUE, "Could not allocate memory");
h_Fimag = (float *) malloc(sizeof(float) * len);
checkError((h_Fimag != NULL), CL_TRUE, "Could not allocate memory");
h_Rreal = (float *) malloc(sizeof(float) * len);
checkError((h_Rreal != NULL), CL_TRUE, "Could not allocate memory");
h_Rimag = (float *) malloc(sizeof(float) * len);
checkError((h_Rimag != NULL), CL_TRUE, "Could not allocate memory");
// real/imag interleaved input time-domain samples
h_intime = (float *) malloc(sizeof(float) * len * 2);
checkError((h_intime != NULL), CL_TRUE, "Could not allocate memory");
// real/imag interleaved output FFT data
h_outfft = (float *) malloc(sizeof(float) * len * 2);
checkError((h_outfft != NULL), CL_TRUE, "Could not allocate memory");
const unsigned n = 16;
for (unsigned i = 0 ; i < len; ++i)
{
h_Freal[i] = (i + 1) % n;
h_Fimag[i] = (i + 1) % n;
h_intime[2*i] = h_intime[2*i+1] = (i + 1) % n;
h_Rreal[i] = 0;
h_Rimag[i] = 0;
h_outfft[2*i] = h_outfft[2*i+1] = 0;
}
if (print)
{
FILE *fp = NULL;
#ifdef UNDER_CE
wchar_t moduleName[MAX_PATH];
char path[MAX_PATH], * p;
GetModuleFileName(NULL, moduleName, MAX_PATH);
wcstombs(path, moduleName, MAX_PATH);
p = strrchr(path, '\\');
strcpy(p + 1, "fft_input.csv");
fp = fopen(path, "w+");
#else
fp = fopen("fft_input.csv", "w+");
#endif
if (fp == NULL) return;
for (unsigned int kk=0; kk<len; kk++)
{
fprintf(fp, "%f,%f\n", h_intime[2*kk], h_intime[2*kk+1]);
}
fclose(fp);
}
}
void
allocateDeviceMemory(
const unsigned size,
const unsigned copyOffset
)
{
d_Freal = createDeviceBuffer(CL_MEM_READ_ONLY, sizeof(float) * size, h_Freal + copyOffset);
copyToDevice(d_Freal, h_Freal + copyOffset, size);
d_Fimag = createDeviceBuffer(CL_MEM_READ_ONLY, sizeof(float) * size, h_Fimag + copyOffset);
copyToDevice(d_Fimag, h_Fimag + copyOffset, size);
// copy real/imag interleaved input data to device
d_intime = createDeviceBuffer(CL_MEM_READ_WRITE, sizeof(float) * size * 2, h_intime + copyOffset * 2);
copyFromDevice(d_intime, h_outfft, size * 2); // debug
d_Rreal = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Rreal + copyOffset);
copyToDevice(d_Rreal, h_Rreal + copyOffset, size);
d_Rimag = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Rimag + copyOffset);
copyToDevice(d_Rimag, h_Rimag + copyOffset, size);
// copy real/imag interleaved out FFT to device
d_outfft = createDeviceBuffer(CL_MEM_READ_WRITE, sizeof(float) * size * 2, h_outfft + copyOffset * 2);
copyToDevice(d_intime, h_outfft + copyOffset * 2, size * 2);
}
void
cleanup(
void
)
{
if (d_Freal) clReleaseMemObject(d_Freal);
if (d_Fimag) clReleaseMemObject(d_Fimag);
if (d_Rreal) clReleaseMemObject(d_Rreal);
if (d_Rimag) clReleaseMemObject(d_Rimag);
if (d_intime) clReleaseMemObject(d_intime);
if (d_outfft) clReleaseMemObject(d_outfft);
for (unsigned kk=0; kk<ARRAY_SIZE(kernels); kk++) {
if (gpuExecution[kk]) clReleaseEvent(gpuExecution[kk]);
}
if (commandQueue) clReleaseCommandQueue(commandQueue);
if (cpProgram) clReleaseProgram(cpProgram);
if (cxContext) clReleaseContext(cxContext);
free(h_Freal);
h_Freal = 0;
free(h_Fimag);
h_Fimag = 0;
free(h_Rreal);
h_Rreal = 0;
free(h_Rimag);
h_Rimag = 0;
free(h_intime);
h_intime = 0;
free(h_outfft);
h_outfft = 0;
}
void
checkError(
const cl_int ciErrNum,
const cl_int ref,
const char* const operation
)
{
if (ciErrNum != ref) {
printf("ERROR:: %d %s failed\n\n", ciErrNum, operation);
cleanup();
exit(EXIT_FAILURE);
}
}
void
init_cl_context(
const cl_device_type device_type
)
{
cl_int ciErrNum = CL_SUCCESS;
#ifndef WIN32
cxContext = clCreateContextFromType(0, /* cl_context_properties */
device_type,
NULL, /* error function ptr */
NULL, /* user data to be passed to err fn */
&ciErrNum);
checkError(ciErrNum, CL_SUCCESS, "clCreateContextFromType");
#else
cl_platform_id cpPlatform;
ciErrNum = clGetPlatformIDs(1, &cpPlatform, NULL);
checkError(ciErrNum, CL_SUCCESS, "clGetPlatformIDs");
cl_uint uiNumDevices;
ciErrNum = clGetDeviceIDs(cpPlatform, device_type, 0, NULL, &uiNumDevices);
checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");
cl_device_id cdDevices[20];
ciErrNum = clGetDeviceIDs(cpPlatform, device_type, uiNumDevices, cdDevices, NULL);
checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");
cl_uint targetDevice=0, uiNumDevsUsed=1;
cxContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);
checkError(ciErrNum, CL_SUCCESS, "clCreateContextFromType");
#endif
}
cl_uint
getDeviceCount(
void
)
{
size_t nDeviceBytes;
const cl_int ciErrNum = clGetContextInfo(cxContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
checkError(ciErrNum, CL_SUCCESS, "clGetContextInfo");
return ((cl_uint)nDeviceBytes/sizeof(cl_device_id));
}
cl_uint
getNumComputeUnits(
void
)
{
cl_platform_id cpPlatform;
cl_int ciErrNum = clGetPlatformIDs(1, &cpPlatform, NULL);
checkError(ciErrNum, CL_SUCCESS, "clGetPlatformIDs");
//Get all the devices
printf("Get the Device info and select Device...\n");
cl_uint uiNumDevices;
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");
cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");
// Set target device and Query number of compute units on targetDevice
printf("# of Devices Available = %d\n", uiNumDevices);
cl_uint num_compute_units;
clGetDeviceInfo(cdDevices[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);
printf("# of Compute Units = %d\n", num_compute_units);
free(cdDevices);
return num_compute_units;
}
void
createCommandQueue(
void
)
{
cl_int ciErrNum = CL_SUCCESS;
ciErrNum = clGetContextInfo(cxContext, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*2, &cdDeviceID, NULL);
commandQueue = clCreateCommandQueue(cxContext, cdDeviceID[0], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
checkError(ciErrNum, CL_SUCCESS, "clCreateCommandQueue");
}
void
compileProgram(
const char* const kernel_file
)
{
size_t program_length;
FILE* pFileStream = NULL;
cl_int ciErrNum;
#ifdef _WIN32
#ifdef UNDER_CE
wchar_t moduleName[MAX_PATH];
char path[MAX_PATH], * p;
GetModuleFileName(NULL, moduleName, MAX_PATH);
wcstombs(path, moduleName, MAX_PATH);
p = strrchr(path, '\\');
strcpy(p + 1, kernel_file);
pFileStream = fopen(path, "rb");
if (pFileStream == NULL)
{
checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");
}
#else
if(fopen_s(&pFileStream, kernel_file, "rb") != 0)
{
checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");
}
#endif
#else
pFileStream = fopen(kernel_file, "rb");
if(pFileStream == 0)
{
checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");
}
#endif
// get the length of the source code
fseek(pFileStream, 0, SEEK_END);
program_length = ftell(pFileStream);
fseek(pFileStream, 0, SEEK_SET);
// allocate a buffer for the source code string and read it in
char* source = (char *)malloc(program_length + 1);
if (fread((source), program_length, 1, pFileStream) != 1)
{
fclose(pFileStream);
free(source);
checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on read source");
}
fclose(pFileStream);
source[program_length] = '\0';
// Create the program for all GPUs in the context
cpProgram = clCreateProgramWithSource( cxContext, 1, (const char **) &source, &program_length, &ciErrNum);
free(source);
checkError(ciErrNum, CL_SUCCESS, "clCreateProgramWithSource");
ciErrNum = clBuildProgram(cpProgram, 0, NULL, "", NULL, NULL);
if (ciErrNum != CL_SUCCESS)
{
char cBuildLog[10240];
clGetProgramBuildInfo(cpProgram, cdDeviceID[0], CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL );
printf("\nBuild Log : \n%s\n", cBuildLog);
checkError(ciErrNum, CL_SUCCESS, "clBuildProgram");
}
}
void
createFFTKernel(
const char* const kernelName,
int kk
)
{
cl_int ciErrNum = CL_SUCCESS;
kernels[kk] = clCreateKernel(cpProgram, kernelName, &ciErrNum);
checkError(ciErrNum, CL_SUCCESS, "clCreateKernel");
}
cl_mem
createDeviceBuffer(
const cl_mem_flags flags,
const size_t size,
void* const hostPtr
)
{
cl_int ciErrNum = CL_SUCCESS;
const cl_mem d_mem = clCreateBuffer(cxContext, flags | CL_MEM_COPY_HOST_PTR, size, hostPtr, &ciErrNum);
checkError(ciErrNum, CL_SUCCESS, "clCreateBuffer");
return d_mem;
}
void
copyToDevice(
const cl_mem mem,
float* const hostPtr,
const unsigned size
)
{
const cl_int ciErrNum = clEnqueueWriteBuffer(commandQueue, mem, CL_TRUE, 0, sizeof(float) * size, hostPtr, 0, NULL, NULL);
checkError(ciErrNum, CL_SUCCESS, "clEnqueueWriteBuffer");
}
void
copyFromDevice(
const cl_mem dMem,
float* const hostPtr,
const unsigned size
)
{
cl_int ciErrNum = clEnqueueReadBuffer(commandQueue, dMem, CL_TRUE, 0, sizeof(float) * size, hostPtr, 0, NULL, NULL);
checkError(ciErrNum, CL_SUCCESS, "clEnqueueReadBuffer");
}
void
runKernelFFT(
const size_t localWorkSize[],
const size_t globalWorkSize[],
const int kk
)
{
const cl_int ciErrNum = clEnqueueNDRangeKernel(commandQueue, kernels[kk], 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &gpuExecution[kk]);
checkError(ciErrNum, CL_SUCCESS, "clEnqueueNDRangeKernel");
}
void Cl_finish(void)
{
clFinish(commandQueue);
}
clutil.h
#ifndef __CLUTIL__
#define __CLUTIL__
#include <CL/opencl.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#ifndef UNDER_CE
#include <sys/types.h>
#endif
int initExecution(const unsigned len);
void checkError(const cl_int ciErrNum, const cl_int ref, const char* const operation);
void printResult(const unsigned size);
void init_cl_context(cl_device_type device_type);
cl_uint getDeviceCount();
cl_uint getNumComputeUnits();
void createCommandQueue();
void compileProgram(const char* const kernel_file);
void createFFTKernel(const char* const kernelName, int kk);
cl_mem createDeviceBuffer(const cl_mem_flags flags, const size_t size, void* const hostPtr);
void runKernelFFT(const size_t localWorkSize[], const size_t globalWorkSize[], const int kk);
void copyToDevice(const cl_mem mem, float* const hostPtr, const unsigned size);
void copyFromDevice(const cl_mem dMem, float* const hostPtr, const unsigned size);
double executionTime(const cl_event event);
void allocateHostMemory(const unsigned len);
void allocateDeviceMemory(const unsigned size, const unsigned copyOffset);
void printGpuTime(const unsigned int kernelCount);
void cleanup();
int runFFT(const unsigned len);
void Cl_finish(void);
// Support 2^16 = 65536 point FFT
#define FFT_MAX_LOG2N 20
#define FFT_MAX (1 << FFT_MAX_LOG2N)
extern unsigned blockSize;
extern unsigned print;
extern float* h_Freal;
extern float* h_Fimag;
extern float* h_Rreal;
extern float* h_Rimag;
extern float* h_intime; // time-domain input samples
extern float* h_outfft; // freq-domain output samples
extern cl_mem d_Freal;
extern cl_mem d_Fimag;
extern cl_mem d_Rreal;
extern cl_mem d_Rimag;
extern cl_mem d_intime; // time-domain input samples
extern cl_mem d_outfft; // freq-domain output samples
extern cl_context cxContext;
extern cl_program cpProgram;
extern cl_kernel kernels[FFT_MAX_LOG2N];
extern cl_event gpuExecution[FFT_MAX_LOG2N];
extern cl_command_queue commandQueue;
#endif
fft.cl
#define M_PI 3.14159265358979f
#define MUL_RE(a, b) (a.even*b.even - a.odd*b.odd)
#define MUL_IM(a, b) (a.even*b.odd + a.odd*b.even)
typedef float2 real2_t;
typedef float real_t;
// complex multiply
real2_t mul(
real2_t a,
real2_t b
)
{
return (real2_t) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); // no mad
}
// twiddle_P_Q(A) returns A * EXP(-P*PI*i/Q)
real2_t
twiddle_1_2(
real2_t a
)
{
// A * (-i)
return (real2_t) (a.y, -a.x);
}
// Return A * exp(K*ALPHA*i)
real2_t
twiddle(
real2_t a,
int k,
real_t alpha
)
{
real_t cs, sn;
//sn = sincos((real_t)k*alpha, &cs);
cs = native_cos((real_t) k * alpha);
sn = native_sin((real_t) k * alpha);
return mul(a, (real2_t) (cs, sn));
}
// Return A * exp(KALPHA*i)
real2_t
twiddle_kalpha(
real2_t a,
real_t kalpha
)
{
real_t cs, sn;
//sn = sincos((real_t) alpha, &cs);
cs = native_cos((real_t) kalpha);
sn = native_sin((real_t) kalpha);
return mul(a, (real2_t) (cs, sn));
}
// In-place DFT-2, output is (a, b). Arguments must be variables.
#define DFT2(a, b) { real2_t tmp = a - b; a += b; b = tmp; }
// Compute T x DFT-2.
// T is the number of threads.
// N = 2*T is the size of input vectors.
// X[N], Y[N]
// P is the length of input sub-sequences: 1,2,4,...,T.
// Each DFT-2 has input (X[I],X[I+T]), I=0..T-1,
// and output Y[J], Y|J+P], J = I with one 0 bit inserted at postion P. */
__kernel void
fft_radix2(
__global const real2_t * x,
__global real2_t * y,
int p,
int pminus1,
real_t minusPIoverp
)
{
int t = get_global_size(0); // thread count
int i = get_global_id(0); // thread index
int k = i&pminus1; // index in input sequence, in 0..P-1
int j = ((i-k)<<1) + k; // output index
real_t alpha = minusPIoverp * (real_t) k; // -M_PI*(real_t)k/(real_t)p;
// Read and twiddle input
x += i;
real2_t u0 = x[0];
//real2_t u1 = twiddle(x[t], 1, alpha);
real_t cs,sn;
//sn = sincos(alpha, &cs);
cs = native_cos(alpha);
sn = native_sin(alpha);
real2_t u1 = mul(x[t], (real2_t) (cs, sn));
// In-place DFT-2
DFT2(u0,u1);
// Write output
y += j;
y[0] = u0;
y[p] = u1;
}
// In-place DFT-4, output is (a, c, b, d). Arguments must be variables.
#define DFT4(a, b, c, d) { DFT2(a, c); DFT2(b, d); d=twiddle_1_2(d); DFT2(a, b); DFT2(c, d); }
// Compute T x DFT-4.
// T is the number of threads.
// N = 4*T is the size of input vectors.
// X[N], Y[N]
// P is the length of input sub-sequences: 1,4,16,...,T.
// Each DFT-4 has input (X[I],X[I+T],X[I+2*T],X[I+3*T]), I=0..T-1,
// and output (Y[J],Y|J+P],Y[J+2*P],Y[J+3*P], J = I with two 0 bits inserted at postion P.
__kernel void
fft_radix4(
__global const float2 * x,
__global float2 * y,
int p,
int pminus1,
int twop,
int threep,
real_t minusPIover2p,
real_t minusPIover2p_2x,
real_t minusPIover2p_3x
)
{
int t = get_global_size(0); // thread count
int i = get_global_id(0); // thread index
int k = i&pminus1; //(p-1); // index in input sequence, in 0..P-1
int j = ((i - k) << 2) + k; // output index
real_t alpha = minusPIover2p * (real_t) k; //-M_PI*(real_t)k/(real_t)(2*p);
real_t alpha2x = minusPIover2p_2x * (real_t) k;
real_t alpha3x = minusPIover2p_3x * (real_t) k;
// Read and twiddle input
x += i;
real2_t u0 = x[0];
real2_t u1 = twiddle_kalpha(x[t], alpha); //twiddle(x[t], 1, alpha);
real2_t u2 = twiddle_kalpha(x[2*t], alpha2x); //twiddle(x[2*t], 2, alpha);
real2_t u3 = twiddle_kalpha(x[3*t], alpha3x); //twiddle(x[3*t], 3, alpha);
// In-place DFT-4
DFT4(u0, u1, u2, u3);
// Shuffle and write output
y += j;
y[0] = u0;
y[p] = u2;
y[twop] = u1;
y[threep] = u3;
}
将该部分代码拷贝至~/imx8mqevk-rootfs/code/fft
目录下
执行编译命令
$CXX -I../../usr/include -L../../usr/lib -lOpenCL -o fft main.cpp fft.cpp clutil.cpp
# 注意: 这里的$CXX的作用就是交叉编译工具, 他在脚本中设置为环境变量了。
结果如下