IMX8M系列 OpenCL FFT 示例编译及其他demo测试(MYD-JX8MX)

IMX8M系列 OpenCL FFT 示例编译测试及其他demo测试(MYD-JX8MX)

上一篇文章已经将如何编译镜像,如何用官方的方式FslBuild.py 脚本编译demo。不知道有没有成功的朋友,如果你成功了,那么可以依然用这种方式,如果你没有成功,那么你可以参考接下来的方式来编译你的demo

导出文件系统

上一篇文章有描述如何导出文件系统,这里的目的是为了提供编译所需的头文件和库文件等
具体方法如下:

#进入源码root目录
cd fsl-release-yocto
#执行设置环境变量的脚本
. ./setup-environment build-xwayland
#编译工具链
bitbake meta-toolchain
#安装工具链
cd ~/fsl-release-yocto/build-xwayland/tmp/deploy/sdk
./fsl-imx-xwayland-glibc-x86_64-meta-toolchain-aarch64-toolchain-4.9.88-2.0.0.sh
./opt/fsl-imx-xwayland/4.19-warrior/environment-setup-aarch64-poky-linux
#提取文件系统
runqemu-extract-sdk ~/fsl-release-yocto/build-xwayland/tmp/deploy/images/imx8mqevk/fsl-image-qt5-validation-imx-imx8mqevk-20210809020904.rootfs.tar.bz2 ~/imx8mqevk-rootfs

提取好的文件系统在~/imx8mqevk-rootfs 目录下
具体内容如下:
~/imx8mqevk-rootfs
该文件系统时利用交叉编译方式编译出来的,有我们所需的头文件和库等。

编译FFT

这里我利用gtec-demo-frameworkNXP提供的示例来单独编译FFT用于简单的测试其GPU的性能

main.cpp


#include "clutil.h"

int
main(
    const int argc,
    const char* argv[]
    )
{
    if (argc < 2)
    {
        printf("Usage: %s fftlen \n", argv[0]);
        return -1;
    }
    const unsigned len = atoi(argv[1]);

    if (len > FFT_MAX)
    {
        printf("FFT length cannot be greater than %d.\n", FFT_MAX);
        return -1;
    }

    if (len < 16)
    {
        printf("FFT length has to at least be 16.\n");
        return -1;
    }

    if ((len != 1) && (len & (len - 1)))
    {
        printf("FFT length (%d) must be a power-of-2.\n", len);
        return -1;
    }

    printf("Block size: %d \n", blockSize);
    printf("Print result: %s \n", print ? "yes" : "no");

    int result = 0;
    result = runFFT(len);
    if (result == 0)
    {
        printf("Successful.\n");
        if (print) printResult(len);
    }
    else
    {
        printf("Failed.\n");
    }
    cleanup();
}

fft.cpp


#include "clutil.h"
#include <time.h>
static unsigned workOffset;
static unsigned workSize;
static int p[FFT_MAX_LOG2N]       = {  1,   2,   4,   8,   16,   32,   64,   128,   256,   512,   1024,   2048,   4096,   8192,   16384,   32768};
static int twop[FFT_MAX_LOG2N]    = {2*1, 2*2, 2*4, 2*8, 2*16, 2*32, 2*64, 2*128, 2*256, 2*512, 2*1024, 2*2048, 2*4096, 2*8192, 2*16384, 2*32768};
static int threep[FFT_MAX_LOG2N]  = {3*1, 3*2 ,3*4, 3*8, 3*16, 3*32, 3*64, 3*128, 3*256, 3*512, 3*1024, 3*2048, 3*4096, 3*8192, 3*16384, 3*32768};
static int pminus1[FFT_MAX_LOG2N] = {1-1, 2-1, 4-1, 8-1, 16-1, 32-1, 64-1, 128-1, 256-1, 512-1, 1024-1, 2048-1, 4096-1, 8192-1, 16384-1, 32768-1};

#ifndef M_PI
#define M_PI 3.14159265358979f
#endif
static cl_float minusPIoverp[FFT_MAX_LOG2N]     = {    -M_PI,         -M_PI/2.f,     -M_PI/4.f,     -M_PI/ 8.f,     -M_PI/16.f,     -M_PI/32.f,     -M_PI/ 64.f,     -M_PI/128.f,     -M_PI/256.f,     -M_PI/ 512.f,     -M_PI/1024.f,     -M_PI/2048.f,     -M_PI/4096.f,     -M_PI/ 8192.f,     -M_PI/16384.f,     -M_PI/32768.f};
static cl_float minusPIover2p[FFT_MAX_LOG2N]    = {    -M_PI/2.f,     -M_PI/4.f,     -M_PI/8.f,     -M_PI/16.f,     -M_PI/32.f,     -M_PI/64.f,     -M_PI/128.f,     -M_PI/256.f,     -M_PI/512.f,     -M_PI/1024.f,     -M_PI/2048.f,     -M_PI/4096.f,     -M_PI/8192.f,     -M_PI/16384.f,     -M_PI/32768.f,     -M_PI/65536.f};
static cl_float minusPIover2p_2x[FFT_MAX_LOG2N] = {-2.f*M_PI/2.f, -2.f*M_PI/4.f, -2.f*M_PI/8.f, -2.f*M_PI/16.f, -2.f*M_PI/32.f, -2.f*M_PI/64.f, -2.f*M_PI/128.f, -2.f*M_PI/256.f, -2.f*M_PI/512.f, -2.f*M_PI/1024.f, -2.f*M_PI/2048.f, -2.f*M_PI/4096.f, -2.f*M_PI/8192.f, -2.f*M_PI/16384.f, -2.f*M_PI/32768.f, -2.f*M_PI/65536.f};
static cl_float minusPIover2p_3x[FFT_MAX_LOG2N] = {-3.f*M_PI/2.f, -3.f*M_PI/4.f, -3.f*M_PI/8.f, -3.f*M_PI/16.f, -3.f*M_PI/32.f, -3.f*M_PI/64.f, -3.f*M_PI/128.f, -3.f*M_PI/256.f, -3.f*M_PI/512.f, -3.f*M_PI/1024.f, -3.f*M_PI/2048.f, -3.f*M_PI/4096.f, -3.f*M_PI/8192.f, -3.f*M_PI/16384.f, -3.f*M_PI/32768.f, -3.f*M_PI/65536.f};

static int
radix(
    int N
    )
{
    int i = 0, j = 0;
    for (; i <= 31; i++)
    {
        if ((N & (1 << i)) == 0)
        {
            j++;
        }
        else
        {
            break;
        }
    }
    return (0 == (j%2)) ? 4 : 2;
}

static unsigned int
log2NFFT(
    unsigned int size
    )
{
    unsigned int v = size;
    unsigned int log2n = 0;
    while (v >>= 1)
    {
        log2n++;
    }
    return log2n;
}

#define RADIX2_FFT_KERNEL "fft_radix2"
#define RADIX4_FFT_KERNEL "fft_radix4"

static void
FFTGpu(
    const unsigned len
    )
{
    if (len == 0)
    {
        return;
    }

    // figure out if we can use a radix-4 FFT : otherwise radix-2
    int rad = radix(len);
    if (4==rad && ((16==len) || (256==len) || (4096==len) || (65536==len) || (1048576 == len) ))
        rad = 2;

    // log2(n) is the # of kernels that will be invoked (for a radix-2 FFT)
    unsigned int log2n = log2NFFT(len);
    printf("log2(fft size) = log2(%d)=%d\n", len, log2n);

    printf("Compiling  radix-%d FFT Program for GPU...\n", rad);
    compileProgram("fft.cl");
    printf("creating radix-%d kernels...\n", rad);

    if (2 == rad)
    {
        for (unsigned kk = 0; kk < log2n; kk++)
        {
            printf("Creating kernel %s %d (p=%d)...\n", RADIX2_FFT_KERNEL, kk, p[kk]);
            createFFTKernel(RADIX2_FFT_KERNEL, kk);
        }
    }
    else
    { // radix-4
        for (unsigned kk = 0; kk < log2n; kk+=2)
        {
            printf("Creating kernel %s %d...\n", RADIX4_FFT_KERNEL, kk>>1);
            createFFTKernel(RADIX4_FFT_KERNEL, kk>>1);
        }
    }

    workSize = len;

    allocateDeviceMemory(workSize, workOffset);

    if (2 == rad)
    {
        // FFT kernel invoked for p=1, p=2, ..., p=n/2
        // input and output swapped each time
        for (unsigned kk = 0; kk < log2n; kk++)
        {
            void *in = (0 == (kk&1)) ? &d_intime : &d_outfft;
            void *out = (0 == (kk&1)) ? &d_outfft : &d_intime;
            printf("Setting kernel args for kernel %d (p=%d)...\n", kk, p[kk]);
            clSetKernelArg(kernels[kk], 0, sizeof(cl_mem), in);
            clSetKernelArg(kernels[kk], 1, sizeof(cl_mem), out);
            clSetKernelArg(kernels[kk], 2, sizeof(unsigned), &p[kk]);
            clSetKernelArg(kernels[kk], 3, sizeof(unsigned), &pminus1[kk]);
            clSetKernelArg(kernels[kk], 4, sizeof(cl_float), &minusPIoverp[kk]);
        } // end (for 1,2,4,8,...N/2)
    }
    else
    {
        // radix-4, FFT kernel invoked for p=1, p=4, ..., p=n/4
        for (unsigned kk = 0; kk < log2n; kk+=2)
        {
            int idx   = kk>>1;
            void *in  = (0 == (idx&1)) ? &d_intime : &d_outfft;
            void *out = (0 == (idx&1)) ? &d_outfft : &d_intime;
            printf("Setting kernel args for kernel %d (p=%d)...\n", idx, p[kk]);
            clSetKernelArg(kernels[idx], 0, sizeof(cl_mem), in);
            clSetKernelArg(kernels[idx], 1, sizeof(cl_mem), out);
            clSetKernelArg(kernels[idx], 2, sizeof(unsigned), &p[kk]);
            clSetKernelArg(kernels[idx], 3, sizeof(unsigned), &pminus1[kk]);
            clSetKernelArg(kernels[idx], 4, sizeof(unsigned), &twop[kk]);
            clSetKernelArg(kernels[idx], 5, sizeof(unsigned), &threep[kk]);
            clSetKernelArg(kernels[idx], 6, sizeof(cl_float), &minusPIover2p[kk]);
            clSetKernelArg(kernels[idx], 7, sizeof(cl_float), &minusPIover2p_2x[kk]);
            clSetKernelArg(kernels[idx], 8, sizeof(cl_float), &minusPIover2p_3x[kk]);
        } // end (for 1,4,16,...,N/4)
    } // end (if radix-2 or radix-4)

    size_t globalWorkSize[] = { (2==rad) ? (1<<(log2n-1)) : (len>>2) };
    size_t localWorkSize[] = { (blockSize <= globalWorkSize[0]) ? blockSize : globalWorkSize[0] };

    cl_int ciErrNum = 0;
    cl_mem d_result;
    clock_t start,end1,end2,end3;
    start = clock();
    Cl_finish();
    if (2==rad)
    {
        for (unsigned kk = 0; kk < log2n; kk++)
        {
            // note to self: up to 8 it works, beyond that it does not
            printf("running kernel %d (p=%d)...\n", kk, p[kk]);
            runKernelFFT(localWorkSize, globalWorkSize, kk);
            d_result = (0 == (kk&1)) ? d_outfft : d_intime;
        }
    }
    else
    {
        // radix-4
        for (unsigned kk = 0; kk < log2n; kk+=2)
        {
            int idx = kk>>1;
            printf("running kernel %d (p=%d)...\n", idx, p[kk]);
            runKernelFFT(localWorkSize, globalWorkSize, idx);
            d_result = (0 == (kk&1)) ? d_outfft : d_intime;
        }
    }
    Cl_finish();
    end1 = clock();
    //printf("time_1: %f s\n",double(end-start)/CLOCKS_PER_SEC);
    copyFromDevice(d_result, h_outfft + workOffset,  2*workSize);
    end2 = clock();
    //printf("time_2: %f s\n",double(end-start)/CLOCKS_PER_SEC);
    printGpuTime((2==rad)?log2n:(log2n>>1));
    end3 = clock();
    printf("time_1: %f s\ntime_2: %f s\ntime_3: %f s\n",\
	double(end1-start)/CLOCKS_PER_SEC,\
	double(end2-start)/CLOCKS_PER_SEC,\
	double(end3-start)/CLOCKS_PER_SEC);
}

int
runFFT(
    const unsigned len
    )
{
    cl_int err;
    err = initExecution(len);
    if (err)
    {
        return err;
    }
    FFTGpu(len);
    return 0;
}

clutil.cpp


#include "clutil.h"
#ifdef UNDER_CE
#include <windows.h>
#endif

// global variables
cl_context cxContext = 0;
cl_program cpProgram = 0;
cl_device_id cdDeviceID[2];
cl_kernel kernels[FFT_MAX_LOG2N];
cl_command_queue commandQueue;
cl_event gpuExecution[FFT_MAX_LOG2N];

#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))

// default configs
unsigned blockSize = 16;
unsigned print = 1;

// h_Freal and h_Fimag represent the input signal to be transformed.
// h_Rreal and h_Rimag represent the transformed output.
float*  h_Freal = 0;
float*  h_Fimag = 0;
float*  h_Rreal = 0;
float*  h_Rimag = 0;
//  real & imag interleaved
float* h_intime = 0; // time-domain input samples
float* h_outfft = 0; // freq-domain output samples

// d_Freal and d_Fimag represent the input signal to be transformed.
// d_Rreal and d_Rimag represent the transformed output.
cl_mem d_Freal;
cl_mem d_Fimag;
cl_mem d_Rreal;
cl_mem d_Rimag;
//  real & imag interleaved
cl_mem d_intime; // time-domain input samples
cl_mem d_outfft; // freq-domain output samples

int
initExecution(
    const unsigned len
    )
{
    // Allocate host memory (and initialize input signal)
    allocateHostMemory(len);

    printf("Initializing device(s)...\n");
    // create the OpenCL context on available GPU devices
    init_cl_context(CL_DEVICE_TYPE_GPU);

    const cl_uint ciDeviceCount =  getDeviceCount();
    printf("ciDeviceCount:%d \n",ciDeviceCount);
    if (!ciDeviceCount)
    {
        printf("No opencl specific devices!\n");
        return -1;
    }
    const cl_uint ciComputeUnitsCount = getNumComputeUnits();
    printf("# compute units = %d\n", ciComputeUnitsCount);

    printf("Creating Command Queue...\n");
    // create a command queue on device 0
    createCommandQueue();

    return 0;
}

void
printGpuTime(
    const unsigned int kernelCount
    )
{
    double t, total = 0;

    for (unsigned k = 0; k<kernelCount; ++k)
    {
        t = executionTime(gpuExecution[k]);
        printf("Kernel execution time on GPU (kernel %d) : %10.6f seconds\n", k, t);
        total += t;
    }
    printf("Total Kernel execution time on GPU : %10.6f seconds\n",total);
}

void
printResult(
    const unsigned size
    )
{
    FILE *fp;
#ifdef UNDER_CE
    wchar_t moduleName[MAX_PATH];
    char path[MAX_PATH], * p;
    GetModuleFileName(NULL, moduleName, MAX_PATH);
    wcstombs(path, moduleName, MAX_PATH);
    p = strrchr(path, '\\');
    strcpy(p + 1, "fft_output.csv");
    fp = fopen(path, "w+");
#else
   fp = fopen("fft_output.csv", "w+");
#endif

    if (fp == NULL) return;

    for (unsigned i = 0; i < size; ++i)
    {
        fprintf(fp, "%f,%f\n", h_outfft[2*i], h_outfft[2*i+1]);
    }
    fclose(fp);
}

double
executionTime(
    const cl_event event
    )
{
    cl_ulong start, end;
    cl_int err;
    err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL);
    err |= clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL);
    if (err)
    {
        return 0;
    }
    printf("start:%llu, end:%llu \n", start, end);
    return (double)1.0e-9 * (end - start); // convert nanoseconds to seconds
}

void
allocateHostMemory(
    const unsigned len
    )
{
    h_Freal = (float *) malloc(sizeof(float) * len);
    checkError((h_Freal != NULL), CL_TRUE, "Could not allocate memory");

    h_Fimag = (float *) malloc(sizeof(float) * len);
    checkError((h_Fimag != NULL), CL_TRUE, "Could not allocate memory");

    h_Rreal = (float *) malloc(sizeof(float) * len);
    checkError((h_Rreal != NULL), CL_TRUE, "Could not allocate memory");

    h_Rimag = (float *) malloc(sizeof(float) * len);
    checkError((h_Rimag != NULL), CL_TRUE, "Could not allocate memory");

    //  real/imag interleaved input time-domain samples
    h_intime = (float *) malloc(sizeof(float) * len * 2);
    checkError((h_intime != NULL), CL_TRUE, "Could not allocate memory");

    //  real/imag interleaved output FFT data
    h_outfft = (float *) malloc(sizeof(float) * len * 2);
    checkError((h_outfft != NULL), CL_TRUE, "Could not allocate memory");

    const unsigned n = 16;
    for (unsigned i = 0 ; i < len; ++i)
    {
        h_Freal[i] = (i + 1) % n;
        h_Fimag[i] = (i + 1) % n;
        h_intime[2*i] = h_intime[2*i+1] = (i + 1) % n;
        h_Rreal[i] = 0;
        h_Rimag[i] = 0;
        h_outfft[2*i] = h_outfft[2*i+1] = 0;
    }

    if (print)
    {
        FILE *fp = NULL;
#ifdef UNDER_CE
        wchar_t moduleName[MAX_PATH];
        char path[MAX_PATH], * p;
        GetModuleFileName(NULL, moduleName, MAX_PATH);
        wcstombs(path, moduleName, MAX_PATH);
        p = strrchr(path, '\\');
        strcpy(p + 1, "fft_input.csv");
        fp = fopen(path, "w+");
#else
        fp = fopen("fft_input.csv", "w+");
#endif
        if (fp == NULL) return;

        for (unsigned int kk=0; kk<len; kk++)
        {
            fprintf(fp, "%f,%f\n", h_intime[2*kk], h_intime[2*kk+1]);
        }
        fclose(fp);
    }
}

void
allocateDeviceMemory(
    const unsigned size,
    const unsigned copyOffset
    )
{
    d_Freal = createDeviceBuffer(CL_MEM_READ_ONLY, sizeof(float) * size, h_Freal + copyOffset);
    copyToDevice(d_Freal,  h_Freal + copyOffset, size);

    d_Fimag = createDeviceBuffer(CL_MEM_READ_ONLY, sizeof(float) * size, h_Fimag + copyOffset);
    copyToDevice(d_Fimag,  h_Fimag + copyOffset, size);

    //  copy real/imag interleaved input data to device
    d_intime = createDeviceBuffer(CL_MEM_READ_WRITE, sizeof(float) * size * 2, h_intime + copyOffset * 2);
    copyFromDevice(d_intime, h_outfft, size * 2); // debug

    d_Rreal = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Rreal + copyOffset);
    copyToDevice(d_Rreal,  h_Rreal + copyOffset, size);

    d_Rimag = createDeviceBuffer(CL_MEM_WRITE_ONLY, sizeof(float) * size, h_Rimag + copyOffset);
    copyToDevice(d_Rimag,  h_Rimag + copyOffset, size);

    //  copy real/imag interleaved out FFT to device
    d_outfft = createDeviceBuffer(CL_MEM_READ_WRITE, sizeof(float) * size * 2, h_outfft + copyOffset * 2);
    copyToDevice(d_intime,  h_outfft + copyOffset * 2, size * 2);
}

void
cleanup(
    void
    )
{
    if (d_Freal)  clReleaseMemObject(d_Freal);
    if (d_Fimag)  clReleaseMemObject(d_Fimag);
    if (d_Rreal)  clReleaseMemObject(d_Rreal);
    if (d_Rimag)  clReleaseMemObject(d_Rimag);
    if (d_intime) clReleaseMemObject(d_intime);
    if (d_outfft) clReleaseMemObject(d_outfft);

    for (unsigned kk=0; kk<ARRAY_SIZE(kernels); kk++) {
        if (gpuExecution[kk]) clReleaseEvent(gpuExecution[kk]);
    }

    if (commandQueue) clReleaseCommandQueue(commandQueue);
    if (cpProgram) clReleaseProgram(cpProgram);
    if (cxContext) clReleaseContext(cxContext);

    free(h_Freal);
    h_Freal = 0;
    free(h_Fimag);
    h_Fimag = 0;
    free(h_Rreal);
    h_Rreal = 0;
    free(h_Rimag);
    h_Rimag = 0;
    free(h_intime);
    h_intime = 0;
    free(h_outfft);
    h_outfft = 0;
}

void
checkError(
    const cl_int ciErrNum,
    const cl_int ref,
    const char* const operation
    )
{
    if (ciErrNum != ref) {
        printf("ERROR:: %d %s failed\n\n", ciErrNum, operation);
        cleanup();
        exit(EXIT_FAILURE);
    }
}

void
init_cl_context(
    const cl_device_type device_type
    )
{
    cl_int ciErrNum = CL_SUCCESS;

#ifndef WIN32
    cxContext = clCreateContextFromType(0, /* cl_context_properties */
                          device_type,
                        NULL, /* error function ptr */
                        NULL, /* user data to be passed to err fn */
                        &ciErrNum);
    checkError(ciErrNum, CL_SUCCESS, "clCreateContextFromType");
#else
    cl_platform_id cpPlatform;
    ciErrNum =     clGetPlatformIDs(1, &cpPlatform, NULL);
    checkError(ciErrNum, CL_SUCCESS, "clGetPlatformIDs");
    cl_uint uiNumDevices;
    ciErrNum = clGetDeviceIDs(cpPlatform, device_type, 0, NULL, &uiNumDevices);
    checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");
    cl_device_id cdDevices[20];
    ciErrNum = clGetDeviceIDs(cpPlatform, device_type, uiNumDevices, cdDevices, NULL);
    checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");
    cl_uint targetDevice=0, uiNumDevsUsed=1;
    cxContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[targetDevice], NULL, NULL, &ciErrNum);
    checkError(ciErrNum, CL_SUCCESS, "clCreateContextFromType");
#endif
}

cl_uint
getDeviceCount(
    void
    )
{
    size_t nDeviceBytes;
    const cl_int ciErrNum = clGetContextInfo(cxContext, CL_CONTEXT_DEVICES, 0, NULL, &nDeviceBytes);
    checkError(ciErrNum, CL_SUCCESS, "clGetContextInfo");
    return ((cl_uint)nDeviceBytes/sizeof(cl_device_id));
}

cl_uint
getNumComputeUnits(
    void
    )
{
    cl_platform_id cpPlatform;
    cl_int ciErrNum = clGetPlatformIDs(1, &cpPlatform, NULL);
    checkError(ciErrNum, CL_SUCCESS, "clGetPlatformIDs");

    //Get all the devices
    printf("Get the Device info and select Device...\n");
    cl_uint uiNumDevices;
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices);
    checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");
    cl_device_id *cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) );
    ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL);
    checkError(ciErrNum, CL_SUCCESS, "clGetDeviceIDs");

    // Set target device and Query number of compute units on targetDevice
    printf("# of Devices Available = %d\n", uiNumDevices);
    cl_uint num_compute_units;
    clGetDeviceInfo(cdDevices[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(num_compute_units), &num_compute_units, NULL);
    printf("# of Compute Units = %d\n", num_compute_units);
    free(cdDevices);
    return num_compute_units;
}

void
createCommandQueue(
    void
    )
{
    cl_int ciErrNum = CL_SUCCESS;
    ciErrNum = clGetContextInfo(cxContext, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*2, &cdDeviceID, NULL);
    commandQueue = clCreateCommandQueue(cxContext, cdDeviceID[0], CL_QUEUE_PROFILING_ENABLE, &ciErrNum);
    checkError(ciErrNum, CL_SUCCESS, "clCreateCommandQueue");
}

void
compileProgram(
    const char* const kernel_file
    )
{
    size_t program_length;
    FILE* pFileStream = NULL;
    cl_int ciErrNum;

#ifdef _WIN32
#ifdef UNDER_CE
    wchar_t moduleName[MAX_PATH];
    char path[MAX_PATH], * p;
    GetModuleFileName(NULL, moduleName, MAX_PATH);
    wcstombs(path, moduleName, MAX_PATH);
    p = strrchr(path, '\\');
    strcpy(p + 1, kernel_file);
    pFileStream = fopen(path, "rb");
    if (pFileStream == NULL)
    {
        checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");
    }
#else
    if(fopen_s(&pFileStream, kernel_file, "rb") != 0)
    {
        checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");
    }
#endif
#else
    pFileStream = fopen(kernel_file, "rb");
    if(pFileStream == 0)
    {
        checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on open source");
    }
#endif

    // get the length of the source code
    fseek(pFileStream, 0, SEEK_END);
    program_length = ftell(pFileStream);
    fseek(pFileStream, 0, SEEK_SET);

    // allocate a buffer for the source code string and read it in
    char* source = (char *)malloc(program_length + 1);
    if (fread((source), program_length, 1, pFileStream) != 1)
    {
        fclose(pFileStream);
        free(source);
        checkError(CL_INVALID_VALUE, CL_SUCCESS, "compileProgram on read source");
    }
    fclose(pFileStream);
    source[program_length] = '\0';

    // Create the program for all GPUs in the context
    cpProgram = clCreateProgramWithSource( cxContext, 1, (const char **) &source, &program_length, &ciErrNum);
    free(source);
    checkError(ciErrNum, CL_SUCCESS, "clCreateProgramWithSource");
    ciErrNum = clBuildProgram(cpProgram, 0, NULL, "", NULL, NULL);
    if (ciErrNum != CL_SUCCESS)
    {
        char cBuildLog[10240];
        clGetProgramBuildInfo(cpProgram, cdDeviceID[0], CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL );
        printf("\nBuild Log : \n%s\n", cBuildLog);
        checkError(ciErrNum, CL_SUCCESS, "clBuildProgram");
    }
}

void
createFFTKernel(
    const char* const kernelName,
    int kk
    )
{
    cl_int ciErrNum = CL_SUCCESS;
    kernels[kk] = clCreateKernel(cpProgram, kernelName, &ciErrNum);
    checkError(ciErrNum, CL_SUCCESS, "clCreateKernel");
}

cl_mem
createDeviceBuffer(
    const cl_mem_flags flags,
    const size_t size,
    void* const hostPtr
    )
{
    cl_int ciErrNum = CL_SUCCESS;
    const cl_mem d_mem = clCreateBuffer(cxContext, flags | CL_MEM_COPY_HOST_PTR, size, hostPtr, &ciErrNum);
    checkError(ciErrNum, CL_SUCCESS,  "clCreateBuffer");
    return d_mem;
}

void
copyToDevice(
    const cl_mem mem,
    float* const hostPtr,
    const unsigned size
    )
{
    const cl_int ciErrNum = clEnqueueWriteBuffer(commandQueue, mem, CL_TRUE, 0, sizeof(float) * size, hostPtr, 0, NULL, NULL);
    checkError(ciErrNum, CL_SUCCESS,  "clEnqueueWriteBuffer");
}

void
copyFromDevice(
    const cl_mem dMem,
    float* const hostPtr,
    const unsigned size
    )
{
    cl_int ciErrNum = clEnqueueReadBuffer(commandQueue, dMem, CL_TRUE, 0, sizeof(float) * size, hostPtr, 0, NULL, NULL);
    checkError(ciErrNum, CL_SUCCESS, "clEnqueueReadBuffer");
}

void
runKernelFFT(
    const size_t localWorkSize[],
    const size_t globalWorkSize[],
    const int kk
    )
{
    const cl_int ciErrNum = clEnqueueNDRangeKernel(commandQueue, kernels[kk], 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &gpuExecution[kk]);
    checkError(ciErrNum, CL_SUCCESS, "clEnqueueNDRangeKernel");
}

void Cl_finish(void)
{
    clFinish(commandQueue);
}


clutil.h


#ifndef __CLUTIL__
#define __CLUTIL__

#include <CL/opencl.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#ifndef UNDER_CE
#include <sys/types.h>
#endif

int initExecution(const unsigned len);
void checkError(const cl_int ciErrNum, const cl_int ref, const char* const operation);
void printResult(const unsigned size);
void init_cl_context(cl_device_type device_type);
cl_uint getDeviceCount();
cl_uint getNumComputeUnits();
void createCommandQueue();
void compileProgram(const char* const kernel_file);
void createFFTKernel(const char* const kernelName, int kk);
cl_mem createDeviceBuffer(const cl_mem_flags flags, const size_t size, void* const  hostPtr);
void runKernelFFT(const size_t localWorkSize[], const size_t globalWorkSize[], const int kk);
void copyToDevice(const cl_mem mem, float* const hostPtr, const unsigned size);
void copyFromDevice(const cl_mem dMem, float* const hostPtr, const unsigned size);
double executionTime(const cl_event event);
void allocateHostMemory(const unsigned len);
void allocateDeviceMemory(const unsigned size, const unsigned copyOffset);
void printGpuTime(const unsigned int kernelCount);
void cleanup();
int runFFT(const unsigned len);
void Cl_finish(void);

// Support 2^16 = 65536 point FFT
#define FFT_MAX_LOG2N   20
#define FFT_MAX         (1 << FFT_MAX_LOG2N)

extern unsigned blockSize;
extern unsigned print;

extern float*  h_Freal;
extern float*  h_Fimag;
extern float*  h_Rreal;
extern float*  h_Rimag;
extern float*  h_intime; // time-domain input samples
extern float*  h_outfft; // freq-domain output samples

extern cl_mem d_Freal;
extern cl_mem d_Fimag;
extern cl_mem d_Rreal;
extern cl_mem d_Rimag;
extern cl_mem d_intime; // time-domain input samples
extern cl_mem d_outfft; // freq-domain output samples

extern cl_context cxContext;
extern cl_program cpProgram;
extern cl_kernel kernels[FFT_MAX_LOG2N];
extern cl_event gpuExecution[FFT_MAX_LOG2N];
extern cl_command_queue commandQueue;

#endif

fft.cl

#define M_PI            3.14159265358979f
#define MUL_RE(a, b)    (a.even*b.even - a.odd*b.odd)
#define MUL_IM(a, b)    (a.even*b.odd + a.odd*b.even)

typedef float2 real2_t;
typedef float real_t;

// complex multiply
real2_t mul(
    real2_t a,
    real2_t b
    )
{
    return (real2_t) (a.x*b.x - a.y*b.y, a.x*b.y + a.y*b.x); // no mad
}

// twiddle_P_Q(A) returns A * EXP(-P*PI*i/Q)
real2_t
twiddle_1_2(
    real2_t a
    )
{
    // A * (-i)
    return (real2_t) (a.y, -a.x);
}

// Return A * exp(K*ALPHA*i)
real2_t
twiddle(
    real2_t a,
    int k,
    real_t alpha
    )
{
    real_t cs, sn;
    //sn = sincos((real_t)k*alpha, &cs);
    cs = native_cos((real_t) k * alpha);
    sn = native_sin((real_t) k * alpha);

    return mul(a, (real2_t) (cs, sn));
}

// Return A * exp(KALPHA*i)
real2_t
twiddle_kalpha(
    real2_t a,
    real_t kalpha
    )
{
    real_t cs, sn;
    //sn = sincos((real_t) alpha, &cs);
    cs = native_cos((real_t) kalpha);
    sn = native_sin((real_t) kalpha);
    return mul(a, (real2_t) (cs, sn));
}

// In-place DFT-2, output is (a, b).  Arguments must be variables.
#define DFT2(a, b)  { real2_t tmp = a - b; a += b; b = tmp; }

// Compute T x DFT-2.
// T is the number of threads.
// N = 2*T is the size of input vectors.
// X[N], Y[N]
// P is the length of input sub-sequences: 1,2,4,...,T.
// Each DFT-2 has input (X[I],X[I+T]), I=0..T-1,
// and output Y[J], Y|J+P], J = I with one 0 bit inserted at postion P. */
__kernel void
fft_radix2(
    __global const real2_t * x,
    __global real2_t * y,
    int p,
    int pminus1,
    real_t minusPIoverp
    )
{
    int t = get_global_size(0); // thread count
    int i = get_global_id(0);   // thread index
    int k = i&pminus1;          // index in input sequence, in 0..P-1
    int j = ((i-k)<<1) + k;     // output index
    real_t alpha = minusPIoverp * (real_t) k;  // -M_PI*(real_t)k/(real_t)p;

    // Read and twiddle input
    x += i;
    real2_t u0 = x[0];
    //real2_t u1 = twiddle(x[t], 1, alpha);
    real_t cs,sn;
    //sn = sincos(alpha, &cs);
    cs = native_cos(alpha);
    sn = native_sin(alpha);
    real2_t u1 = mul(x[t], (real2_t) (cs, sn));

    // In-place DFT-2
    DFT2(u0,u1);

    // Write output
    y += j;
    y[0] = u0;
    y[p] = u1;
}

// In-place DFT-4, output is (a, c, b, d). Arguments must be variables.
#define DFT4(a, b, c, d) { DFT2(a, c); DFT2(b, d); d=twiddle_1_2(d); DFT2(a, b); DFT2(c, d); }

// Compute T x DFT-4.
// T is the number of threads.
// N = 4*T is the size of input vectors.
// X[N], Y[N]
// P is the length of input sub-sequences: 1,4,16,...,T.
// Each DFT-4 has input (X[I],X[I+T],X[I+2*T],X[I+3*T]), I=0..T-1,
// and output (Y[J],Y|J+P],Y[J+2*P],Y[J+3*P], J = I with two 0 bits inserted at postion P.
__kernel void
fft_radix4(
    __global const float2 * x,
    __global float2 * y,
    int p,
    int pminus1,
    int twop,
    int threep,
    real_t minusPIover2p,
    real_t minusPIover2p_2x,
    real_t minusPIover2p_3x
    )
{
    int t = get_global_size(0); // thread count
    int i = get_global_id(0);   // thread index
    int k = i&pminus1;          //(p-1); // index in input sequence, in 0..P-1
    int j = ((i - k) << 2) + k; // output index
    real_t alpha   = minusPIover2p    * (real_t) k; //-M_PI*(real_t)k/(real_t)(2*p);
    real_t alpha2x = minusPIover2p_2x * (real_t) k;
    real_t alpha3x = minusPIover2p_3x * (real_t) k;

    // Read and twiddle input
    x += i;
    real2_t u0 = x[0];
    real2_t u1 = twiddle_kalpha(x[t],   alpha);     //twiddle(x[t],   1, alpha);
    real2_t u2 = twiddle_kalpha(x[2*t], alpha2x);   //twiddle(x[2*t], 2, alpha);
    real2_t u3 = twiddle_kalpha(x[3*t], alpha3x);   //twiddle(x[3*t], 3, alpha);

    // In-place DFT-4
    DFT4(u0, u1, u2, u3);

    // Shuffle and write output
    y        += j;
    y[0]      = u0;
    y[p]      = u2;
    y[twop]   = u1;
    y[threep] = u3;
}

将该部分代码拷贝至~/imx8mqevk-rootfs/code/fft目录下
执行编译命令

$CXX -I../../usr/include -L../../usr/lib -lOpenCL -o fft main.cpp fft.cpp clutil.cpp 
# 注意:  这里的$CXX的作用就是交叉编译工具, 他在脚本中设置为环境变量了。

结果如下
在这里插入图片描述

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

疯狂的蕉尼基

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值