英特尔 oneAPI 黑客松

最新推荐文章于 2024-10-05 11:02:16 发布

Naweizhw

最新推荐文章于 2024-10-05 11:02:16 发布

阅读量72

点赞数 1

文章标签： oneapi 算法

本文链接：https://blog.csdn.net/Naweizhw/article/details/132695280

版权

英特尔 oneAPI 黑客松

文章目录

英特尔 oneAPI 黑客松

快速傅里叶变换

快速傅里叶变换（Fast Fourier Transform, FFT），是快速计算序列的离散傅里叶变换（DFT）或其逆变换的方法。FFT 能够将计算 DFT 的复杂度从𝑂(𝑛2)降低到 𝑂(𝑛𝑙𝑜𝑔(𝑛))，其中𝑛为数据大小。
FFT 算法的关键思想是利用信号的对称性和周期性，通过分而治之的策略来加速傅里叶变换的计算，从而大幅提高了计算率。通过将信号分解成多个子信号，然后逐步合并子信号的频谱，从而实现对整体频谱的计算。

使用oneMKL对FFT进行加速和优化

Intel oneMKL（Intel oneAPI Math Kernel Library）是一款高性能数学核心库，旨在为开发人员提供丰富的数学函数和优化算法，以加速各种科学、工程和数据分析应用的性能。oneMKL 是 Intel oneAPI 套件的一部分，适用于多种硬件架构，包括 Intel CPU、GPU 以及其他加速器。

问题简介

调用 oneMKL 相应 API 函数，产生 2048 * 2048 个随机单精度实数()；
根据 1 产生的随机数据作为输入，实现两维 Real to complex FFT 参考代码；
根据 1 产生的随机数据作为输入，调用 oneMKL API 计算两维 Real to complex FFT；
结果正确性验证，对 2 和 3 计算的两维 FFT 输出数据进行全数据比对（允许适当精度误差），输出 “结果正确”或“结果不正确”信息；
平均性能数据比对（比如运行 1000 次），输出 FFT 参考代码平均运行时间和 oneMKL FFT 平均运行时间。

环境准备

参考Linux下Intel编译器oneAPI安装和链接MKL库编译_intel mkl 编译_XianJiaoMu的博客-CSDN博客这篇博客，在linux下安装最新版的MKL库，同时安装HPC toolkit，HPC包含C/C++ 和 Fortran的编译器。

实验代码

首先需要首先需要产生 2048 * 2048 个随机单精度实数，可以通过调用oneMKL的随机数生成器vslNewStream生成

        float *input = (float*)malloc(width * height * sizeof(float));

        //生成单精度随机数
        VSLStreamStatePtr stream;
        vslNewStream(&stream, VSL_BRNG_MT19937, 1010);
        vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, width * height, input, 0.0f, 1.0f);

然后调用fftw3.h中fftw3库的api创建fftw计划，并计算FFT变换的结果，同时记录一次计算所需的时间

 //FFTW输出数据
        fftwf_complex* fftw3_output = (fftwf_complex*)fftwf_malloc((width / 2 + 1) * height * sizeof(fftwf_complex));
        //创建FFTW计划
        fftwf_plan fftw3_plan = fftwf_plan_dft_r2c_2d(width, height, input, fftw3_output, FFTW_ESTIMATE);
        
        //fftw3计算
        struct timespec fftw3_start_time, fftw3_end_time;
        clock_gettime(CLOCK_MONOTONIC, &fftw3_start_time);
        fftwf_execute(fftw3_plan);
        clock_gettime(CLOCK_MONOTONIC, &fftw3_end_time);
        // 计算运行时间，单位为纳秒
        fftw3_time += (fftw3_end_time.tv_sec - fftw3_start_time.tv_sec) * 1000000000LL + (fftw3_end_time.tv_nsec - fftw3_start_time.tv_nsec);

接着调用 oneMKL API ，创建 oneMKL描述符，计算FFT变换的结果，同时记录一次计算所需的时间

//MKL输出数据
        MKL_LONG dim_sizes[2] = {width, height};
        MKL_Complex8* mkl_output = (MKL_Complex8*)malloc((width / 2 + 1) * height * sizeof(MKL_Complex8));
        //设置步长
        MKL_LONG input_stride[3] = {0, width, 1 };
        MKL_LONG output_stride[3] = {0, width / 2 + 1, 1};
        //设置oneMKL参数
        DFTI_DESCRIPTOR_HANDLE handle = NULL;
        DftiCreateDescriptor(&handle, DFTI_SINGLE, DFTI_REAL, 2, dim_sizes);
        DftiSetValue(handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE); //设置非原地变换
        DftiSetValue(handle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); //设置共轭存储
        DftiSetValue(handle, DFTI_INPUT_STRIDES, input_stride);
        DftiSetValue(handle, DFTI_OUTPUT_STRIDES, output_stride);
        DftiCommitDescriptor(handle);

        //oneMKL计算
        struct timespec mkl_start_time, mkl_end_time;
        clock_gettime(CLOCK_MONOTONIC, &mkl_start_time);
        DftiComputeForward(handle, input, mkl_output);
        clock_gettime(CLOCK_MONOTONIC, &mkl_end_time);
        mkl_time += (mkl_end_time.tv_sec - mkl_start_time.tv_sec) * 1000000000LL + (mkl_end_time.tv_nsec - mkl_start_time.tv_nsec);

对使用fftw和oneMKL的输出进行比对，设定的精度误差允许范围为1e-6，分别比较二者计算FFT后的复数的实部和虚部之间的差异，超过该误差则认为结果不正确，全部在误差允许范围内则认为计算正确。

        //比较不同计算方法结果
        int notmatch_count = 0;
        for(int j = 0; j < (width / 2 + 1) * height; j++){
            float real_diff = fabs(fftw3_output[j][0] - mkl_output[j].real);
            float imag_diff = fabs(fftw3_output[j][1] - mkl_output[j].imag);
            if(real_diff > 1e-6 || imag_diff > 1e-6){
                notmatch_count++;
            }
        }

        if(notmatch_count > 0){
            printf("第%d轮计算结果不正确! %d \n", i, notmatch_count);
            not_correct_count++;
        }

设定循环运行一千次，比较平均运行时间，注意每轮循环结束时需要先释放资源

           //释放资源
           fftwf_destroy_plan(fftw3_plan);
           fftwf_free(fftw3_output);
           DftiFreeDescriptor(&handle);
           free(input);
           free(mkl_output);

完整代码如下：

#include<stdio.h>
#include <fftw3.h>
#include <mkl.h>
#include <time.h>
#include <math.h>

int main(){
    int width = 2048; //输入数据的宽度
    int height = 2048; //输入数据的高度
    long long fftw3_time = 0;
    long long mkl_time = 0;
    int iteration = 1000;
    int not_correct_count = 0;
    
    //运行1000次
    for(int i = 0; i < iteration; i++){
        float *input = (float*)malloc(width * height * sizeof(float));

        //生成单精度随机数
        VSLStreamStatePtr stream;
        vslNewStream(&stream, VSL_BRNG_MT19937, 1010);
        vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, width * height, input, 0.0f, 1.0f);
        
        //FFTW输出数据
        fftwf_complex* fftw3_output = (fftwf_complex*)fftwf_malloc((width / 2 + 1) * height * sizeof(fftwf_complex));
        //创建FFTW计划
        fftwf_plan fftw3_plan = fftwf_plan_dft_r2c_2d(width, height, input, fftw3_output, FFTW_ESTIMATE);
        
        //fftw3计算
        struct timespec fftw3_start_time, fftw3_end_time;
        clock_gettime(CLOCK_MONOTONIC, &fftw3_start_time);
        fftwf_execute(fftw3_plan);
        clock_gettime(CLOCK_MONOTONIC, &fftw3_end_time);
        // 计算运行时间，单位为纳秒
        fftw3_time += (fftw3_end_time.tv_sec - fftw3_start_time.tv_sec) * 1000000000LL + (fftw3_end_time.tv_nsec - fftw3_start_time.tv_nsec);

        //MKL输出数据
        MKL_LONG dim_sizes[2] = {width, height};
        MKL_Complex8* mkl_output = (MKL_Complex8*)malloc((width / 2 + 1) * height * sizeof(MKL_Complex8));
        //设置步长
        MKL_LONG input_stride[3] = {0, width, 1 };
        MKL_LONG output_stride[3] = {0, width / 2 + 1, 1};
        //设置oneMKL参数
        DFTI_DESCRIPTOR_HANDLE handle = NULL;
        DftiCreateDescriptor(&handle, DFTI_SINGLE, DFTI_REAL, 2, dim_sizes);
        DftiSetValue(handle, DFTI_PLACEMENT, DFTI_NOT_INPLACE); //设置非原地变换
        DftiSetValue(handle, DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX); //设置共轭存储
        DftiSetValue(handle, DFTI_INPUT_STRIDES, input_stride);
        DftiSetValue(handle, DFTI_OUTPUT_STRIDES, output_stride);
        DftiCommitDescriptor(handle);

        //oneMKL计算
        struct timespec mkl_start_time, mkl_end_time;
        clock_gettime(CLOCK_MONOTONIC, &mkl_start_time);
        DftiComputeForward(handle, input, mkl_output);
        clock_gettime(CLOCK_MONOTONIC, &mkl_end_time);
        mkl_time += (mkl_end_time.tv_sec - mkl_start_time.tv_sec) * 1000000000LL + (mkl_end_time.tv_nsec - mkl_start_time.tv_nsec);
        
        //比较不同计算方法结果
        int notmatch_count = 0;
        for(int j = 0; j < (width / 2 + 1) * height; j++){
            float real_diff = fabs(fftw3_output[j][0] - mkl_output[j].real);
            float imag_diff = fabs(fftw3_output[j][1] - mkl_output[j].imag);
            if(real_diff > 1e-6 || imag_diff > 1e-6){
                notmatch_count++;
            }
        }

        if(notmatch_count > 0){
            printf("第%d轮计算结果不正确! %d \n", i, notmatch_count);
            not_correct_count++;
        }
        
        //释放资源
        fftwf_destroy_plan(fftw3_plan);
        fftwf_free(fftw3_output);
        DftiFreeDescriptor(&handle);
        free(input);
        free(mkl_output);
    }
    
    if(not_correct_count == 0){
        printf("%d轮计算结果全部正确\n", iteration);
    }
    printf("FFTW3  FFT 平均运行时间：%lld 纳秒\n", fftw3_time / iteration);
    printf("OneMKL FFT 平均运行时间：%lld 纳秒\n", mkl_time / iteration);
}