Github上某大佬的文章指出,Intel下的onemkL函数库对于大型矩阵的计算很厉害,如下图,在运算性能以及速度上优于诸多方法,本文旨在通过自己亲身实践,学习不同算法函数库用以求解二维实数到复数的快速傅里叶变换,文章最后会附带上全部源代码。(这是本人的第一篇文章,欢迎大家共同交流学习,共勉。)
oneMKL和FFTW3介绍
oneMKL
FFTW3
FFTW3(Fastest Fourier Transform in the West)是一个开源的软件库,用于计算快速傅里叶变换(FFT)。它提供了高效的算法和实现,用于在数字信号处理、图像处理、模拟和科学计算等领域进行频率域分析。FFTW3被广泛应用于各种编程语言和平台上。它提供了多种不同的FFT变换算法,以适应不同的数据大小和性能需求。
实验环境配置
处理器:Intel i5
虚拟机:vwmare17,乌班图18.04(推荐使用,个人觉得比较稳定,出了问题网上可查的解决方法也较多,不过也不必纠结于版本,能跑就行)
代码实现
首先,调用oneMKL封装的函数,随机生成2048*2048大小的单精度浮点数:
VSLStreamStatePtr stream;
vslNewStream(&stream,VSL_BRNG_MT19937,1);
vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD,stream,Nx*Nz,input,0.0f,1.0f);
vslDeleteStream(&stream);
使用FFTW3计算real to imag FFT2D:
fftwf_complex *outputFFTW=(fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex)*Nx*(Nz/2+1));
fftwf_plan status=fftwf_plan_dft_r2c_2d(Nx,Nz,input,outputFFTW,FFTW_MEASURE);
t0=wallclock_time();
fftwf_execute(status);
t1=wallclock_time();
其中,wallclock_time()是自定义的时间函数,如下:
double wallclock_time(void)
{
struct timeval s_val;
static struct timeval b_val;
double time;
static int base=0;
gettimeofday(&s_val,0);
if (!base) {
b_val = s_val;
base = 1;
return 0.0;
}
time = (double)(s_val.tv_sec-b_val.tv_sec) +
(double)(1e-6*((double)s_val.tv_usec-(double)b_val.tv_usec));
return (double)time;
}
使用oneMKL计算real to imag FFT2D:
MKL_LONG rs[3]={0,Nx,1};
MKL_LONG cs[3]={0,Nz/2+1,1};
MKL_Complex8 *outputMKL=(MKL_Complex8 *)malloc(sizeof(MKL_Complex8)*Nx*(Nz/2+1));
DFTI_DESCRIPTOR_HANDLE handle;
MKL_LONG sizeN[2]={Nx,Nz};
DftiCreateDescriptor(&handle,DFTI_SINGLE,DFTI_REAL,2,sizeN);
DftiSetValue(handle,DFTI_PLACEMENT,DFTI_NOT_INPLACE);
DftiSetValue(handle,DFTI_CONJUGATE_EVEN_STORAGE,DFTI_COMPLEX_COMPLEX);
DftiSetValue(handle,DFTI_INPUT_STRIDES,rs);
DftiSetValue(handle,DFTI_OUTPUT_STRIDES,cs);
DftiCommitDescriptor(handle);
t0=wallclock_time();
DftiComputeForward(handle,input,outputMKL);
t1=wallclock_time();
比较残差以及运算性能:
if(Nx==Nz)
{
printf("-------------------------\n");
diff=0;
for(i=0;i<Nx*(Nz/2+1);i++)
{
diff1=(double)fabs(outputMKL[i].real-outputFFTW[i][0]);
diff2=(double)fabs(outputMKL[i].imag-outputFFTW[i][1]);
if(diff1>0.000001 || diff2>0.000001)
{
diff=1;
break;
}
}
if(diff==0) fprintf(stderr," CORRECT (residual < 0.000001) bwetween FFTW3 with oneMKL\n");
else fprintf(stderr," ERROR (residual >0.000001 ) bwetween FFTW3 with oneMKL\n");
}
最终结果:
可以发现,oneMKL精度在0.000001数量级下与开源的FFTW3相同,同时性能更加优秀。(不同电脑配置加速率不一样)
全部源代码
按照如下方法编译,这要求你的环境上正确安装编译器。
icx fft_2d_r2c_fftw_and_onemkl.c -o fft_2d_r2c_fftw_and_onemkl -qmkl -std=c99 -lmkl_rt -lm
#include<stdio.h>
#include<stdlib.h>
#include<math.h>
#include<time.h>
#include<sys/time.h>
#include<stdbool.h>
#include<string.h>
#include<sys/resource.h>
#include<fftw3.h>
#include<mkl.h>
#include"mkl_vsl.h"
#include"mkl_service.h"
#include"mkl_dfti.h"
size_t getCurrentMemoryUsage();
double wallclock_time(void);
int main(int argc,char *argv[])
{
int i,Nx,Nz;
double t0,t1,t,k,diff1,diff2,diff;
size_t memoryUsage;
Nx=2048;
Nz=2048;
//N2=16;
k=log(Nz)/log(2)+1;
while(Nz<=Nx)
{
float *input=(float *)malloc(sizeof(float)*Nx*Nz);
/*Random numbers*/
VSLStreamStatePtr stream;
vslNewStream(&stream,VSL_BRNG_MT19937,1);
vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD,stream,Nx*Nz,input,0.0f,1.0f);
vslDeleteStream(&stream);
/*Use FFTW3 to do FFT2d*/
fftwf_complex *outputFFTW=(fftwf_complex *)fftwf_malloc(sizeof(fftwf_complex)*Nx*(Nz/2+1));
fftwf_plan status=fftwf_plan_dft_r2c_2d(Nx,Nz,input,outputFFTW,FFTW_MEASURE);
t0=wallclock_time();
fftwf_execute(status);
t1=wallclock_time();
t=t1-t0;
fprintf(stderr," FFTW Plan N=(%-5d * %-5d) time is %f s\n",Nx,Nz,t);
if(Nx==Nz)
{
t0=wallclock_time();
for(i=0;i<1000;i++) fftwf_execute(status);
t1=wallclock_time();
t=t1-t0;
fprintf(stderr," 1000 times FFTW Plan N=(%-5d * %-5d) time is %f s\n",Nx,Nz,t);
}
/*Use oneMKL to do FFT2d*/
MKL_LONG rs[3]={0,Nx,1};
MKL_LONG cs[3]={0,Nz/2+1,1};
MKL_Complex8 *outputMKL=(MKL_Complex8 *)malloc(sizeof(MKL_Complex8)*Nx*(Nz/2+1));
DFTI_DESCRIPTOR_HANDLE handle;
MKL_LONG sizeN[2]={Nx,Nz};
DftiCreateDescriptor(&handle,DFTI_SINGLE,DFTI_REAL,2,sizeN);
DftiSetValue(handle,DFTI_PLACEMENT,DFTI_NOT_INPLACE);
DftiSetValue(handle,DFTI_CONJUGATE_EVEN_STORAGE,DFTI_COMPLEX_COMPLEX);
DftiSetValue(handle,DFTI_INPUT_STRIDES,rs);
DftiSetValue(handle,DFTI_OUTPUT_STRIDES,cs);
DftiCommitDescriptor(handle);
t0=wallclock_time();
DftiComputeForward(handle,input,outputMKL);
t1=wallclock_time();
t=t1-t0;
fprintf(stderr," oneMKL Plan N=(%-5d * %-5d) time is %f s\n",Nx,Nz,t);
if(Nx==Nz)
{
t0=wallclock_time();
for(i=0;i<1000;i++) DftiComputeForward(handle,input,outputMKL);
t1=wallclock_time();
t=t1-t0;
fprintf(stderr," 1000 times oneMKL Plan N=(%-5d * %-5d) time is %f s\n",Nx,Nz,t);
}
/*Compare oneMKL and FFTW3*/
if(Nx==Nz)
{
printf("-------------------------\n");
diff=0;
for(i=0;i<Nx*(Nz/2+1);i++)
{
diff1=(double)fabs(outputMKL[i].real-outputFFTW[i][0]);
diff2=(double)fabs(outputMKL[i].imag-outputFFTW[i][1]);
if(diff1>0.000001 || diff2>0.000001)
{
diff=1;
break;
}
}
if(diff==0) fprintf(stderr," CORRECT (residual < 0.000001) bwetween FFTW3 with oneMKL\n");
else fprintf(stderr," ERROR (residual >0.000001 ) bwetween FFTW3 with oneMKL\n");
}
free(input);
free(outputMKL);
DftiFreeDescriptor(&handle);
//fftwf_destory_plan(status);
fftwf_free(outputFFTW);
Nz=pow(2.0,k);
k+=1.0;
}
memoryUsage = getCurrentMemoryUsage();
fprintf(stderr," CPU usage size : %zu KB \n",memoryUsage);
return 0;
}
size_t getCurrentMemoryUsage(){
struct rusage usage;
getrusage(RUSAGE_SELF, &usage);
return usage.ru_maxrss;
}
double wallclock_time(void)
{
struct timeval s_val;
static struct timeval b_val;
double time;
static int base=0;
gettimeofday(&s_val,0);
if (!base) {
b_val = s_val;
base = 1;
return 0.0;
}
time = (double)(s_val.tv_sec-b_val.tv_sec) +
(double)(1e-6*((double)s_val.tv_usec-(double)b_val.tv_usec));
return (double)time;
}