CUDA官方库函数：傅里叶变换（CUFFT）函数的介绍及使用方法

最新推荐文章于 2025-03-23 12:07:27 发布

置顶杨咩咩ing

最新推荐文章于 2025-03-23 12:07:27 发布

阅读量5.6k

点赞数 52

文章标签： c++ matlab gpu算力

本文链接：https://blog.csdn.net/zy4213/article/details/136017680

版权

本文主要在前文的基础上进一步分析CUFFT函数库的使用方法，并利用matlab对运算结果做了对比。

CUFFT函数的使用流程

下面对cuFFT库的使用流程做以下阐述并且不失一般性：

变量内存分配及初始化
创建并配置cuFFT句柄：首先使用cufftHandle创建cuFFT句柄，然后调用函数cufftPlan1D()、cufftPlan2D()配置参数，其中cufftPlan1D()表示对一维向量执行FFT/IFFT操作，cufftPlan2D()表示先对矩阵的行执行FFT/IFFT操作，再对矩阵的列执行FFT/IFFT操作。指定待执行FFT操作的信号输入与输出类型，其中CUFFT_C2C表示输入输出均为复数，CUFFT_C2R表示输入复数输出实数等四类。
执行FFT策略：使用cufftExecC2C()函数执行FFT运算，此函数可以通过参数指定执行傅里叶变换(CUFFT_FORWARD)或逆傅里叶变换(CUFFT_INVERSE)。
销毁句柄：调用cufftDestroy()函数实现句柄销毁功能。

CUFFT函数的使用示例及对比

CUDA官方函数库：傅里叶变换（CUFFT）相关库函数介绍https://blog.csdn.net/zy4213/article/details/136220080?spm=1001.2014.3001.5502和CUDA官方函数库：傅里叶变换（CUFFT）相关库函数使用示例https://blog.csdn.net/zy4213/article/details/136288489?spm=1001.2014.3001.5502 两篇文章对CUFFT的库函数的使用方法做了介绍并给出了示例，但是并没有做更一步的分析，下面根据自己的了解，以代码及实际运行结果的形式对CUFFT函数做进一步分析。

1、数据生成

首先利用matlab生成1024个随机数据并以单精度形式存入数据文件“inData_cufftComplex1”，方便后续数据对比使用。

clc
clear all
close all;

iLen=1024;

%%
if(1)
    j=1;
    for i=1:1:iLen
        ccInData(j)=rand();
        ccInData(j+1)=rand();
        j=j+2;
    end
    filename=fopen('data/inData_cufftComplex1.dat','w');
    fwrite(filename,ccInData,'float32');
    fclose(filename);
    ccInData_complex=complex(ccInData(1:2:end),ccInData(2:2:end));
%     ccInDataFFT=fft(ccInData_complex);
end

2、一维行矩阵傅里叶变换

首先利用cufftExecC2C()函数执行一维行矩阵的傅里叶变换。并将计算结果写入"savaData_cufftComple.dat"文件保存，方便matlab读取并做对比。

#include"stdio.h"
#include"cuda.h"
#include <cufft.h>
void readFileComplex(cufftComplex* matrix,int iReadLen);
void writeFileComplex(cufftComplex *matrix,int iWriteLen);
int main(int argc, char **argv) {

	int col = 1;
	int row = 1024;
	int length = row * col;
	//主机端(CPU)变量定义及初始化
	cufftComplex *inputH = (cufftComplex*) malloc(sizeof(cufftComplex) * length);
	cufftComplex *outputH = (cufftComplex*) malloc(sizeof(cufftComplex) * length);
	memset(inputH, 0, sizeof(cufftComplex) * length);
	memset(outputH, 0, sizeof(cufftComplex) * length);
	//设备端(GPU)变量定义及初始化
	cufftComplex *inputD;
	cudaMalloc((void**) &inputD, length * sizeof(cufftComplex));
	cufftComplex *outputD;
	cudaMalloc((void**) &outputD, length * sizeof(cufftComplex));
	cudaMemset(inputD, 0, length * sizeof(cufftComplex));
	cudaMemset(outputD, 0, length * sizeof(cufftComplex));
	//读取二进制文件中的数据
	readFileComplex(inputH,length);
	//将数据从主机端复制至设备端
	cudaMemcpy(inputD, inputH, sizeof(cufftComplex) * length, cudaMemcpyHostToDevice);

	//1：创建用于FFT变换的句柄
	cufftHandle plan1d;
	//2：配置信号长度及输入输出数据类型信息
	cufftPlan1d(&plan1d, row, CUFFT_C2C, col);
	//3：调用函数执行或正或逆傅里叶变换
	cufftExecC2C(plan1d, inputD, outputD, CUFFT_FORWARD);
	//将数据从设备端复制至主机端
	cudaMemcpy(outputH, outputD, sizeof(cufftComplex) * length, cudaMemcpyDeviceToHost);
	//将数据写入二进制文件
	writeFileComplex(outputH,length);

}
void readFileComplex(cufftComplex* matrix,int iReadLen){
	FILE* fp;
	if ((fp = fopen("inData_cufftComplex.dat", "rb")) == NULL) {
		printf("file open failed\n");
	} else {
		fread(matrix , sizeof(cufftComplex), iReadLen, fp);
	}
}
void writeFileComplex(cufftComplex *matrix,int iWriteLen){
	FILE* fp;
	if((fp=fopen("saveData_cufftComplex.dat","wb"))==NULL){
		printf("file open failed\n");
	}else{
		fwrite(matrix, sizeof(cufftComplex) , iWriteLen, fp);
	}
}

编译方法：

//test.cu代表文件名，test代表可执行文件名
nvcc test.cu -o test -lcufft

编译通过之后执行可执行文件即可得到运行结果：

./test

利用matlab读取生成的结果文件并对比

clc
clear all
close all;

row=1;
col=1024;
len=row*col;
input=zeros(1,len);
for i=1:1:len
    input(i)=complex(i-1,i);
end
output=fft(input);

filename='data/inData_cufftComplex.dat';
fileID=fopen(filename,'rb');
ccInData=fread(fileID,'float32');
ccInData_complex=complex(ccInData(1:2:end),ccInData(2:2:end));
%对输入信号做FFT运算
ccInDataFFT=fft(ccInData_complex);             %求fft


filename='data/saveData_cufftComplex.dat';
fileID=fopen(filename,'rb');
ccInData_GPU=fread(fileID,'float32');
ccInData_GPU_Complex=complex(ccInData_GPU(1:2:end),ccInData_GPU(2:2:end));

变量ccInDataFFT即为matlab对本段数据做fft之后的结果，ccInData_GPU_Complex为GPU卡通过cufft函数做fft之后的结果，在matlab运行可以对比看出两者运算结果一致，仅存在精度上的差异（可忽略不计）。本次对比未能提供对比结果图，请读者自行测试。

3、二维矩阵对列数据傅里叶变换

由于GPU的数据存储是按照读入数据顺序存储，因此即使是二维矩阵的数据在GPU内也是以一维的形式存储，因此，需要示例说明在对二维矩阵做傅里叶变换时，参数的设定对傅里叶变换的影响。主要与cufftPlan1d(&plan1d, row,CUFFT_C2C,col)的第二个和第四个参数有关;具体见以下程序。

#include"stdio.h"
#include"cuda.h"
#include <cufft.h>
void readFileComplex(cufftComplex* matrix,int iReadLen);
void writeFileComplex(cufftComplex *matrix,int iWriteLen);
int main(int argc, char **argv) {

	int col = 8;
	int row = 128;
	int length = row * col;
	//主机端(CPU)变量定义及初始化
	cufftComplex *inputH = (cufftComplex*) malloc(sizeof(cufftComplex) * length);
	cufftComplex *outputH = (cufftComplex*) malloc(sizeof(cufftComplex) * length);
	memset(inputH, 0, sizeof(cufftComplex) * length);
	memset(outputH, 0, sizeof(cufftComplex) * length);
	//设备端(GPU)变量定义及初始化
	cufftComplex *inputD;
	cudaMalloc((void**) &inputD, length * sizeof(cufftComplex));
	cufftComplex *outputD;
	cudaMalloc((void**) &outputD, length * sizeof(cufftComplex));
	cudaMemset(inputD, 0, length * sizeof(cufftComplex));
	cudaMemset(outputD, 0, length * sizeof(cufftComplex));
	//读取二进制文件中的数据
	readFileComplex(inputH,length);
	//将数据从主机端复制至设备端
	cudaMemcpy(inputD, inputH, sizeof(cufftComplex) * length, cudaMemcpyHostToDevice);

	//1：创建用于FFT变换的句柄
	cufftHandle plan1d;
	//2：配置信号长度及输入输出数据类型信息
	cufftPlan1d(&plan1d,  row,CUFFT_C2C,col);
	//3：调用函数执行或正或逆傅里叶变换
	cufftExecC2C(plan1d, inputD, outputD, CUFFT_FORWARD);
	//将数据从设备端复制至主机端
	cudaMemcpy(outputH, outputD, sizeof(cufftComplex) * length, cudaMemcpyDeviceToHost);
	//将数据写入二进制文件
	writeFileComplex(outputH,length);
	for(int i=0;i<5;i++){
		printf("output.x=%f\n",outputH[i].x);
		printf("output.y=%f\n",outputH[i].y);
	}
void readFileComplex(cufftComplex* matrix,int iReadLen){
	FILE* fp;
	if ((fp = fopen("inData_cufftComplex.dat", "rb")) == NULL) {
		printf("file open failed\n");
	} else {
		fread(matrix , sizeof(cufftComplex), iReadLen, fp);
	}
}
void writeFileComplex(cufftComplex *matrix,int iWriteLen){
	FILE* fp;
	if((fp=fopen("saveData_cufftComplex.dat","wb"))==NULL){
		printf("file open failed\n");
	}else{
		fwrite(matrix, sizeof(cufftComplex) , iWriteLen, fp);
	}
}

编译方法：

//test.cu代表文件名，test代表可执行文件名
nvcc test.cu -o test -lcufft

编译通过之后执行可执行文件即可得到运行结果：

./test

利用matlab读取生成的结果文件并对比

clc
clear all
close all;

row=1;
col=1024;
len=row*col;
input=zeros(1,len);
for i=1:1:len
    input(i)=complex(i-1,i);
end
output=fft(input);

filename='data/inData_cufftComplex.dat';
fileID=fopen(filename,'rb');
ccInData=fread(fileID,'float32');
ccInData_complex1=complex(ccInData(1:2:end),ccInData(2:2:end));
ccInData_complex=reshape(ccInData_complex1,128,8);  %变为128行8列
%对输入信号做FFT运算
ccInDataFFT=fft(ccInData_complex,[],1);              %对每一列数据做fft


filename='data/saveData_cufftComplex.dat';
fileID=fopen(filename,'rb');
ccInData_GPU=fread(fileID,'float32');
ccInData_GPU_Complex=complex(ccInData_GPU(1:2:end),ccInData_GPU(2:2:end));
ccInData_GPU_Complex=reshape(ccInData_GPU_Complex,128,8);

对比运行结果如下：

其中左边红框数据为GPU卡通过cufft函数对二维矩阵做fft之后的结果，右边红框为matlab矩阵灭一列做fft的结果，两个变量数据一致，以此来说明二维矩阵中cufftPlan1d()函数的使用方法。

4、二维矩阵行列傅里叶变换

前面介绍了cufftPlan1d()函数对行或者列做傅里叶变换的方法，下面介绍cufftPlan2d()函数：同时对行和列做傅里叶变换。首先给出GPU实现方案。

#include"stdio.h"
#include"cuda.h"
#include <cufft.h>
void readFileComplex(cufftComplex* matrix,int iReadLen);
void writeFileComplex(cufftComplex *matrix,int iWriteLen);
int main(int argc, char **argv) {

	int col = 8;
	int row = 128;
	int length = row * col;
	//主机端(CPU)变量定义及初始化
	cufftComplex *inputH = (cufftComplex*) malloc(sizeof(cufftComplex) * length);
	cufftComplex *outputH = (cufftComplex*) malloc(sizeof(cufftComplex) * length);
	memset(inputH, 0, sizeof(cufftComplex) * length);
	memset(outputH, 0, sizeof(cufftComplex) * length);
	//设备端(GPU)变量定义及初始化
	cufftComplex *inputD;
	cudaMalloc((void**) &inputD, length * sizeof(cufftComplex));
	cufftComplex *outputD;
	cudaMalloc((void**) &outputD, length * sizeof(cufftComplex));
	cudaMemset(inputD, 0, length * sizeof(cufftComplex));
	cudaMemset(outputD, 0, length * sizeof(cufftComplex));
	//读取二进制文件中的数据
	readFileComplex(inputH,length);
	//将数据从主机端复制至设备端
	cudaMemcpy(inputD, inputH, sizeof(cufftComplex) * length, cudaMemcpyHostToDevice);

	//1：创建用于FFT变换的句柄
	cufftHandle plan2d;
	//2：配置信号长度及输入输出数据类型信息
	cufftPlan2d(&plan2d,  col, row,CUFFT_C2C);
//	cufftPlan1d(&plan1d,  row,CUFFT_C2C,col);
	//3：调用函数执行或正或逆傅里叶变换
	cufftExecC2C(plan2d, inputD, outputD, CUFFT_FORWARD);
	//将数据从设备端复制至主机端
	cudaMemcpy(outputH, outputD, sizeof(cufftComplex) * length, cudaMemcpyDeviceToHost);
	//将数据写入二进制文件
	writeFileComplex(outputH,length);
	for(int i=0;i<5;i++){
		printf("output.x=%f\n",outputH[i].x);
		printf("output.y=%f\n",outputH[i].y);
	}


}
void readFileComplex(cufftComplex* matrix,int iReadLen){
	FILE* fp;
	if ((fp = fopen("inData_cufftComplex.dat", "rb")) == NULL) {
		printf("file open failed\n");
	} else {
		fread(matrix , sizeof(cufftComplex), iReadLen, fp);
	}
}
void writeFileComplex(cufftComplex *matrix,int iWriteLen){
	FILE* fp;
	if((fp=fopen("saveData_cufftComplex.dat","wb"))==NULL){
		printf("file open failed\n");
	}else{
		fwrite(matrix, sizeof(cufftComplex) , iWriteLen, fp);
	}
}

编译方法：

//test.cu代表文件名，test代表可执行文件名
nvcc test.cu -o test -lcufft

编译通过之后执行可执行文件即可得到运行结果：

./test

然后给出matlab实现方案。

clc
clear all
close all;

row=1;
col=1024;
len=row*col;
input=zeros(1,len);
for i=1:1:len
    input(i)=complex(i-1,i);
end
output=fft(input);

filename='data/inData_cufftComplex.dat';
fileID=fopen(filename,'rb');
ccInData=fread(fileID,'float32');
ccInData_complex1=complex(ccInData(1:2:end),ccInData(2:2:end));
ccInData_complex=reshape(ccInData_complex1,128,8);  %变为128行8列
%对输入信号做FFT运算
% ccInDataFFT=fft(ccInData_complex,[],1);              %对每一列数据做fft

ccInDataFFT=fft(fft(ccInData_complex,[],1),[],2);              %对每一行和列数据做fft


filename='data/saveData_cufftComplex.dat';
fileID=fopen(filename,'rb');
ccInData_GPU=fread(fileID,'float32');
ccInData_GPU_Complex=complex(ccInData_GPU(1:2:end),ccInData_GPU(2:2:end));
ccInData_GPU_Complex=reshape(ccInData_GPU_Complex,128,8);

经过对比，两者运行结果一致。请读者自行运行代码测试。

5、矩阵的逆傅里叶变换

前面文章说明了cufft库函数在做逆傅里叶变换时并没有处理信号长度，下面给出实测证明。

首先给出GPU实现方法

#include"stdio.h"
#include"cuda.h"
#include <cufft.h>
void readFileComplex(cufftComplex* matrix,int iReadLen);
void writeFileComplex(cufftComplex *matrix,int iWriteLen);
int main(int argc, char **argv) {

	int col = 1;
	int row = 1024;
	int length = row * col;
	//主机端(CPU)变量定义及初始化
	cufftComplex *inputH = (cufftComplex*) malloc(sizeof(cufftComplex) * length);
	cufftComplex *outputH = (cufftComplex*) malloc(sizeof(cufftComplex) * length);
	memset(inputH, 0, sizeof(cufftComplex) * length);
	memset(outputH, 0, sizeof(cufftComplex) * length);
	//设备端(GPU)变量定义及初始化
	cufftComplex *inputD;
	cudaMalloc((void**) &inputD, length * sizeof(cufftComplex));
	cufftComplex *outputD;
	cudaMalloc((void**) &outputD, length * sizeof(cufftComplex));
	cudaMemset(inputD, 0, length * sizeof(cufftComplex));
	cudaMemset(outputD, 0, length * sizeof(cufftComplex));
	//读取二进制文件中的数据
	readFileComplex(inputH,length);
	//将数据从主机端复制至设备端
	cudaMemcpy(inputD, inputH, sizeof(cufftComplex) * length, cudaMemcpyHostToDevice);

	//1：创建用于FFT变换的句柄
	cufftHandle plan1d;
	//2：配置信号长度及输入输出数据类型信息
	cufftPlan1d(&plan1d, row, CUFFT_C2C, col);
	//3：调用函数执行或正或逆傅里叶变换
	cufftExecC2C(plan1d, inputD, outputD, CUFFT_INVERSE);
	//将数据从设备端复制至主机端
	cudaMemcpy(outputH, outputD, sizeof(cufftComplex) * length, cudaMemcpyDeviceToHost);
	//将数据写入二进制文件
	writeFileComplex(outputH,length);

}
void readFileComplex(cufftComplex* matrix,int iReadLen){
	FILE* fp;
	if ((fp = fopen("inData_cufftComplex.dat", "rb")) == NULL) {
		printf("file open failed\n");
	} else {
		fread(matrix , sizeof(cufftComplex), iReadLen, fp);
	}
}
void writeFileComplex(cufftComplex *matrix,int iWriteLen){
	FILE* fp;
	if((fp=fopen("saveData_cufftComplex.dat","wb"))==NULL){
		printf("file open failed\n");
	}else{
		fwrite(matrix, sizeof(cufftComplex) , iWriteLen, fp);
	}
}

然后给出matlab运行结果

clc
clear all
close all;

row=1;
col=1024;
len=row*col;
input=zeros(1,len);
for i=1:1:len
    input(i)=complex(i-1,i);
end
output=fft(input);

filename='data/inData_cufftComplex.dat';
fileID=fopen(filename,'rb');
ccInData=fread(fileID,'float32');
ccInData_complex=complex(ccInData(1:2:end),ccInData(2:2:end));
%对输入信号做FFT运算
ccInDataIFFT=ifft(ccInData_complex);             %求ifft


filename='data/saveData_cufftComplex.dat';
fileID=fopen(filename,'rb');
ccInData_GPU=fread(fileID,'float32');
ccInData_GPU_Complex=complex(ccInData_GPU(1:2:end),ccInData_GPU(2:2:end));
%通过画图的方式将通过matlab做FFT运算后的数据与GPU做FFT运算后的数据通过画图的方式做对比

ccInData_GPU_Complex_inv=ccInData_GPU_Complex./1024;

下面是对比结果：