
CUDA的cuFFT库可以实现(复数C-复数C),(实数R-复数C)和(复数C-实数R)的单精度,双精度傅里叶变换。其变换前后的输入,输出数据的长度如图所示。在C2R和R2C模式中,根据埃尔米特对称性(Hermitian symmetry),实数序列对应的频域数据满足 $X_k = X_{N-k}^{*}$,其中 $*$ 代表共轭复数,因此长度为 $N$ 的实数序列只需保存 $N/2+1$ 个复数。cuFFT的傅里叶变换类型则利用了这些冗余,将计算量降到最低。





using namespace std;
// Divides the real (.x) and imaginary (.y) parts of each of the first
// `data_len` elements of `data` by `data_len`. main() launches it on the
// output of the inverse transform; cuFFT transforms are unnormalized, so a
// forward + inverse round trip scales the data by the transform length.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Any grid that
// provides at least `data_len` threads is valid; surplus threads do nothing.
__global__ void normalizing(cufftComplex* data, int data_len)
{
	int idx = blockDim.x*blockIdx.x + threadIdx.x;
	// Guard the grid tail so a launch that is not an exact multiple of the
	// block size cannot write past the end of the buffer.
	if (idx < data_len)
	{
		data[idx].x /= data_len;
		data[idx].y /= data_len;
	}
}
// Evaluates a CUDA runtime call exactly once and, when it does not return
// cudaSuccess, prints the line number of the call site followed by the
// runtime's description of the error. Execution continues after reporting.
//
// This is a macro rather than a function so that __LINE__ expands where
// Check(...) is written; inside a helper function __LINE__ would always be
// the helper's own line and every failure would report the same number.
// The do/while(0) wrapper makes `Check(x);` behave as a single statement.
#define Check(call)                                                              \
	do {                                                                         \
		cudaError_t check_status_ = (call);                                      \
		if (check_status_ != cudaSuccess)                                        \
		{                                                                        \
			std::cout << "行号:" << __LINE__ << std::endl;                       \
			std::cout << "错误:" << cudaGetErrorString(check_status_) << std::endl; \
		}                                                                        \
	} while (0)
// Reports a failed cuFFT call. `what` is a label naming the call that
// produced `result`. Prints nothing when `result` is CUFFT_SUCCESS; otherwise
// prints the label and the numeric cufftResult code. Like Check, it reports
// and lets execution continue.
static void CheckCufft(cufftResult result, const char* what)
{
	if (result != CUFFT_SUCCESS)
		cout << "错误:" << what << " cufftResult=" << static_cast<int>(result) << endl;
}

// Single-precision complex-to-complex (C2C) round trip:
//   1. fill a pinned host buffer with (i+1) + j*(i+1), i = 0..Nt-1
//   2. forward FFT on the device, copy back and print the spectrum
//   3. inverse FFT, divide by Nt on the device, copy back and print
// All buffers and plans are released before returning.
int main()
{
	const int Nt = 256;                 // number of complex samples per transform
	const int BATCH = 1;                // number of transforms per plan
	const int THREADS_PER_BLOCK = 128;
	const size_t bytes = Nt * sizeof(cufftComplex);

	cufftComplex* host_in, *host_out, *device_in, *device_out;
	Check(cudaMallocHost((void**)&host_in, bytes));
	Check(cudaMallocHost((void**)&host_out, bytes));
	for (int i = 0; i < Nt; i++)
	{
		host_in[i].x = i + 1;
		host_in[i].y = i + 1;
	}
	Check(cudaMalloc((void**)&device_in, bytes));
	Check(cudaMalloc((void**)&device_out, bytes));
	Check(cudaMemcpy(device_in, host_in, bytes, cudaMemcpyHostToDevice));

	// Both plans are created with identical parameters; the transform
	// direction is selected by the last argument of cufftExecC2C.
	cufftHandle cufftForwrdHandle, cufftInverseHandle;
	CheckCufft(cufftPlan1d(&cufftForwrdHandle, Nt, CUFFT_C2C, BATCH), "cufftPlan1d(forward)");
	CheckCufft(cufftPlan1d(&cufftInverseHandle, Nt, CUFFT_C2C, BATCH), "cufftPlan1d(inverse)");

	CheckCufft(cufftExecC2C(cufftForwrdHandle, device_in, device_out, CUFFT_FORWARD), "cufftExecC2C(forward)");

	// Blocking copy: also waits for the forward transform to finish.
	Check(cudaMemcpy(host_out, device_out, bytes, cudaMemcpyDeviceToHost));

	cout << "正变换结果:" << endl;
	for (int i = 0; i < Nt; i++)
		cout << host_out[i].x << "+j*" << host_out[i].y << endl;

	CheckCufft(cufftExecC2C(cufftInverseHandle, device_out, device_in, CUFFT_INVERSE), "cufftExecC2C(inverse)");

	// Ceil-divide so every element gets a thread even when Nt is not a
	// multiple of the block size (plain Nt / 128 would truncate).
	dim3 grid((Nt + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
	dim3 block(THREADS_PER_BLOCK);
	normalizing<<<grid, block>>>(device_in, Nt);
	// A kernel launch returns no status; launch-configuration errors are
	// only visible through cudaGetLastError().
	Check(cudaGetLastError());

	Check(cudaMemcpy(host_in, device_in, bytes, cudaMemcpyDeviceToHost));

	cout << "反变换结果:" << endl;
	for (int i = 0; i < Nt; i++)
		cout << host_in[i].x << "+j*" << host_in[i].y << endl;

	// Release plans, device memory and pinned host memory.
	CheckCufft(cufftDestroy(cufftForwrdHandle), "cufftDestroy(forward)");
	CheckCufft(cufftDestroy(cufftInverseHandle), "cufftDestroy(inverse)");
	Check(cudaFree(device_in));
	Check(cudaFree(device_out));
	Check(cudaFreeHost(host_in));
	Check(cudaFreeHost(host_out));
	return 0;
}


using namespace std;
// Divides the real (.x) and imaginary (.y) parts of each of the first
// `data_len` elements of `data` by `data_len`. main() launches it on the
// output of the inverse transform; cuFFT transforms are unnormalized, so a
// forward + inverse round trip scales the data by the transform length.
//
// Launch layout: 1D grid of 1D blocks, one thread per element. Any grid that
// provides at least `data_len` threads is valid; surplus threads do nothing.
__global__ void normalizing(cufftDoubleComplex* data,int data_len)
{
	int idx = blockDim.x*blockIdx.x + threadIdx.x;
	// Guard the grid tail so a launch that is not an exact multiple of the
	// block size cannot write past the end of the buffer.
	if (idx < data_len)
	{
		data[idx].x /= data_len;
		data[idx].y /= data_len;
	}
}
// Evaluates a CUDA runtime call exactly once and, when it does not return
// cudaSuccess, prints the line number of the call site followed by the
// runtime's description of the error. Execution continues after reporting.
//
// This is a macro rather than a function so that __LINE__ expands where
// Check(...) is written; inside a helper function __LINE__ would always be
// the helper's own line and every failure would report the same number.
// The do/while(0) wrapper makes `Check(x);` behave as a single statement.
#define Check(call)                                                              \
	do {                                                                         \
		cudaError_t check_status_ = (call);                                      \
		if (check_status_ != cudaSuccess)                                        \
		{                                                                        \
			std::cout << "行号:" << __LINE__ << std::endl;                       \
			std::cout << "错误:" << cudaGetErrorString(check_status_) << std::endl; \
		}                                                                        \
	} while (0)
// Reports a failed cuFFT call. `what` is a label naming the call that
// produced `result`. Prints nothing when `result` is CUFFT_SUCCESS; otherwise
// prints the label and the numeric cufftResult code. Like Check, it reports
// and lets execution continue.
static void CheckCufft(cufftResult result, const char* what)
{
	if (result != CUFFT_SUCCESS)
		cout << "错误:" << what << " cufftResult=" << static_cast<int>(result) << endl;
}

// Double-precision complex-to-complex (Z2Z) round trip:
//   1. fill a pinned host buffer with (i+1) + j*(i+1), i = 0..Nt-1
//   2. forward FFT on the device, copy back and print the spectrum
//   3. inverse FFT, divide by Nt on the device, copy back and print
// All buffers and plans are released before returning.
int main()
{
	const int Nt = 256;                 // number of complex samples per transform
	const int BATCH = 1;                // number of transforms per plan
	const int THREADS_PER_BLOCK = 128;
	const size_t bytes = Nt * sizeof(cufftDoubleComplex);

	cufftDoubleComplex* host_in, *host_out, *device_in, *device_out;
	Check(cudaMallocHost((void**)&host_in, bytes));
	Check(cudaMallocHost((void**)&host_out, bytes));
	for (int i = 0; i < Nt; i++)
	{
		host_in[i].x = i + 1;
		host_in[i].y = i + 1;
	}
	Check(cudaMalloc((void**)&device_in, bytes));
	Check(cudaMalloc((void**)&device_out, bytes));
	Check(cudaMemcpy(device_in, host_in, bytes, cudaMemcpyHostToDevice));

	// Both plans are created with identical parameters; the transform
	// direction is selected by the last argument of cufftExecZ2Z.
	cufftHandle cufftForwrdHandle, cufftInverseHandle;
	CheckCufft(cufftPlan1d(&cufftForwrdHandle, Nt, CUFFT_Z2Z, BATCH), "cufftPlan1d(forward)");
	CheckCufft(cufftPlan1d(&cufftInverseHandle, Nt, CUFFT_Z2Z, BATCH), "cufftPlan1d(inverse)");

	CheckCufft(cufftExecZ2Z(cufftForwrdHandle, device_in, device_out, CUFFT_FORWARD), "cufftExecZ2Z(forward)");

	// Blocking copy: also waits for the forward transform to finish.
	Check(cudaMemcpy(host_out, device_out, bytes, cudaMemcpyDeviceToHost));

	cout << "正变换结果:" << endl;
	for (int i = 0; i < Nt; i++)
		cout << host_out[i].x<< "+j*" << host_out[i].y << endl;

	CheckCufft(cufftExecZ2Z(cufftInverseHandle, device_out, device_in, CUFFT_INVERSE), "cufftExecZ2Z(inverse)");

	// Ceil-divide so every element gets a thread even when Nt is not a
	// multiple of the block size (plain Nt / 128 would truncate).
	dim3 grid((Nt + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
	dim3 block(THREADS_PER_BLOCK);
	normalizing<<<grid, block>>>(device_in, Nt);
	// A kernel launch returns no status; launch-configuration errors are
	// only visible through cudaGetLastError().
	Check(cudaGetLastError());

	Check(cudaMemcpy(host_in, device_in, bytes, cudaMemcpyDeviceToHost));

	cout << "反变换结果:" << endl;
	for (int i = 0; i < Nt; i++)
		cout << host_in[i].x << "+j*" << host_in[i].y << endl;

	// Release plans, device memory and pinned host memory.
	CheckCufft(cufftDestroy(cufftForwrdHandle), "cufftDestroy(forward)");
	CheckCufft(cufftDestroy(cufftInverseHandle), "cufftDestroy(inverse)");
	Check(cudaFree(device_in));
	Check(cudaFree(device_out));
	Check(cudaFreeHost(host_in));
	Check(cudaFreeHost(host_out));
	return 0;
}

(2)R2C(D2Z)模式-双精度(单精度类似)&&C2R(Z2D)模式双精度(单精度类似)  包括正反变换

using namespace std;
// Evaluates a CUDA runtime call exactly once and, when it does not return
// cudaSuccess, prints the line number of the call site (__LINE__ expands
// where the macro is used) followed by the runtime's description of the
// error. Execution continues after reporting.
//
// `call` is parenthesised so any expression can be passed, and the body is
// wrapped in do/while(0) so that `Check(x);` is a single statement and is
// safe inside an unbraced if/else.
#define Check(call)                                                              \
	do {                                                                         \
		cudaError_t check_status_ = (call);                                      \
		if (check_status_ != cudaSuccess)                                        \
		{                                                                        \
			std::cout << "行号:" << __LINE__ << std::endl;                       \
			std::cout << "错误:" << cudaGetErrorString(check_status_) << std::endl; \
		}                                                                        \
	} while (0)

// Divides each of the first `data_len` entries of `data` by `data_len`.
// One thread handles one element (1D grid of 1D blocks); threads whose
// global index falls at or beyond `data_len` return without touching memory.
__global__ void normalizing(cufftDoubleReal* data, int data_len)
{
	const int element = threadIdx.x + blockIdx.x * blockDim.x;
	if (element >= data_len)
		return;
	data[element] = data[element] / data_len;
}

// Reports a failed cuFFT call. `what` is a label naming the call that
// produced `result`. Prints nothing when `result` is CUFFT_SUCCESS; otherwise
// prints the label and the numeric cufftResult code. Like Check, it reports
// and lets execution continue.
static void CheckCufft(cufftResult result, const char* what)
{
	if (result != CUFFT_SUCCESS)
		cout << "错误:" << what << " cufftResult=" << static_cast<int>(result) << endl;
}

// Double-precision real-to-complex (D2Z) / complex-to-real (Z2D) round trip:
//   1. fill a pinned host buffer with the reals 1..Nt
//   2. D2Z on the device, copy back and print the Nt/2+1 complex outputs
//   3. Z2D back to Nt reals, divide by Nt on the device, copy back and print
// All buffers and plans are released before returning.
int main()
{
	const int Nt = 512;                      // number of real samples
	const int BATCH = 1;                     // number of transforms per plan
	const int THREADS_PER_BLOCK = 128;
	const int complexCount = Nt / 2 + 1;     // complex values exchanged with D2Z / Z2D
	const size_t realBytes = Nt * sizeof(cufftDoubleReal);
	const size_t complexBytes = complexCount * sizeof(cufftDoubleComplex);

	cufftDoubleReal* host_in,  *device_in;
	cufftDoubleComplex* host_out, *device_out;
	Check(cudaMallocHost((void**)&host_in, realBytes));
	Check(cudaMallocHost((void**)&host_out, complexBytes));
	for (int i = 0; i < Nt; i++)
		host_in[i] = i + 1;
	Check(cudaMalloc((void**)&device_in, realBytes));
	Check(cudaMalloc((void**)&device_out, complexBytes));
	Check(cudaMemcpy(device_in, host_in, realBytes, cudaMemcpyHostToDevice));

	cufftHandle cufftForwrdHandle, cufftInverseHandle;
	CheckCufft(cufftPlan1d(&cufftForwrdHandle, Nt, CUFFT_D2Z, BATCH), "cufftPlan1d(D2Z)");
	CheckCufft(cufftPlan1d(&cufftInverseHandle, Nt, CUFFT_Z2D, BATCH), "cufftPlan1d(Z2D)");

	// The direction of D2Z is fixed, so no direction argument is passed.
	CheckCufft(cufftExecD2Z(cufftForwrdHandle, device_in, device_out), "cufftExecD2Z");

	// Blocking copy: also waits for the forward transform to finish.
	Check(cudaMemcpy(host_out, device_out, complexBytes, cudaMemcpyDeviceToHost));

	cout << "正变换结果:" << endl;
	for (int i = 0; i < complexCount; i++)
		cout << host_out[i].x << "+j*" << host_out[i].y << endl;

	// The direction of Z2D is fixed, so no direction argument is passed.
	CheckCufft(cufftExecZ2D(cufftInverseHandle, device_out, device_in), "cufftExecZ2D");

	// The kernel normalises the Nt real outputs, so the grid must be sized
	// from Nt (ceil-divided by the block size), not from the Nt/2+1 complex
	// count: a grid derived from the complex count only covers Nt by
	// coincidence for particular sizes.
	dim3 grid((Nt + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK);
	dim3 block(THREADS_PER_BLOCK);
	normalizing<<<grid, block>>>(device_in, Nt);
	// A kernel launch returns no status; launch-configuration errors are
	// only visible through cudaGetLastError().
	Check(cudaGetLastError());

	Check(cudaMemcpy(host_in, device_in, realBytes, cudaMemcpyDeviceToHost));

	cout << "反变换结果:" << endl;
	for (int i = 0; i < Nt; i++)
		cout << host_in[i] << endl;

	// Release plans, device memory and pinned host memory.
	CheckCufft(cufftDestroy(cufftForwrdHandle), "cufftDestroy(D2Z)");
	CheckCufft(cufftDestroy(cufftInverseHandle), "cufftDestroy(Z2D)");
	Check(cudaFree(device_in));
	Check(cudaFree(device_out));
	Check(cudaFreeHost(host_in));
	Check(cudaFreeHost(host_out));
	return 0;
}

This document describes CUFFT, the NVIDIA® CUDA™ Fast Fourier Transform (FFT) library. The FFT is a divide-and-conquer algorithm for efficiently computing discrete Fourier transforms of complex or real-valued data sets. It is one of the most important and widely used numerical algorithms in computational physics and general signal processing. The CUFFT library provides a simple interface for computing parallel FFTs on an NVIDIA GPU, which allows users to leverage the floating-point power and parallelism of the GPU without having to develop a custom CUDA FFT implementation. FFT libraries typically vary in terms of supported transform sizes and data types. For example, some libraries only implement radix-2 FFTs, restricting the transform size to a power of two. The CUFFT library aims to support a wide range of FFT options efficiently on NVIDIA GPUs. This version of the CUFFT library supports the following features:
- Complex and real-valued input and output
- 1D, 2D, and 3D transforms
- Batch execution for doing multiple transforms of any dimension in parallel
- Transform sizes up to 64 million elements in single precision and up to 128 million elements in double precision in any dimension, limited by the available GPU memory
- In-place and out-of-place transforms
- Double-precision (64-bit floating point) on compatible hardware (sm1.3 and later)
- Support for streamed execution, enabling asynchronous computation and data movement
- FFTW compatible data layouts
- Arbitrary intra- and inter-dimension element strides
- Thread-safe API that can be called from multiple independent host threads
评论 3




当前余额3.43前往充值 >
领取后你会自动成为博主和红包主的粉丝 规则
钱包余额 0


