使用cuFFT做一维FFT的完整示例如下:
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Include CUDA runtime and CUFFT
#include <cuda_runtime.h>
#include <cufft.h>
// Helper functions for CUDA
#include <helper_functions.h>
#include <helper_cuda.h>
#define SIGNAL_SIZE 10
/*
 * Runs one forward 1-D complex-to-complex FFT of SIGNAL_SIZE points with
 * cuFFT and prints the spectrum, one complex value per line.
 *
 * Steps: fill a host signal with x = y = i, copy it to the device, plan and
 * execute the transform out-of-place, copy the result back, print it, then
 * release every resource and print the elapsed time of the whole run.
 *
 * Returns 0 on success and EXIT_FAILURE if a host allocation fails. CUDA and
 * cuFFT failures are handled by checkCudaErrors (from helper_cuda.h).
 */
int main()
{
    /* Size in bytes of one signal buffer, shared by every alloc and copy. */
    const size_t signalBytes = sizeof(cufftComplex) * SIGNAL_SIZE;

    /* The timer is started before any allocation, so the value printed at
     * the end covers host setup, transfers, the FFT and the printing, not
     * only the GPU work. */
    StopWatchInterface *hTimer = NULL;
    sdkCreateTimer(&hTimer);
    sdkResetTimer(&hTimer);
    sdkStartTimer(&hTimer);

    cufftComplex *d_signalsource = NULL;
    cufftComplex *d_signalresult = NULL;
    cufftComplex *h_signalsource = NULL;
    cufftComplex *h_signalresult = NULL;

    /* Allocate and initialise the host signal. */
    h_signalsource = (cufftComplex *)malloc(signalBytes);
    h_signalresult = (cufftComplex *)malloc(signalBytes);
    if (h_signalsource == NULL || h_signalresult == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n",
                (unsigned long)signalBytes);
        free(h_signalsource); /* free(NULL) is a no-op */
        free(h_signalresult);
        sdkDeleteTimer(&hTimer);
        return EXIT_FAILURE;
    }
    for (int i = 0; i < SIGNAL_SIZE; i++)
    {
        h_signalsource[i].x = (float)i;
        h_signalsource[i].y = (float)i;
    }

    /* Allocate the input and output signals on the GPU. */
    checkCudaErrors(cudaMalloc((void **)&d_signalsource, signalBytes));
    checkCudaErrors(cudaMalloc((void **)&d_signalresult, signalBytes));

    /* Copy the signal from host memory to device memory. */
    checkCudaErrors(cudaMemcpy(d_signalsource, h_signalsource, signalBytes,
                               cudaMemcpyHostToDevice));

    /* Create the cuFFT plan: one batch of SIGNAL_SIZE complex points. */
    cufftHandle plan;
    checkCudaErrors(cufftPlan1d(&plan, SIGNAL_SIZE, CUFFT_C2C, 1));

    /* Execute the forward transform out-of-place. */
    checkCudaErrors(cufftExecC2C(plan, d_signalsource, d_signalresult,
                                 CUFFT_FORWARD));

    /*
     * Results may not be immediately available, so block until all
     * previously issued device work has completed.
     */
    checkCudaErrors(cudaDeviceSynchronize());

    /* Copy the result from device memory back to host memory. */
    checkCudaErrors(cudaMemcpy(h_signalresult, d_signalresult, signalBytes,
                               cudaMemcpyDeviceToHost));

    /* Print the result. "%+f" emits the sign of the imaginary part itself,
     * so a negative value prints as "a-bi" instead of "a+-bi". */
    for (int i = 0; i < SIGNAL_SIZE; i++)
    {
        printf("%f%+fi\n", h_signalresult[i].x, h_signalresult[i].y);
    }

    /* Destroy the cuFFT plan and release device and host memory. */
    checkCudaErrors(cufftDestroy(plan));
    checkCudaErrors(cudaFree(d_signalsource));
    checkCudaErrors(cudaFree(d_signalresult));
    free(h_signalsource);
    free(h_signalresult);

    sdkStopTimer(&hTimer);
    /* NOTE(review): helper_timer.h presumably reports milliseconds - confirm. */
    double gpuTime = sdkGetTimerValue(&hTimer);
    sdkDeleteTimer(&hTimer);
    printf("gputime:%lf\n", gpuTime);

    /* Wait for a key press so the console output stays visible. */
    getchar();
    return 0;
}
内存和显存上的开辟空间,内容复制等步骤都是必须的。
关于batch:
比如上例中一维fft,信号长度为10点,把plan改写为
checkCudaErrors(cufftPlan1d(&plan,5,CUFFT_C2C,2));
就会并行地做前五点与后五点各自的5点fft。所以,若要同时做m个n点fft(复数),则输入信号需包含m*n个cufftComplex元素(即占用sizeof(cufftComplex)*m*n字节),各批次的数据依次连续存放,plan则是
cufftPlan1d(&plan,n,CUFFT_C2C,m)
关于batch的限制等等,待查文档