基2 FFT 的运行速度比非基2 FFT 快。在 NVIDIA GPU 上使用 cuFFT 时也应尽量选用基2长度,因为使用 GPU 本来就是为了追求计算速度。这里测试了一个二维复数矩阵在基2长度和非基2长度下的 FFT 性能差别:只沿矩阵的一个维度做 FFT,另一个维度通过 cuFFT 的 batch 参数批量处理。测试数据尺寸:基2 FFT 为 512×16384,非基2 FFT 为 428×16384(428 = 4×107,含大素因子 107);测试数据用 MATLAB 生成。从我测试的结果来看,基2 FFT 比非基2 FFT 快约 5 倍。
- % Generate random input data for the cuFFT benchmark (done here in MATLAB;
- % the data could equally be generated inside the GPU program).
- close all;
- clear; clc;
- % 428-point complex signal stored as interleaved I/Q values (2 reals per
- % sample), replicated into 16384 identical columns.
- a1 = randn(428 * 2, 1);
- a2 = repmat(a1, 1, 16384);
- clear a1;
- % Same layout for the 512-point signal.
- b1 = randn(512 * 2, 1);
- b2 = repmat(b1, 1, 16384);
- clear b1;
- % Write both matrices to disk as single-precision floats (column by column)
- % so the C program can read them back.
- output_file_428 = 'D:/cufft_test/428.dat';
- output_file_512 = 'D:/cufft_test/512.dat';
- fid1 = fopen(output_file_428, 'w');
- fid2 = fopen(output_file_512, 'w');
- fwrite(fid1, a2, 'single');
- fwrite(fid2, b2, 'single');
- fclose(fid1);
- fclose(fid2);
GPU测试程序:
- #include <math.h>
- #include <stdlib.h>
- #include <time.h>
- #include <stdio.h>
- #include <memory.h>
- #include <string.h>
- #include <omp.h> //使用openmp开启了CPU并行
- using namespace std;
- //设备端头文件
- #include <cufft.h>
- #include <cutil_inline.h>
- // Complex sample type: x holds the I (real) part, y holds the Q (imaginary) part.
- typedef float2 Complex;
- // Total number of complex samples in each data set: FFT length x 16384 rows.
- #define datasize_428 (428 * 16384)
- #define datasize_512 (512 * 16384)
- // Locations of the test data files on disk.
- char data_file_428[200] = "D://cufft_test//428.dat";
- char data_file_512[200] = "D://cufft_test//512.dat";
- // Reads `count` complex samples from `path`, where they are stored as raw
- // interleaved float pairs (I, Q, I, Q, ...), and returns them as a newly
- // allocated Complex array (release with delete[]).
- // Returns NULL if the file cannot be opened or holds fewer than 2 * count floats.
- static Complex *load_iq_as_complex(const char *path, int count)
- {
-     const size_t float_count = 2 * (size_t)count;
-     printf("Now start to read float type echo data from disk\n");
-     FILE *fp = fopen(path, "rb");
-     if (fp == NULL)
-     {
-         printf("Open echo data file fails!\n");
-         return NULL;
-     }
-     float *iq = new float[float_count]; // staging buffer for the raw I/Q floats
-     const size_t nread = fread(iq, sizeof(float), float_count, fp);
-     fclose(fp); // close on every path, including a short read
-     if (nread != float_count)
-     {
-         printf("Read data from disk error, not completely!\n");
-         delete[] iq;
-         return NULL;
-     }
-     printf("Read echo data sucesses\n");
-     printf("now, start to transform the [I Q] data to complex type\n");
-     Complex *samples = new Complex[count];
-     // The parallel-for ends with an implicit barrier, so no extra barrier is needed.
-     #pragma omp parallel for num_threads(8)
-     for (int i = 0; i < count; i++)
-     {
-         samples[i].x = iq[2 * i];
-         samples[i].y = iq[2 * i + 1];
-     }
-     delete[] iq;
-     printf("transform the [I Q] echo data to complex type successes\n");
-     return samples;
- }
- // Runs `batch` in-place forward complex-to-complex FFTs of length `fft_len`
- // on the GPU. `host_data` must hold fft_len * batch samples; it is overwritten
- // with the transform result.
- // Returns the GPU time, in seconds, of the FFT execution only: allocation,
- // plan creation and host<->device copies are excluded, and a warm-up run
- // absorbs one-time start-up costs so that different sizes compare fairly.
- static double run_batched_fft(Complex *host_data, int fft_len, int batch)
- {
-     const size_t bytes = (size_t)fft_len * (size_t)batch * sizeof(Complex);
-     Complex *d_data = NULL;
-     cufftHandle plan;
-     cudaEvent_t ev_start, ev_stop;
-     float elapsed_ms = 0.0f;
-     // Allocate and zero the device buffer.
-     cutilSafeCall(cudaMalloc((void **)&d_data, bytes));
-     cutilSafeCall(cudaMemset((void *)d_data, 0, bytes));
-     cufftSafeCall(cufftPlan1d(&plan, fft_len, CUFFT_C2C, batch));
-     // Warm-up on the zeroed buffer; the real input is copied in afterwards,
-     // so this in-place run does not disturb the data being measured.
-     cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_FORWARD));
-     cutilSafeCall(cudaDeviceSynchronize());
-     // Copy the host signal to the device.
-     cutilSafeCall(cudaMemcpy(d_data, host_data, bytes, cudaMemcpyHostToDevice));
-     // cufftExecC2C is asynchronous with respect to the host, so time it with
-     // events recorded on the same (default) stream and wait for the stop event.
-     cutilSafeCall(cudaEventCreate(&ev_start));
-     cutilSafeCall(cudaEventCreate(&ev_stop));
-     cutilSafeCall(cudaEventRecord(ev_start, 0));
-     cufftSafeCall(cufftExecC2C(plan, (cufftComplex *)d_data, (cufftComplex *)d_data, CUFFT_FORWARD));
-     cutilSafeCall(cudaEventRecord(ev_stop, 0));
-     cutilSafeCall(cudaEventSynchronize(ev_stop));
-     cutilSafeCall(cudaEventElapsedTime(&elapsed_ms, ev_start, ev_stop));
-     // Copy the result back to the host.
-     cutilSafeCall(cudaMemcpy(host_data, d_data, bytes, cudaMemcpyDeviceToHost));
-     cutilSafeCall(cudaEventDestroy(ev_start));
-     cutilSafeCall(cudaEventDestroy(ev_stop));
-     cufftSafeCall(cufftDestroy(plan));
-     cutilSafeCall(cudaFree(d_data));
-     return elapsed_ms / 1000.0;
- }
- // Benchmark entry point: loads the 428-point and 512-point test data sets,
- // runs a batched forward FFT on each on the GPU, and prints the total run
- // time plus the FFT execution time of each size.
- // Returns 0 on success, 1 if either input file could not be loaded.
- int main(int argc, char* argv[])
- {
-     const int fft_len_428 = 428;
-     const int fft_len_512 = 512;
-     /* gets time of day and converts it to a structure */
-     time_t timer = time(NULL);
-     struct tm *tblock = localtime(&timer);
-     clock_t start = clock();
-     system("cls");
-     system("color 0a");
-     printf("Local time: %s", asctime(tblock));
-     // Select the device with the highest estimated GFLOPS and report it.
-     int devID;
-     cudaDeviceProp props;
-     cutilSafeCall(cudaSetDevice(cutGetMaxGflopsDeviceId()));
-     cutilSafeCall(cudaGetDevice(&devID));
-     cutilSafeCall(cudaGetDeviceProperties(&props, devID));
-     printf("Device %d: \"%s\" with Compute %d.%d capability\n", devID, props.name, props.major, props.minor);
-     // Load both data sets; stop instead of transforming uninitialised memory.
-     Complex *echodata_428 = load_iq_as_complex(data_file_428, datasize_428);
-     Complex *echodata_512 = load_iq_as_complex(data_file_512, datasize_512);
-     if (echodata_428 == NULL || echodata_512 == NULL)
-     {
-         delete[] echodata_428; // delete[] on NULL is a no-op
-         delete[] echodata_512;
-         getchar();
-         return 1;
-     }
-     // 428-point FFT
-     printf("start to do 428 points fft\n");
-     double duration1 = run_batched_fft(echodata_428, fft_len_428, datasize_428 / fft_len_428);
-     printf("428 points fft finishes\n");
-     // 512-point FFT
-     printf("start to do 512 points fft\n");
-     double duration2 = run_batched_fft(echodata_512, fft_len_512, datasize_512 / fft_len_512);
-     printf("512 points fft finishes\n");
-     clock_t end = clock();
-     double duration = double(end - start) / CLOCKS_PER_SEC;
-     printf("This Tool uses %f s\n", duration);
-     printf("428 points fft uses %f s\n", duration1);
-     printf("512 points fft uses %f s\n", duration2);
-     delete[] echodata_428;
-     delete[] echodata_512;
-     getchar(); // keep the console window open until a key is pressed
-     return 0;
- }