CPU and CUDA FFT
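
An iterative radix-2 Cooley-Tukey FFT implemented twice, once on the host CPU and once as CUDA kernels, with a simple wall-clock timing comparison between the two. Input lengths are padded up to the next power of two.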

#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <Windows.h>
#include <math.h>
#define PI acos(-1)
#define CHECK(call) \
{ \
 const cudaError_t error = call; \
 if (error != cudaSuccess) \
 { \
 printf("Error: %s:%d, ", __FILE__, __LINE__); \
 printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
 exit(1); \
 } \
}


class TimeUse {
public:
	LARGE_INTEGER freq, tBegin, tEnd;

	TimeUse() {
		QueryPerformanceFrequency(&freq);
	}
	void Start() {
		QueryPerformanceCounter(&tBegin);
	}
	void End() {
		QueryPerformanceCounter(&tEnd);
	}
	double Gap() {
		auto time = (double)(tEnd.QuadPart - tBegin.QuadPart) / (double)freq.QuadPart;
		return time;
	}
	void show() {
		auto time = (double)(tEnd.QuadPart - tBegin.QuadPart) / (double)freq.QuadPart;
		printf("cpu cost %lf s\n", time);
	}
};
TimeUse counter;
struct Complex { // __device__/__host__ qualifiers belong on the member functions, not the type
	double x, y;
	//Complex(double x,double y){this->x=x,this->y=y;}
	__device__ __host__ Complex operator+(const Complex& b)const {
		return { this->x + b.x,this->y + b.y };
	}
	__device__ __host__ Complex operator-(const Complex& b)const {
		return { this->x - b.x,this->y - b.y };
	}
	__device__ __host__ Complex operator*(const Complex& b)const {
		return { (this->x * b.x - this->y * b.y),(this->x * b.y + this->y * b.x) };
	}
};


int tot; // total length, padded to a power of two
int* rev; // bit-reversal permutation
int bit; // tot == 1 << bit
Complex* data;
int* gpurev;
Complex* gpudata;

__host__ void cpuShowData() {
	printf("本层数据:\n");
	for (int i = 0; i < tot; i++) {
		printf("(%lf,%lf),", data[i].x, data[i].y);
	}puts("");
}
__device__ void gpuShowData(Complex *gpudata,int tot) {
	printf("gpu本层数据:\n");
	for (int i = 0; i < tot; i++) {
		printf("(%lf,%lf),", gpudata[i].x, gpudata[i].y);
	}printf("\n");
}
__host__ void cpuGetTot(int len) { // compute tot (next power of two >= len) and bit
	bit = 0;
	while (1 << bit < len)bit++;
	tot = 1 << bit;
}

__host__ void cpuFftMalloc() {
	rev = (int*)malloc(sizeof(int) * tot);
	data = (Complex*)malloc(sizeof(Complex) * tot);
}
__host__ void gpuFftMalloc() {
	CHECK(cudaMalloc((void**)&gpudata, tot * sizeof(Complex)));
	CHECK(cudaMalloc((void**)&gpurev, tot * sizeof(int)));
}
__host__ void moveCputoGpu() {
	CHECK(cudaMemcpy(gpudata, data, tot*sizeof(Complex), cudaMemcpyHostToDevice));
}
__host__ void moveGputoCpu() {
	CHECK(cudaMemcpy(data,gpudata, tot*sizeof(Complex), cudaMemcpyDeviceToHost));
}

__host__ void cpuRandomData() {
	// despite the name, this fills data with a deterministic ramp: (i, 0)
	for (int i = 0; i < tot; i++) {
		data[i].x = i;
		data[i].y = 0;
	}
}

__host__ void gpuFftFree() {
	cudaFree(gpurev);
	cudaFree(gpudata);
}
__host__ void cpuFftFree() {
	free(rev);
	free(data);
}

__host__ void cpuReverse() {
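	// standard recurrence: shift rev[i>>1] right and move the low bit of i
	// to the top; e.g. bit = 3 yields rev = {0, 4, 2, 6, 1, 5, 3, 7}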
	rev[0] = 0;
	for (int i = 0; i < tot; i++) {
		rev[i] = ((rev[i >> 1] >> 1) | ((i & 1) << (bit - 1)));
	}
}

__global__ void gpuReverse(int *gpurev, int bit, int tot) {
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= tot) return; // guard: the grid may launch more threads than tot
	int r = 0;
	for (int i = 0; i < bit; i++) {
		r |= ((idx >> i) & 1) << (bit - 1 - i);
	}
	gpurev[idx] = r;
}

__host__ void adjustSeq() { // reorder data into bit-reversed order
	for (int i = 0; i < tot; i++) {
		if (i < rev[i]) { // swap each pair exactly once
			Complex tmp = data[i];
			data[i] = data[rev[i]];
			data[rev[i]] = tmp;
		}
	}
}


__global__ void gpuAdjust(Complex *gpudata, int *gpurev, int tot) {
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= tot) return; // guard: the grid may launch more threads than tot
	if (i < gpurev[i]) { // swap gpudata[i] and gpudata[gpurev[i]] once per pair
		Complex tmp = gpudata[i];
		gpudata[i] = gpudata[gpurev[i]];
		gpudata[gpurev[i]] = tmp;
	}
}
__host__ void cpuButterfly(int inv) { // iterative butterflies; inv = 1 forward, -1 inverse
	for (int mid = 1; mid < tot; mid <<= 1) {
		Complex w1 = Complex{ cos(PI / mid),inv * sin(PI / mid) };
		for (int i = 0; i < tot; i += (mid << 1)) {
			Complex wk = Complex({ 1,0 });
			for (int j = 0; j < mid; j++, wk = wk * w1) {
				Complex x = data[i + j];
				Complex y = wk * data[i + j + mid];
				data[i + j] = x + y;
				data[i + j + mid] = x - y;
			}
			//cpuShowData();
		}
	}
}
__device__ Complex qmi(Complex a, int b) { // binary exponentiation: a^b
	Complex ans = { 1,0 };
	while (b) {
		if (b & 1)ans = ans * a;
		a = a * a;
		b >>= 1;
	}
	return ans;
}
__global__ void gpufft(Complex *gpudata, int mid, Complex w1, int tot) {
	int idx = blockIdx.x * blockDim.x + threadIdx.x;
	if (idx >= tot / 2) return; // one thread per butterfly pair
	int i = idx / mid * mid * 2 + idx % mid; // index of the upper element of the pair
	Complex wk = qmi(w1, idx % mid); // twiddle factor w1^(idx mod mid)
	Complex x = gpudata[i];
	Complex y = wk * gpudata[i + mid];
	gpudata[i] = x + y;
	gpudata[i + mid] = x - y;
	//gpuShowData(gpudata,tot);
}
__global__ void gpuDiv(Complex *gpudata, int tot) {
	int i = blockIdx.x * blockDim.x + threadIdx.x;
	if (i >= tot) return; // guard: the grid may launch more threads than tot
	gpudata[i].x /= tot;
	gpudata[i].y /= tot;
}
__host__ void gpuShowRev() {
	int* d = (int*)malloc(sizeof(int) * tot);
	CHECK(cudaMemcpy(d, gpurev, tot * sizeof(int), cudaMemcpyDeviceToHost));
	for (int i = 0; i < tot; i++) {
		printf("%d ", d[i]);
	}printf("\n");
	free(d);
}
__host__ void gpuCallFft(int inv, int len) {

	cudaDeviceReset();
	counter.Start();
	cpuGetTot(len);
	gpuFftMalloc();
	moveCputoGpu();

	// one thread per element for reverse/adjust/div; tot is the padded
	// power of two, so the grid must cover tot threads, not len
	dim3 block(1024);
	dim3 grid((tot + block.x - 1) / block.x);
	// the butterfly kernel needs only one thread per pair (tot/2 per stage)
	dim3 gridHalf((tot / 2 + block.x - 1) / block.x);
	gpuReverse<<<grid, block>>>(gpurev, bit, tot);
	//gpuShowRev();
	CHECK(cudaDeviceSynchronize());
	gpuAdjust<<<grid, block>>>(gpudata, gpurev, tot);
	CHECK(cudaDeviceSynchronize());
	for (int mid = 1; mid < tot; mid <<= 1) {
		Complex w1 = Complex{ cos(PI / mid), inv * sin(PI / mid) };
		gpufft<<<gridHalf, block>>>(gpudata, mid, w1, tot);
		CHECK(cudaDeviceSynchronize());
	}
	if (inv == -1) gpuDiv<<<grid, block>>>(gpudata, tot);
	CHECK(cudaDeviceSynchronize()); // make sure all kernels finish before timing stops
	counter.End();
	counter.show();
	moveGputoCpu();
	gpuFftFree();
}


__host__ void cpuDiv() { // normalize after the inverse transform
	for (int i = 0; i < tot; i++) {
		data[i].x /= tot;
		data[i].y /= tot;
	}
}
__host__ void cpuCallFft(int len) { // forward FFT on the CPU, with timing
	cpuGetTot(len);
	counter.Start();
	cpuFftMalloc();

	cpuReverse();
	cpuRandomData();
	adjustSeq();
	cpuButterfly(1);
	counter.End();
	counter.show();
	//cpuShowData();
	//adjustSeq();
	//cpuButterfly(-1);
	//cpuDiv();
	//cpuShowData();
	//cpuFftFree();
}

int main(int argc, char** argv) {
	cpuCallFft(20000000);

	//cpuShowData();
	// note: data still holds the CPU result here, so the GPU pass below
	// transforms already-transformed data; the run is for timing comparison
	gpuCallFft(1, 20000000);
	//gpuCallFft(-1, 100000);

	//cpuShowData();
	cpuFftFree();
}
This document describes CUFFT, the NVIDIA® CUDA™ Fast Fourier Transform (FFT) library. The FFT is a divide-and-conquer algorithm for efficiently computing discrete Fourier transforms of complex or real-valued data sets. It is one of the most important and widely used numerical algorithms in computational physics and general signal processing. The CUFFT library provides a simple interface for computing parallel FFTs on an NVIDIA GPU, which allows users to leverage the floating-point power and parallelism of the GPU without having to develop a custom CUDA FFT implementation. FFT libraries typically vary in terms of supported transform sizes and data types. For example, some libraries only implement radix-2 FFTs, restricting the transform size to a power of two. The CUFFT library aims to support a wide range of FFT options efficiently on NVIDIA GPUs. This version of the CUFFT library supports the following features:

- Complex and real-valued input and output
- 1D, 2D, and 3D transforms
- Batch execution for doing multiple transforms of any dimension in parallel
- Transform sizes up to 64 million elements in single precision and up to 128 million elements in double precision in any dimension, limited by the available GPU memory
- In-place and out-of-place transforms
- Double precision (64-bit floating point) on compatible hardware (sm1.3 and later)
- Support for streamed execution, enabling asynchronous computation and data movement
- FFTW-compatible data layouts
- Arbitrary intra- and inter-dimension element strides
- Thread-safe API that can be called from multiple independent host threads
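
For comparison with the hand-rolled kernels above, a minimal sketch of the same kind of transform through cuFFT might look like the following. This assumes a 1D double-precision complex-to-complex plan; the size n and the in-place buffer are illustrative, and error checking is omitted for brevity.

#include <cufft.h>
#include <cuda_runtime.h>

int main(void) {
	const int n = 1 << 20; // transform size (cuFFT does not require a power of two)
	cufftDoubleComplex* d_signal;
	cudaMalloc((void**)&d_signal, n * sizeof(cufftDoubleComplex));
	// ... fill d_signal, e.g. with cudaMemcpy from the host ...

	cufftHandle plan;
	cufftPlan1d(&plan, n, CUFFT_Z2Z, 1); // 1D double-precision C2C, batch of 1
	cufftExecZ2Z(plan, d_signal, d_signal, CUFFT_FORWARD); // in-place forward
	cufftExecZ2Z(plan, d_signal, d_signal, CUFFT_INVERSE); // in-place inverse
	// cuFFT's inverse is unnormalized: divide by n to recover the input,
	// just like the gpuDiv step above
	cufftDestroy(plan);
	cudaFree(d_signal);
	return 0;
}

Linking needs the cuFFT library (e.g. -lcufft with nvcc).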