#include<stdlib.h>
#include<string.h>
#include<time.h>
#include<stdio.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include<Windows.h>
#include<math.h>
// Pi as a literal double constant; the original acos(-1) form re-evaluated
// the call at every expansion site (host and device).
#define PI 3.14159265358979323846
// Abort-on-error wrapper for CUDA runtime calls.
// Wrapped in do { } while (0) so it behaves as a single statement and is
// safe inside unbraced if/else (the original bare-brace form was not).
#define CHECK(call) \
do { \
const cudaError_t error = call; \
if (error != cudaSuccess) \
{ \
printf("Error: %s:%d, ", __FILE__, __LINE__); \
printf("code:%d, reason: %s\n", error, cudaGetErrorString(error)); \
exit(1); \
} \
} while (0)
// Wall-clock timer built on the Windows high-resolution performance counter.
// Usage: Start() ... End(); then Gap() or show() for the elapsed seconds.
class TimeUse {
public:
LARGE_INTEGER freq, tBegin, tEnd; // counter frequency and the two timestamps
TimeUse() {
// Counts-per-second; fixed at boot, so querying once in the ctor is enough.
QueryPerformanceFrequency(&freq);
}
// Record the start timestamp.
void Start() {
QueryPerformanceCounter(&tBegin);
}
// Record the end timestamp.
void End() {
QueryPerformanceCounter(&tEnd);
}
// Elapsed seconds between the last Start()/End() pair.
double Gap() {
return (double)(tEnd.QuadPart - tBegin.QuadPart) / (double)freq.QuadPart;
}
// Print the elapsed time; reuses Gap() instead of duplicating the formula.
void show() {
printf("cpu cost %lf s\n", Gap());
}
};
TimeUse counter; // global timer shared by the CPU and GPU FFT drivers
// Minimal complex number with value semantics; arithmetic is usable from both
// host and device code. Fix: the __device__ __host__ qualifiers were removed
// from the struct definition itself — they apply to functions, not to types,
// and the member operators already carry them.
struct Complex {
double x, y; // real and imaginary parts
__device__ __host__ Complex operator+(const Complex& b)const {
return { this->x + b.x,this->y + b.y };
}
__device__ __host__ Complex operator-(const Complex& b)const {
return { this->x - b.x,this->y - b.y };
}
// (a+bi)(c+di) = (ac - bd) + (ad + bc)i
__device__ __host__ Complex operator*(const Complex& b)const {
return { (this->x * b.x - this->y * b.y),(this->x * b.y + this->y * b.x) };
}
};
int tot; // padded length; a power of two (= 1 << bit)
int* rev; // host-side bit-reversal permutation table
int bit; // log2 of tot
Complex* data; // host-side sample buffer (tot elements)
int* gpurev; // device-side bit-reversal table
Complex* gpudata; // device-side sample buffer (tot elements)
// Debug dump of the host buffer, one "(re,im)," pair per element.
__host__ void cpuShowData() {
printf("本层数据:\n");
int idx = 0;
while (idx < tot) {
printf("(%lf,%lf),", data[idx].x, data[idx].y);
++idx;
}
puts("");
}
// Debug dump of a device buffer from inside a kernel (device printf is
// serialized and slow — debugging only).
__device__ void gpuShowData(Complex *gpudata,int tot) {
printf("gpu本层数据:\n");
int idx = 0;
while (idx < tot) {
printf("(%lf,%lf),", gpudata[idx].x, gpudata[idx].y);
++idx;
}
printf("\n");
}
// Round len up to the next power of two: sets bit = ceil(log2(len))
// and tot = 2^bit (the padded transform length).
__host__ void cpuGetTot(int len) {
for (bit = 0; (1 << bit) < len; ++bit) {
}
tot = 1 << bit;
}
// Allocate the host-side reversal table and data buffer (tot elements each).
// Fix: the original never checked malloc — for tot = 2^25 these are multi-
// hundred-megabyte requests that can plausibly fail.
__host__ void cpuFftMalloc() {
rev = (int*)malloc(sizeof(int) * tot);
data = (Complex*)malloc(sizeof(Complex) * tot);
if (rev == NULL || data == NULL) {
printf("Error: %s:%d, host allocation of %d elements failed\n", __FILE__, __LINE__, tot);
exit(1);
}
}
// Allocate the device-side buffers (tot elements each).
// Fix: cudaMalloc takes void** — the original (Complex**)/(int**) casts of
// pointers that already had those types were no-ops; use the canonical form.
__host__ void gpuFftMalloc() {
CHECK(cudaMalloc((void**)&gpudata, tot * sizeof(Complex)));
CHECK(cudaMalloc((void**)&gpurev, tot * sizeof(int)));
}
// Copy the host sample buffer into device memory (blocking).
__host__ void moveCputoGpu() {
const size_t bytes = (size_t)tot * sizeof(Complex);
CHECK(cudaMemcpy(gpudata, data, bytes, cudaMemcpyHostToDevice));
}
// Copy the device result back into the host buffer (blocking, so it also
// synchronizes with preceding work on the default stream).
__host__ void moveGputoCpu() {
const size_t bytes = (size_t)tot * sizeof(Complex);
CHECK(cudaMemcpy(data, gpudata, bytes, cudaMemcpyDeviceToHost));
}
// Fill the host buffer with the deterministic ramp data[i] = i + 0i.
// NOTE(review): despite the name, nothing random is produced — srand() seeds
// the RNG but rand() is never called; confirm whether random input was intended.
__host__ void cpuRandomData() {
srand(time(NULL));
for (int i = 0; i < tot; i++) {
data[i].x = i;
data[i].y = 0;
//printf("%lf,",data[i].x);
}//puts("");
}
// Release the device buffers.
// Fix: the original ignored cudaFree's return value — CHECK also surfaces
// sticky errors deferred from earlier kernel launches. Pointers are nulled
// so a stale handle cannot be reused after free.
__host__ void gpuFftFree() {
CHECK(cudaFree(gpurev));
CHECK(cudaFree(gpudata));
gpurev = NULL;
gpudata = NULL;
}
// Release the host buffers. Fix: null the globals after free so any later
// accidental use fails loudly instead of reading freed memory.
__host__ void cpuFftFree() {
free(rev);
free(data);
rev = NULL;
data = NULL;
}
// Build the bit-reversal table: rev[i] = i with its low `bit` bits reversed.
// Computed bit-by-bit (same result as the recurrence form, and mirrors the
// GPU kernel's approach).
__host__ void cpuReverse() {
for (int i = 0; i < tot; i++) {
int r = 0;
for (int b = 0; b < bit; b++) {
r |= ((i >> b) & 1) << (bit - 1 - b);
}
rev[i] = r;
}
}
// Kernel: fill the device-side bit-reversal table; thread idx writes
// gpurev[idx] = idx with its low `bit` bits reversed.
// NOTE(review): there is no bounds guard and no size parameter — the launch
// must supply at most tot threads or threads with idx >= tot write out of
// bounds; verify the launch configuration in gpuCallFft.
__global__ void gpuReverse(int *gpurev,int bit) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
gpurev[idx] = 0;
for (int i = 0; i < bit; i++) {
gpurev[idx] |= ((idx >> i) & 1) << (bit - 1 - i);
}
}
// Permute the host buffer into bit-reversed order. rev is an involution, so
// swapping only when i < rev[i] touches each pair exactly once.
__host__ void ajustSeq() {
for (int i = 0; i < tot; i++) {
int j = rev[i];
if (i < j) {
Complex tmp = data[i];
data[i] = data[j];
data[j] = tmp;
}
}
}
// Kernel: one thread per element, swap into bit-reversed order. Pairs
// (i, rev[i]) are disjoint (rev is an involution) and only the lower index
// performs the swap, so no synchronization is needed.
// NOTE(review): no bounds guard — the launch must not exceed tot threads.
__global__ void gpuAjust(Complex *gpudata,int *gpurev) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = gpurev[i];
if (i < j) {
Complex tmp = gpudata[i];
gpudata[i] = gpudata[j];
gpudata[j] = tmp;
}
}
// Iterative Cooley-Tukey butterflies over the bit-reversed buffer.
// inv = +1 for the forward transform, -1 for the inverse (scaling is done
// separately in cpuDiv). `half` is the half-size of the current sub-DFT.
__host__ void cpuButterfly(int inv) {
for (int half = 1; half < tot; half <<= 1) {
// Primitive root of unity for this stage: e^(inv * i*pi/half).
Complex wn = Complex{ cos(PI / half), inv * sin(PI / half) };
for (int base = 0; base < tot; base += half << 1) {
Complex w = Complex({ 1, 0 });
for (int k = 0; k < half; k++) {
Complex even = data[base + k];
Complex odd = w * data[base + k + half];
data[base + k] = even + odd;
data[base + k + half] = even - odd;
w = w * wn; // advance the twiddle factor
}
}
}
}
// Binary exponentiation over Complex multiplication: returns a^b for b >= 0.
// Used on-device to reconstruct per-thread twiddle factors.
__device__ Complex qmi(Complex a, int b) {
Complex result = { 1,0 };
for (; b; b >>= 1) {
if (b & 1) {
result = result * a;
}
a = a * a;
}
return result;
}
// Debug dump of a device buffer from within a kernel (debugging only).
__device__ void gpuShow(Complex *gpudata,int tot) {
int idx = 0;
while (idx < tot) {
printf("(%lf,%lf),",gpudata[idx].x,gpudata[idx].y);
++idx;
}
printf("\n");
}
// Kernel: one butterfly per thread for the stage with half-size `mid`.
// Thread t handles the pair (i, i + mid) where i = t/mid*mid*2 + t%mid,
// using twiddle factor w1^(t mod mid).
// Fix: added a bounds guard — there are exactly tot/2 butterflies per stage,
// and the `tot` parameter was previously passed but never used, so an
// overshooting launch wrote out of bounds.
__global__ void gpufft(Complex *gpudata, int mid, Complex w1, int tot) {
int t = blockIdx.x * blockDim.x + threadIdx.x;
if (t >= tot / 2) return; // only tot/2 butterflies exist at each stage
int i = t / mid * mid * 2 + t % mid; // index of the upper element of the pair
Complex wk = qmi(w1, t % mid); // reconstruct this thread's twiddle factor
Complex x = gpudata[i];
Complex y = wk * gpudata[i + mid];
gpudata[i] = x + y;
gpudata[i + mid] = x - y;
}
// Kernel: scale every element by 1/tot (final step of the inverse FFT).
// Fix: added the standard bounds guard — without it, threads in a partially
// filled last block wrote past the end of the buffer.
__global__ void gpuDiv(Complex *gpudata,int tot) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= tot) return;
gpudata[i].x /= tot;
gpudata[i].y /= tot;
}
// Debug helper: copy the device-side reversal table to the host and print it.
// Fix: the staging buffer `d` was leaked (malloc with no free) and the
// allocation was unchecked.
__host__ void gpuShowRev() {
int* d = (int*)malloc(sizeof(int) * tot);
if (d == NULL) return; // best-effort debug output; skip rather than crash
CHECK(cudaMemcpy(d, gpurev, tot * sizeof(int), cudaMemcpyDeviceToHost));
for (int i = 0; i < tot; i++) {
printf("%d ",d[i]);
}printf("\n");
free(d);
}
// Run the GPU FFT (inv = +1) or inverse FFT (inv = -1) over the first `len`
// host elements in `data`, padded to tot = 2^bit; the result is copied back
// into `data`. Timing covers allocation + transfer + transform.
// Fix: the original launch configuration was block(len % 1024) /
// grid(len / 1024 + 1) — for len = 20000000 that is only ~5M threads while
// tot = 2^25 = 33.5M, so most elements were never processed. The grid now
// covers exactly tot threads (and tot/2 for the butterfly stages, which the
// old block2(4) configuration covered only a tiny fraction of). Launches are
// followed by cudaGetLastError, and gpuDiv is synchronized before timing.
__host__ void gpuCallFft(int inv,int len) {
cudaDeviceReset();
counter.Start();
cpuGetTot(len);
gpuFftMalloc();
moveCputoGpu();
// tot is a power of two, so both divisions below are exact.
int threads = tot < 1024 ? tot : 1024;
dim3 block(threads);
dim3 grid(tot / threads);
gpuReverse <<<grid, block>>> (gpurev, bit);
CHECK(cudaGetLastError());
CHECK(cudaDeviceSynchronize());
gpuAjust <<<grid, block>>> (gpudata, gpurev);
CHECK(cudaGetLastError());
CHECK(cudaDeviceSynchronize());
if (tot > 1) {
// One thread per butterfly: tot/2 threads per stage.
int half = tot / 2;
int hthreads = half < 1024 ? half : 1024;
dim3 hblock(hthreads);
dim3 hgrid(half / hthreads);
for (int mid = 1; mid < tot; mid <<= 1) {
Complex w1 = Complex{ cos(PI / mid), inv * sin(PI / mid) };
gpufft <<<hgrid, hblock>>> (gpudata, mid, w1, tot);
CHECK(cudaGetLastError());
CHECK(cudaDeviceSynchronize());
}
}
if (inv == -1) {
gpuDiv <<<grid, block>>> (gpudata, tot);
CHECK(cudaGetLastError());
CHECK(cudaDeviceSynchronize()); // wait before stopping the timer
}
counter.End();
counter.show();
moveGputoCpu();
gpuFftFree();
}
// Normalize the host buffer by 1/tot after an inverse transform.
__host__ void cpuDiv() {
const double scale = (double)tot;
for (int i = 0; i < tot; ++i) {
data[i].x /= scale;
data[i].y /= scale;
}
}
// CPU reference FFT: allocate, fill with the deterministic ramp, permute
// into bit-reversed order, and run the forward butterflies. The timer covers
// allocation through transform; buffers are freed by the caller (cpuFftFree).
__host__ void cpuCallFft(int len) {
cpuGetTot(len);
counter.Start();
cpuFftMalloc();
cpuReverse();
cpuRandomData();
ajustSeq();
cpuButterfly(1); // forward transform only; inverse path left disabled
counter.End();
counter.show();
// Inverse verification (ajustSeq + cpuButterfly(-1) + cpuDiv) intentionally
// not run here.
}
// Entry point: run the CPU FFT and the GPU FFT over the same 20M-point input
// and print each one's elapsed time.
// Fix: removed the unused local `TimeUse tt` (the global `counter` does the
// timing) and added an explicit return.
int main(int argc, char** argv) {
const int N = 20000000; // sample count; padded internally to 2^25
cpuCallFft(N);
gpuCallFft(1, N);
cpuFftFree();
return 0;
}
// cpu and cuda fft
// (original article last published 2023-05-17 14:20:29)