雷达信号处理算法GPU加速（含完整代码）

后来居上_m
已于 2024-07-24 16:09:58 修改
阅读量1.9k
点赞数 6
分类专栏：项目经验文章标签：算法信号处理 c语言经验分享 linux
于 2023-07-27 19:42:11 首次发布
本文链接：https://blog.csdn.net/weixin_45206081/article/details/131967183
版权
项目经验专栏收录该内容
7 篇文章
订阅专栏
雷达信号处理完整流程 GPU加速系列文章目录

基于CUDA对信号处理算法加速优化（优化，pc,mti,mtd,cfar,转置）

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cufft.h>
#include"cuda_runtime_api.h"
#include"device_launch_parameters.h"
/*#include "sys/time.h"
double what_time_is_it_now() {
    struct timeval time;
    if (gettimeofday(&time, NULL)) {
        return 0;
    }
    return (double)time.tv_sec + (double)time.tv_usec * .000001;
}
*/

#define BATCH 1
#define SIZE 8192 * 64 * 16

#define M 8192
#define N 64 * 16
#define block_size 16

__device__ int  N_prot_R = 1;
__device__ int N_ref_R = 2;
__device__ float alpha = 3; 
__device__ int N_prot_F = 25;
__device__ int N_ref_F = 150;
__device__ float alpha1 = 1;

__global__ void complexMulKernel(const cuComplex* r_sf, const cuComplex* r_hf, cuComplex* sot1, int num) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num) {
        sot1[i] = cuCmulf(r_sf[i], r_hf[i]);
    }
}

__global__ void cuCmulfKernel(cufftComplex* a, int sizeA, cufftComplex* b, int sizeB, cufftComplex* res) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;

    int bIndex = index % sizeB;
    res[index] = cuCmulf(a[index], b[bIndex]);
}

__global__ void scalePCResult(cufftComplex* d_PC_Result, int length) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < length) {
        d_PC_Result[index].x = d_PC_Result[index].x / 8192.0f;
        d_PC_Result[index].y = d_PC_Result[index].y / 8192.0f;
    }
}

__global__ void subtractMti(cufftComplex* a, cufftComplex* b, int numRows , int numCols) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;

    if (index < numRows - 1) {
        for (int j = 0; j < numCols; j++) {
            int idx = index * numCols + j;
            b[idx] = { a[(index + 1) * numCols + j].x - a[index * numCols + j].x, a[(index + 1) * numCols + j].y - a[index * numCols + j].y };
        }
    }
}
/*-----------------转为行优先-----------*/
__global__ void convertColumnToRow(cufftComplex* d_PC_Result, cufftComplex* d_PC_Result_Row, int numColumns, int numRows) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < numColumns * numRows) {
        int row = tid / numColumns;
        int col = tid % numColumns;

        int index = row + col * numRows;
        d_PC_Result_Row[tid] = d_PC_Result[index];
    }
}
/*-----------------转为列优先-----------*/
__global__ void convertRowToColumn(cufftComplex* d_PC_Result, cufftComplex* d_PC_Result_Row, int numColumns, int numRows) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < numColumns * numRows) {
        int col = tid / numRows;
        int row = tid % numRows; //1 2 3

        int index = col + row * numColumns;
        d_PC_Result_Row[tid] = d_PC_Result[index];
    }
}

__global__ void convertColumnToRow2(float* d_PC_Result, float* d_PC_Result_Row, int numColumns, int numRows) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < numColumns * numRows) {
        int row = tid / numColumns;
        int col = tid % numColumns;

        int index = row + col * numRows;
        d_PC_Result_Row[tid] = d_PC_Result[index];
    }
}

__global__ void fftshift(cufftComplex* r_sot, int arrayLength, int subArrayLength)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    int numSubArrays = arrayLength / subArrayLength;

    if (tid < numSubArrays)
    {
        int startIndex = tid * subArrayLength;
        int endIndex = startIndex + subArrayLength;

        // Swap elements within the subarray
        for (int j = 0; j < subArrayLength / 2; j++)
        {
            int index1 = startIndex + j;
            int index2 = endIndex - 1 - j;

            cufftComplex temp = r_sot[index1];
            r_sot[index1] = r_sot[index2];
            r_sot[index2] = temp;
        }
    }
}

__global__ void complexAbsKernel(cufftComplex* input, float* output, int num) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num) {
        float re = input[i].x;
        float im = input[i].y;
        output[i] = sqrtf(re * re + im * im);
    }
}


__global__ void subtractMti2(cufftComplex* a, cufftComplex* b, int numRows) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if ((tid + 1) % numRows != 0 && tid < numRows * 64) {  
        a[tid].x = b[tid + 1].x - b[tid].x;
        a[tid].y = b[tid + 1].y - b[tid].y;
    }
}

/*===============================CFAR========================*/
__global__ void cfar_2d_kernel(float* PC_data_ifft_CA_abs, float* threshold_R, int M_row, int N_point, int* dev_location_R) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < M_row && j < N_point) {

        if (j <= (N_prot_R + N_ref_R)) {
            float sum_R = 0;
            for (int k = j + N_prot_R + 1; k <= j + N_prot_R + N_ref_R; k++) {
                sum_R += PC_data_ifft_CA_abs[k * M_row + i];
            }
            threshold_R[j * M_row + i] = sum_R / N_ref_R * alpha;

            if (PC_data_ifft_CA_abs[j * M_row + i] > (1 * threshold_R[j * M_row + i])) {
                dev_location_R[j * M_row + i] = 1;
            }
            else {
                dev_location_R[j * M_row + i] = 0;
            }
        }
        else if (j >= (N_point - N_prot_R - N_ref_R)) {
            float sum_R = 0;
            for (int k = j - N_prot_R - N_ref_R; k <= j - N_prot_R - 1; k++) {
                sum_R += PC_data_ifft_CA_abs[k * M_row + i];
            }
            threshold_R[j * M_row + i] = (sum_R / N_ref_R) * alpha;

            if (PC_data_ifft_CA_abs[j * M_row + i] > (1 * threshold_R[j * M_row + i])) {
                dev_location_R[j * M_row + i] = 1;
            }
            else {
                dev_location_R[j * M_row + i] = 0;
            }
        }
        else {
            float sum_R_left = 0;
            float sum_R_right = 0;
            for (int k = j - N_prot_R - N_ref_R; k <= j - N_prot_R - 1; k++) {
                sum_R_left += PC_data_ifft_CA_abs[k * M_row + i];
            }
            for (int k = j + N_prot_R + 1; k <= j + N_prot_R + N_ref_R; k++) {
                sum_R_right += PC_data_ifft_CA_abs[k * M_row + i];
            }
            threshold_R[j * M_row + i] = (sum_R_left + sum_R_right) / (N_ref_R * 2) * alpha;

            if (PC_data_ifft_CA_abs[j * M_row + i] > (1 * threshold_R[j * M_row + i])) {
                dev_location_R[j * M_row + i] = 1;
            }
            else {
                dev_location_R[j * M_row + i] = 0;
            }
        }
    }
}

__global__ void cfar_2d_kernel2(float* PC_data_ifft_CA_abs, int* location, float* threshold_F, int M_row, int N_point) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < M_row && j < N_point) {
        if (location[i + j * M_row] == 1 && PC_data_ifft_CA_abs[i + j * M_row] != 0) {
            if (i <= (N_prot_F + N_ref_F)) { //观察点过于偏左，左侧点数不足
                float sum_F = 0;
                for (int k = i + N_prot_F + 1; k <= i + N_prot_F + N_ref_F; k++) {
                    sum_F += PC_data_ifft_CA_abs[k + j * M_row];
                }
                threshold_F[i + j * M_row] = (sum_F / N_ref_F) * alpha; //观察点右侧参考点相加求平均
                if (PC_data_ifft_CA_abs[i + j * M_row] > (1 * threshold_F[i + j * M_row])) {
                    printf("%f\n", PC_data_ifft_CA_abs[i + j * M_row]);
                }
            }
            else if (i >= (M_row - N_prot_F - N_ref_F)) { //观察点过于偏右，右侧点数不足
                float sum_F = 0;
                for (int k = i - N_prot_F - N_ref_F; k <= i - N_prot_F - 1; k++) {
                    sum_F += PC_data_ifft_CA_abs[k + j * M_row];
                }
                threshold_F[i + j * M_row] = (sum_F / N_ref_F) * alpha; //观察点左侧参考点求平均
                if (PC_data_ifft_CA_abs[i + j * M_row] > (1 * threshold_F[i + j * M_row])) {
                    printf("%f\n", PC_data_ifft_CA_abs[i + j * M_row]);
                }
            }
            else { //观察点居中，左右点数均足够
                float sum_F_left = 0;
                float sum_F_right = 0;
                for (int k = i - N_prot_F - N_ref_F; k <= i - N_prot_F - 1; k++) {
                    sum_F_left += PC_data_ifft_CA_abs[k + j * M_row];
                }
                for (int k = i + N_prot_F + 1; k <= i + N_prot_F + N_ref_F; k++) {
                    sum_F_right += PC_data_ifft_CA_abs[k + j * M_row];
                }
                threshold_F[i + j * M_row] = (fmax(sum_F_left, sum_F_right) / N_ref_F); //观察点两侧取大的一侧求均值
                //threshold_F[i + j * M_row] = ((sum_F_left + sum_F_right) / (N_ref_F) ) * alpha; //观察点两侧取大的一侧求均值
                if (PC_data_ifft_CA_abs[i + j * M_row] > (1 * threshold_F[i + j * M_row])) {
                    printf("%f\t i= %d j= %d\n", PC_data_ifft_CA_abs[i + j * M_row], i, j);
                    //printf("\n", i, j);
                }
            }
        }
        else {
            threshold_F[i + j * M_row] = 0;
        }
    }
}

int main() {
    /*----------------------start     读入DBF数据 ------------------*/
    cufftComplex* DBF_Result = NULL; // 定义指向cufftComplex类型的指针
    cudaMallocHost((void**)&DBF_Result, M * N * sizeof(cufftComplex)); // 申请内存空间

    int size = 0; // 定义数组长度变量
    int i = 0; // 计数器
    FILE* fp = fopen("F:/xxxx工作/radar8192_64模拟/DBF_Result.txt", "r"); // 打开文件
    if (fp == NULL) { // 判断文件是否打开成功
        printf("文件打开失败！\n");
        return 1; // 返回错误码1表示程序异常结束
    }

    while (fscanf(fp, "%*f %*f") != EOF) { // 统计文件中的数据数量
        size++; // 数组长度加1
    }

    rewind(fp); // 将文件指针重新指向文件开头

    for (i = 0; i < size; i++) { // 读取文件中的数据到数组中
        fscanf(fp, "%f %f", &DBF_Result[i].x, &DBF_Result[i].y);
    }

    fclose(fp); // 关闭文件


    for (int i = M * 64; i < M * N; i++) {
        
        DBF_Result[i].x = 0;
        DBF_Result[i].y = 0;   
    }
    printf("aaaaa= %f %f\n", DBF_Result[8192 * 64 * 13].x, DBF_Result[8192 * 66].y);

/*----------------------读入窗函数匹配滤波 h------------------*/
    cufftComplex* h = NULL; // 定义指向cufftComplex类型的指针
    cudaMallocHost((void**)&h, 420 * sizeof(cufftComplex)); // 申请内存空间

    size = 0; // 定义数组长度变量
    i = 0; // 计数器
    fp = fopen("F:/xxxx工作/radar8192_64模拟/h.txt", "r"); // 打开文件
    if (fp == NULL) { // 判断文件是否打开成功
        printf("文件打开失败！\n");
        return 1; // 返回错误码1表示程序异常结束
    }

    while (fscanf(fp, "%*f %*f") != EOF) { // 统计文件中的数据数量
        size++; // 数组长度加1
    }

    rewind(fp); // 将文件指针重新指向文件开头


    for (i = 0; i < size; i++) { // 读取文件中的数据到数组中
        fscanf(fp, "%f %f", &h[i].x, &h[i].y);
    }

    fclose(fp); // 关闭文件


/*----------------------------------end-------------------------*/
/*----------------------------------------------------------Project START-------------------------------------------------------------*/
    double start1, end1, start2, end2;
    //pc
    cufftComplex* d_h_out;
    cufftComplex* d_DBF_out;
    cufftComplex* d_cmul_out;
    cufftComplex* d_PC_Result;
    cufftComplex* d_PC_Result_Scale;
    cufftComplex* d_h;
    cufftComplex* d_DBF_Result;
    cudaMalloc((void**)&d_h_out, SIZE * BATCH * sizeof(cufftComplex));
    cudaMalloc((void**)&d_DBF_out, SIZE * BATCH * sizeof(cufftComplex));
    cudaMalloc((void**)&d_cmul_out, SIZE * BATCH * sizeof(cufftComplex));
    cudaMalloc((void**)&d_PC_Result, SIZE * BATCH * sizeof(cufftComplex));
    cudaMalloc((void**)&d_PC_Result_Scale, SIZE * BATCH * sizeof(cufftComplex));
    cudaMalloc((void**)&d_h, SIZE * BATCH * sizeof(cufftComplex));
    cudaMalloc((void**)&d_DBF_Result, SIZE * BATCH * sizeof(cufftComplex));

    cudaMemcpy(d_h, h, 420 * sizeof(cufftComplex), cudaMemcpyHostToDevice);
    cudaMemcpy(d_DBF_Result, DBF_Result, SIZE * BATCH * sizeof(cufftComplex), cudaMemcpyHostToDevice);
    // Create plans
    cufftHandle plan_h, plan_DBF_Result, plan_PC_Result;
    cufftPlan1d(&plan_h, M, CUFFT_C2C, 1);
    cufftPlan1d(&plan_DBF_Result, M, CUFFT_C2C, N);
    cufftPlan1d(&plan_PC_Result, M, CUFFT_C2C, N);
    // 定义grid和block的大小
    int blockSize = block_size * block_size;
    int gridSize = (SIZE + blockSize - 1) / blockSize;
    //mti
    cufftComplex* d_trans;
    cufftComplex* d_mti;
    cudaMalloc((void**)&d_trans, SIZE * BATCH * sizeof(cufftComplex));
    cudaMalloc((void**)&d_mti, SIZE * BATCH * sizeof(cufftComplex));

    //mtd
    cufftComplex* d_trans2;
    cudaMalloc((void**)&d_trans2, SIZE * BATCH * sizeof(cufftComplex));
    cufftComplex* d_mtd;
    cudaMalloc((void**)&d_mtd, SIZE * BATCH * sizeof(cufftComplex));
    cufftHandle plan_mtd;
    cufftPlan1d(&plan_mtd, M-1, CUFFT_C2C, N);
    cufftComplex* d_trans3;
    cudaMalloc((void**)&d_trans3, SIZE * BATCH * sizeof(cufftComplex));
    //cfar/
    float* d_PC_data_ifft_CA_abs;
    cudaMalloc((void**)&d_PC_data_ifft_CA_abs, SIZE * BATCH * sizeof(float));
    int M_row = M-1;
    int N_point = N;
    float* threshold_R = NULL;
    cudaMallocHost((void**)&threshold_R, M_row * N_point * sizeof(float));
    float* dev_threshold_R;
    cudaMalloc((void**)&dev_threshold_R, M_row * N_point * sizeof(float));
    int* dev_location_R = NULL;
    cudaMalloc((void**)&dev_location_R, M_row * N_point * sizeof(int));

    dim3 threadsPerBlock(block_size, block_size);   //block_size * block_size个线程, 因为双循环用到了threadIdx.x和threadIdx.y，所以用dim3
    dim3 numBlocks((M_row + threadsPerBlock.x - 1) / threadsPerBlock.x, (N_point + threadsPerBlock.y - 1) / threadsPerBlock.y); //正好包含所有数据

    float* threshold_F = NULL;
    cudaMallocHost((void**)&threshold_F, M_row * N_point * sizeof(float));
    float* dev_threshold_F;
    cudaMalloc((void**)&dev_threshold_F, M_row * N_point * sizeof(float));
    

    /*----------------------------------------------------------PC START-------------------------------------------------------------*/
        // forward FFT ------------------------------
    cufftExecC2C(plan_h, d_h, d_h_out, CUFFT_FORWARD);

    //*************************************time start*******************************
 //   start1 = what_time_is_it_now();
    cufftExecC2C(plan_DBF_Result, d_DBF_Result, d_DBF_out, CUFFT_FORWARD);
    cudaDeviceSynchronize();
    // kernel函数
    cuCmulfKernel << <gridSize, blockSize >> > (d_DBF_out, SIZE * BATCH, d_h_out, 8192, d_cmul_out);
    cudaDeviceSynchronize();

    cufftExecC2C(plan_PC_Result, d_cmul_out, d_PC_Result, CUFFT_INVERSE);   //---------ifft
    cudaDeviceSynchronize();

    scalePCResult << <gridSize, blockSize >> > (d_PC_Result, SIZE * BATCH); //为了与matlab的 ifft答案一致，需要除以计算长度 8192 
    cudaDeviceSynchronize();
    /*----------------------------------------------------------PC END-------------------------------------------------------------*/
    /*---------------------------------------------------------MTI START-------------------------------------------------------------*/

   // convertColumnToRow << <gridSize, blockSize >> > (d_PC_Result, d_trans, N, M);   //下一行减去上一行，需要把数据转为行优先!!!
  //  cudaDeviceSynchronize();

    subtractMti2 << <gridSize, blockSize >> > (d_mti, d_PC_Result, 8192);
    cudaDeviceSynchronize();

   
    /*---------------------------------------------------------MTI END-------------------------------------------------------------*/
    /*---------------------------------------------------------MTD START-------------------------------------------------------------*/

   // convertRowToColumn << <gridSize, blockSize >> > (d_mti, d_trans2, N, M - 1);   //mtd对每一列fft，需要把数据转为列优先!!!
  //  cudaDeviceSynchronize();

    cufftComplex* r_cmul = (cufftComplex*)malloc(SIZE * BATCH * sizeof(cufftComplex));
    cudaMemcpy(r_cmul, d_trans2, SIZE * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
  /*  for (int i = 0; i < 8196; i++)
    {
        printf("%d\t", i % 8192);
        printf("(%f, %f)\n", r_cmul[i].x, r_cmul[i].y);
    }*/

    cufftExecC2C(plan_mtd, d_mti, d_mtd, CUFFT_FORWARD);
    cudaDeviceSynchronize();

   /// convertColumnToRow << <gridSize, blockSize >> > (d_mtd, d_trans3, N, M - 1);   //需要把数据转为行优先, CFAR要用!!!
    //cudaDeviceSynchronize();
    /*---------------------------------------------------------MTD END-------------------------------------------------------------*/
    /*-------------------------------------------------------CFAR START-------------------------------------------------------------*/

        /*ABS*/
    complexAbsKernel << <gridSize, blockSize >> > (d_mtd, d_PC_data_ifft_CA_abs, SIZE * BATCH);
    cudaDeviceSynchronize();
    /*CFAR*/
    cfar_2d_kernel << <numBlocks, threadsPerBlock >> > (d_PC_data_ifft_CA_abs, dev_threshold_R, M_row, N_point, dev_location_R);
    cudaDeviceSynchronize();

    cfar_2d_kernel2 << <numBlocks, threadsPerBlock >> > (d_PC_data_ifft_CA_abs, dev_location_R, dev_threshold_F, M_row, N_point);
    cudaDeviceSynchronize();
    /*-------------------------------------------------------CFAR END-------------------------------------------------------------*/
 //   end1 = what_time_is_it_now();
 //   printf("\nsignal processing time : %f   ms\n ", 1000 * (end1 - start1) / 1000);

    // 释放内存
    cudaFree(d_h_out);
    cudaFree(d_DBF_out);
    cudaFree(d_cmul_out);
    cudaFree(d_h);
    cudaFree(d_DBF_Result);
    //释放plan
    cufftDestroy(plan_h);
    cufftDestroy(plan_DBF_Result);
    cufftDestroy(plan_PC_Result);

    cudaFree(d_PC_data_ifft_CA_abs);
    cudaFree(d_PC_data_ifft_CA_abs);
    cudaFree(threshold_F);



    return 0;
}