雷达信号处理完整流程 GPU加速系列文章目录
基于CUDA对信号处理算法加速优化(优化,pc,mti,mtd,cfar,转置)
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cufft.h>
#include"cuda_runtime_api.h"
#include"device_launch_parameters.h"
/*#include "sys/time.h"
double what_time_is_it_now() {
struct timeval time;
if (gettimeofday(&time, NULL)) {
return 0;
}
return (double)time.tv_sec + (double)time.tv_usec * .000001;
}
*/
#define BATCH 1
#define SIZE 8192 * 64 * 16
#define M 8192
#define N 64 * 16
#define block_size 16
__device__ int N_prot_R = 1;
__device__ int N_ref_R = 2;
__device__ float alpha = 3;
__device__ int N_prot_F = 25;
__device__ int N_ref_F = 150;
__device__ float alpha1 = 1;
__global__ void complexMulKernel(const cuComplex* r_sf, const cuComplex* r_hf, cuComplex* sot1, int num) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < num) {
sot1[i] = cuCmulf(r_sf[i], r_hf[i]);
}
}
__global__ void cuCmulfKernel(cufftComplex* a, int sizeA, cufftComplex* b, int sizeB, cufftComplex* res) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int bIndex = index % sizeB;
res[index] = cuCmulf(a[index], b[bIndex]);
}
__global__ void scalePCResult(cufftComplex* d_PC_Result, int length) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < length) {
d_PC_Result[index].x = d_PC_Result[index].x / 8192.0f;
d_PC_Result[index].y = d_PC_Result[index].y / 8192.0f;
}
}
__global__ void subtractMti(cufftComplex* a, cufftComplex* b, int numRows , int numCols) {
int index = threadIdx.x + blockIdx.x * blockDim.x;
if (index < numRows - 1) {
for (int j = 0; j < numCols; j++) {
int idx = index * numCols + j;
b[idx] = { a[(index + 1) * numCols + j].x - a[index * numCols + j].x, a[(index + 1) * numCols + j].y - a[index * numCols + j].y };
}
}
}
/*-----------------转为行优先-----------*/
__global__ void convertColumnToRow(cufftComplex* d_PC_Result, cufftComplex* d_PC_Result_Row, int numColumns, int numRows) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < numColumns * numRows) {
int row = tid / numColumns;
int col = tid % numColumns;
int index = row + col * numRows;
d_PC_Result_Row[tid] = d_PC_Result[index];
}
}
/*-----------------转为列优先-----------*/
__global__ void convertRowToColumn(cufftComplex* d_PC_Result, cufftComplex* d_PC_Result_Row, int numColumns, int numRows) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < numColumns * numRows) {
int col = tid / numRows;
int row = tid % numRows; //1 2 3
int index = col + row * numColumns;
d_PC_Result_Row[tid] = d_PC_Result[index];
}
}
__global__ void convertColumnToRow2(float* d_PC_Result, float* d_PC_Result_Row, int numColumns, int numRows) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if (tid < numColumns * numRows) {
int row = tid / numColumns;
int col = tid % numColumns;
int index = row + col * numRows;
d_PC_Result_Row[tid] = d_PC_Result[index];
}
}
__global__ void fftshift(cufftComplex* r_sot, int arrayLength, int subArrayLength)
{
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int numSubArrays = arrayLength / subArrayLength;
if (tid < numSubArrays)
{
int startIndex = tid * subArrayLength;
int endIndex = startIndex + subArrayLength;
// Swap elements within the subarray
for (int j = 0; j < subArrayLength / 2; j++)
{
int index1 = startIndex + j;
int index2 = endIndex - 1 - j;
cufftComplex temp = r_sot[index1];
r_sot[index1] = r_sot[index2];
r_sot[index2] = temp;
}
}
}
__global__ void complexAbsKernel(cufftComplex* input, float* output, int num) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < num) {
float re = input[i].x;
float im = input[i].y;
output[i] = sqrtf(re * re + im * im);
}
}
__global__ void subtractMti2(cufftComplex* a, cufftComplex* b, int numRows) {
int tid = threadIdx.x + blockIdx.x * blockDim.x;
if ((tid + 1) % numRows != 0 && tid < numRows * 64) {
a[tid].x = b[tid + 1].x - b[tid].x;
a[tid].y = b[tid + 1].y - b[tid].y;
}
}
/*===============================CFAR========================*/
__global__ void cfar_2d_kernel(float* PC_data_ifft_CA_abs, float* threshold_R, int M_row, int N_point, int* dev_location_R) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < M_row && j < N_point) {
if (j <= (N_prot_R + N_ref_R)) {
float sum_R = 0;
for (int k = j + N_prot_R + 1; k <= j + N_prot_R + N_ref_R; k++) {
sum_R += PC_data_ifft_CA_abs[k * M_row + i];
}
threshold_R[j * M_row + i] = sum_R / N_ref_R * alpha;
if (PC_data_ifft_CA_abs[j * M_row + i] > (1 * threshold_R[j * M_row + i])) {
dev_location_R[j * M_row + i] = 1;
}
else {
dev_location_R[j * M_row + i] = 0;
}
}
else if (j >= (N_point - N_prot_R - N_ref_R)) {
float sum_R = 0;
for (int k = j - N_prot_R - N_ref_R; k <= j - N_prot_R - 1; k++) {
sum_R += PC_data_ifft_CA_abs[k * M_row + i];
}
threshold_R[j * M_row + i] = (sum_R / N_ref_R) * alpha;
if (PC_data_ifft_CA_abs[j * M_row + i] > (1 * threshold_R[j * M_row + i])) {
dev_location_R[j * M_row + i] = 1;
}
else {
dev_location_R[j * M_row + i] = 0;
}
}
else {
float sum_R_left = 0;
float sum_R_right = 0;
for (int k = j - N_prot_R - N_ref_R; k <= j - N_prot_R - 1; k++) {
sum_R_left += PC_data_ifft_CA_abs[k * M_row + i];
}
for (int k = j + N_prot_R + 1; k <= j + N_prot_R + N_ref_R; k++) {
sum_R_right += PC_data_ifft_CA_abs[k * M_row + i];
}
threshold_R[j * M_row + i] = (sum_R_left + sum_R_right) / (N_ref_R * 2) * alpha;
if (PC_data_ifft_CA_abs[j * M_row + i] > (1 * threshold_R[j * M_row + i])) {
dev_location_R[j * M_row + i] = 1;
}
else {
dev_location_R[j * M_row + i] = 0;
}
}
}
}
__global__ void cfar_2d_kernel2(float* PC_data_ifft_CA_abs, int* location, float* threshold_F, int M_row, int N_point) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < M_row && j < N_point) {
if (location[i + j * M_row] == 1 && PC_data_ifft_CA_abs[i + j * M_row] != 0) {
if (i <= (N_prot_F + N_ref_F)) { //观察点过于偏左,左侧点数不足
float sum_F = 0;
for (int k = i + N_prot_F + 1; k <= i + N_prot_F + N_ref_F; k++) {
sum_F += PC_data_ifft_CA_abs[k + j * M_row];
}
threshold_F[i + j * M_row] = (sum_F / N_ref_F) * alpha; //观察点右侧参考点相加求平均
if (PC_data_ifft_CA_abs[i + j * M_row] > (1 * threshold_F[i + j * M_row])) {
printf("%f\n", PC_data_ifft_CA_abs[i + j * M_row]);
}
}
else if (i >= (M_row - N_prot_F - N_ref_F)) { //观察点过于偏右,右侧点数不足
float sum_F = 0;
for (int k = i - N_prot_F - N_ref_F; k <= i - N_prot_F - 1; k++) {
sum_F += PC_data_ifft_CA_abs[k + j * M_row];
}
threshold_F[i + j * M_row] = (sum_F / N_ref_F) * alpha; //观察点左侧参考点求平均
if (PC_data_ifft_CA_abs[i + j * M_row] > (1 * threshold_F[i + j * M_row])) {
printf("%f\n", PC_data_ifft_CA_abs[i + j * M_row]);
}
}
else { //观察点居中,左右点数均足够
float sum_F_left = 0;
float sum_F_right = 0;
for (int k = i - N_prot_F - N_ref_F; k <= i - N_prot_F - 1; k++) {
sum_F_left += PC_data_ifft_CA_abs[k + j * M_row];
}
for (int k = i + N_prot_F + 1; k <= i + N_prot_F + N_ref_F; k++) {
sum_F_right += PC_data_ifft_CA_abs[k + j * M_row];
}
threshold_F[i + j * M_row] = (fmax(sum_F_left, sum_F_right) / N_ref_F); //观察点两侧取大的一侧求均值
//threshold_F[i + j * M_row] = ((sum_F_left + sum_F_right) / (N_ref_F) ) * alpha; //观察点两侧取大的一侧求均值
if (PC_data_ifft_CA_abs[i + j * M_row] > (1 * threshold_F[i + j * M_row])) {
printf("%f\t i= %d j= %d\n", PC_data_ifft_CA_abs[i + j * M_row], i, j);
//printf("\n", i, j);
}
}
}
else {
threshold_F[i + j * M_row] = 0;
}
}
}
int main() {
/*----------------------start 读入DBF数据 ------------------*/
cufftComplex* DBF_Result = NULL; // 定义指向cufftComplex类型的指针
cudaMallocHost((void**)&DBF_Result, M * N * sizeof(cufftComplex)); // 申请内存空间
int size = 0; // 定义数组长度变量
int i = 0; // 计数器
FILE* fp = fopen("F:/xxxx工作/radar8192_64模拟/DBF_Result.txt", "r"); // 打开文件
if (fp == NULL) { // 判断文件是否打开成功
printf("文件打开失败!\n");
return 1; // 返回错误码1表示程序异常结束
}
while (fscanf(fp, "%*f %*f") != EOF) { // 统计文件中的数据数量
size++; // 数组长度加1
}
rewind(fp); // 将文件指针重新指向文件开头
for (i = 0; i < size; i++) { // 读取文件中的数据到数组中
fscanf(fp, "%f %f", &DBF_Result[i].x, &DBF_Result[i].y);
}
fclose(fp); // 关闭文件
for (int i = M * 64; i < M * N; i++) {
DBF_Result[i].x = 0;
DBF_Result[i].y = 0;
}
printf("aaaaa= %f %f\n", DBF_Result[8192 * 64 * 13].x, DBF_Result[8192 * 66].y);
/*----------------------读入窗函数匹配滤波 h------------------*/
cufftComplex* h = NULL; // 定义指向cufftComplex类型的指针
cudaMallocHost((void**)&h, 420 * sizeof(cufftComplex)); // 申请内存空间
size = 0; // 定义数组长度变量
i = 0; // 计数器
fp = fopen("F:/xxxx工作/radar8192_64模拟/h.txt", "r"); // 打开文件
if (fp == NULL) { // 判断文件是否打开成功
printf("文件打开失败!\n");
return 1; // 返回错误码1表示程序异常结束
}
while (fscanf(fp, "%*f %*f") != EOF) { // 统计文件中的数据数量
size++; // 数组长度加1
}
rewind(fp); // 将文件指针重新指向文件开头
for (i = 0; i < size; i++) { // 读取文件中的数据到数组中
fscanf(fp, "%f %f", &h[i].x, &h[i].y);
}
fclose(fp); // 关闭文件
/*----------------------------------end-------------------------*/
/*----------------------------------------------------------Project START-------------------------------------------------------------*/
double start1, end1, start2, end2;
//pc
cufftComplex* d_h_out;
cufftComplex* d_DBF_out;
cufftComplex* d_cmul_out;
cufftComplex* d_PC_Result;
cufftComplex* d_PC_Result_Scale;
cufftComplex* d_h;
cufftComplex* d_DBF_Result;
cudaMalloc((void**)&d_h_out, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_DBF_out, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_cmul_out, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_PC_Result, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_PC_Result_Scale, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_h, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_DBF_Result, SIZE * BATCH * sizeof(cufftComplex));
cudaMemcpy(d_h, h, 420 * sizeof(cufftComplex), cudaMemcpyHostToDevice);
cudaMemcpy(d_DBF_Result, DBF_Result, SIZE * BATCH * sizeof(cufftComplex), cudaMemcpyHostToDevice);
// Create plans
cufftHandle plan_h, plan_DBF_Result, plan_PC_Result;
cufftPlan1d(&plan_h, M, CUFFT_C2C, 1);
cufftPlan1d(&plan_DBF_Result, M, CUFFT_C2C, N);
cufftPlan1d(&plan_PC_Result, M, CUFFT_C2C, N);
// 定义grid和block的大小
int blockSize = block_size * block_size;
int gridSize = (SIZE + blockSize - 1) / blockSize;
//mti
cufftComplex* d_trans;
cufftComplex* d_mti;
cudaMalloc((void**)&d_trans, SIZE * BATCH * sizeof(cufftComplex));
cudaMalloc((void**)&d_mti, SIZE * BATCH * sizeof(cufftComplex));
//mtd
cufftComplex* d_trans2;
cudaMalloc((void**)&d_trans2, SIZE * BATCH * sizeof(cufftComplex));
cufftComplex* d_mtd;
cudaMalloc((void**)&d_mtd, SIZE * BATCH * sizeof(cufftComplex));
cufftHandle plan_mtd;
cufftPlan1d(&plan_mtd, M-1, CUFFT_C2C, N);
cufftComplex* d_trans3;
cudaMalloc((void**)&d_trans3, SIZE * BATCH * sizeof(cufftComplex));
//cfar/
float* d_PC_data_ifft_CA_abs;
cudaMalloc((void**)&d_PC_data_ifft_CA_abs, SIZE * BATCH * sizeof(float));
int M_row = M-1;
int N_point = N;
float* threshold_R = NULL;
cudaMallocHost((void**)&threshold_R, M_row * N_point * sizeof(float));
float* dev_threshold_R;
cudaMalloc((void**)&dev_threshold_R, M_row * N_point * sizeof(float));
int* dev_location_R = NULL;
cudaMalloc((void**)&dev_location_R, M_row * N_point * sizeof(int));
dim3 threadsPerBlock(block_size, block_size); //block_size * block_size个线程, 因为双循环用到了threadIdx.x和threadIdx.y,所以用dim3
dim3 numBlocks((M_row + threadsPerBlock.x - 1) / threadsPerBlock.x, (N_point + threadsPerBlock.y - 1) / threadsPerBlock.y); //正好包含所有数据
float* threshold_F = NULL;
cudaMallocHost((void**)&threshold_F, M_row * N_point * sizeof(float));
float* dev_threshold_F;
cudaMalloc((void**)&dev_threshold_F, M_row * N_point * sizeof(float));
/*----------------------------------------------------------PC START-------------------------------------------------------------*/
// forward FFT ------------------------------
cufftExecC2C(plan_h, d_h, d_h_out, CUFFT_FORWARD);
//*************************************time start*******************************
// start1 = what_time_is_it_now();
cufftExecC2C(plan_DBF_Result, d_DBF_Result, d_DBF_out, CUFFT_FORWARD);
cudaDeviceSynchronize();
// kernel函数
cuCmulfKernel << <gridSize, blockSize >> > (d_DBF_out, SIZE * BATCH, d_h_out, 8192, d_cmul_out);
cudaDeviceSynchronize();
cufftExecC2C(plan_PC_Result, d_cmul_out, d_PC_Result, CUFFT_INVERSE); //---------ifft
cudaDeviceSynchronize();
scalePCResult << <gridSize, blockSize >> > (d_PC_Result, SIZE * BATCH); //为了与matlab的 ifft答案一致,需要除以计算长度 8192
cudaDeviceSynchronize();
/*----------------------------------------------------------PC END-------------------------------------------------------------*/
/*---------------------------------------------------------MTI START-------------------------------------------------------------*/
// convertColumnToRow << <gridSize, blockSize >> > (d_PC_Result, d_trans, N, M); //下一行减去上一行,需要把数据转为行优先!!!
// cudaDeviceSynchronize();
subtractMti2 << <gridSize, blockSize >> > (d_mti, d_PC_Result, 8192);
cudaDeviceSynchronize();
/*---------------------------------------------------------MTI END-------------------------------------------------------------*/
/*---------------------------------------------------------MTD START-------------------------------------------------------------*/
// convertRowToColumn << <gridSize, blockSize >> > (d_mti, d_trans2, N, M - 1); //mtd对每一列fft,需要把数据转为列优先!!!
// cudaDeviceSynchronize();
cufftComplex* r_cmul = (cufftComplex*)malloc(SIZE * BATCH * sizeof(cufftComplex));
cudaMemcpy(r_cmul, d_trans2, SIZE * sizeof(cufftComplex), cudaMemcpyDeviceToHost);
/* for (int i = 0; i < 8196; i++)
{
printf("%d\t", i % 8192);
printf("(%f, %f)\n", r_cmul[i].x, r_cmul[i].y);
}*/
cufftExecC2C(plan_mtd, d_mti, d_mtd, CUFFT_FORWARD);
cudaDeviceSynchronize();
/// convertColumnToRow << <gridSize, blockSize >> > (d_mtd, d_trans3, N, M - 1); //需要把数据转为行优先, CFAR要用!!!
//cudaDeviceSynchronize();
/*---------------------------------------------------------MTD END-------------------------------------------------------------*/
/*-------------------------------------------------------CFAR START-------------------------------------------------------------*/
/*ABS*/
complexAbsKernel << <gridSize, blockSize >> > (d_mtd, d_PC_data_ifft_CA_abs, SIZE * BATCH);
cudaDeviceSynchronize();
/*CFAR*/
cfar_2d_kernel << <numBlocks, threadsPerBlock >> > (d_PC_data_ifft_CA_abs, dev_threshold_R, M_row, N_point, dev_location_R);
cudaDeviceSynchronize();
cfar_2d_kernel2 << <numBlocks, threadsPerBlock >> > (d_PC_data_ifft_CA_abs, dev_location_R, dev_threshold_F, M_row, N_point);
cudaDeviceSynchronize();
/*-------------------------------------------------------CFAR END-------------------------------------------------------------*/
// end1 = what_time_is_it_now();
// printf("\nsignal processing time : %f ms\n ", 1000 * (end1 - start1) / 1000);
// 释放内存
cudaFree(d_h_out);
cudaFree(d_DBF_out);
cudaFree(d_cmul_out);
cudaFree(d_h);
cudaFree(d_DBF_Result);
//释放plan
cufftDestroy(plan_h);
cufftDestroy(plan_DBF_Result);
cufftDestroy(plan_PC_Result);
cudaFree(d_PC_data_ifft_CA_abs);
cudaFree(d_PC_data_ifft_CA_abs);
cudaFree(threshold_F);
return 0;
}
总结
工作记录,优化后加速明显提升。