上个星期在写常规窄带波束形成的CUDA加速代码时,发现每次CUDA程序运行的结果都不一致,检查了编程逻辑和方法,却找不到相关问题,一直到这周突然灵光乍现,感觉可能是线程分配的问题,一检查果然,解决了bug但是其深层次的原因还没有弄明白,这里先进行一个记录,以后深入问题再补充。
这是我一开始的代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <complex.h>
#include "cuComplex.h"
#include <typeinfo> //输出变量类型所需头文件
#include <time.h>
typedef cuDoubleComplex complexd;
using namespace std;
#define pi acos(-1)
#define CHECK(res) if(res!=cudaSuccess){exit(-1);}
int N = 70; //阵元个数
int Len = 2400; //采样点数
int theta_N = 1800; //角度步进数
__device__ __host__ complexd operator*(complexd a, complexd b) { return cuCmul(a,b); }
__device__ __host__ complexd operator+(complexd a, complexd b) { return cuCadd(a,b); }
__device__ __host__ complexd operator/(complexd a, complexd b) { return cuCdiv(a,b); }
__device__ __host__ complexd exp_(complexd arg)
{
complexd res; //定义一个复数exp
double s, c;
double e = exp(arg.x);
sincos(arg.y, &s, &c);
res.x = c * e;
res.y = s * e;
return res;
}
/* CUDA核函数 */
__global__ void beamforming_nb(complexd* sig_out, complexd* sig_in,complexd* time_delay,int theta_N,int Len,int N)
{
int row = threadIdx.y + blockDim.y * blockIdx.y;
int col = threadIdx.x + blockDim.x * blockIdx.x;
complexd temp;
complexd add_;
if (row < theta_N && col < Len)
{
for (int i = 0; i < N; i++)
{
//theta_N x Len大小的矩阵sig_out = time_delay矩阵的row行i列 X sig_in矩阵i行col列
add_ = time_delay[row * N + i] * sig_in[col + i * Len];
__syncthreads();
temp = temp + add_;
__syncthreads();
}
__syncthreads();
sig_out[row * Len + col] = temp;
__syncthreads();
temp.x = 0;
temp.y = 0;
__syncthreads();
}
}
int main(int argc, char ** argv)
{
/* 初始参数定义 */
const double c = 1500; //介质声速
//const int N = 70; //传感器数量
const double T = 1; //采样时长
const int FS = 2400; //采样频率
auto LEN = T*FS; //采样点数
double t[Len]; //时间长度
for(int i = 0;i<Len;i++)
{
t[i] = i/LEN;
//cout <<t[i]<<endl; //验证通过
}
const double f0 = 300; //频率为300
const double d = 0.27; //传感器间距为0.27m
const double deg2rad = pi/180; // cos是弧度
const double theta = 60; //目标方位角角度制
const double theta_rad = theta * deg2rad; //目标方位角弧度制
cudaError_t res;
/* 原始信号定义 */
complexd sig_[Len]; //原始信号列表,长度为:采样频率*采样时间
for(int i = 0;i<Len;i++)
{
complexd temp{0,2*pi*f0*t[i]};
sig_[i] = exp_(temp); //原始信号
//cout << cuCreal(sig_[i]) << '+' << cuCimag(sig_[i])<< 'i' <<'\n'; //验证通过
}
/* 加入驾驶向量 */
complexd sig[N*Len]; //定义未加入噪声信号
for(int i = 0;i<N;i++)
{
complexd steer{0,2*pi*f0*cos(theta_rad)*i*d/c}; //驾驶向量
// cout << cuCreal(exp_(steer)) << '+' << cuCimag(exp_(steer))<< 'i' << endl; //验证通过
for(int j = 0;j<Len;j++)
{
sig[i*Len+j] = sig_[j] * exp_(steer); //未加入噪声信号.N*Len
// cout << cuCreal(sig[i*Len+j]) << '+' << cuCimag(sig[i*Len+j])<< 'i' <<'\n'; //验证通过
}
}
/* 加入噪声 */
/* 计算并加入theta_stp */
double theta_n = 1800;
complexd t_delay[theta_N*N];
for(int i = 0;i<theta_N;i++)
{
for(int j = 0 ;j<N;j++)
{
complexd tao{0,j*2*pi*f0*cos(i*(180/theta_n)*deg2rad)*d/c};
// cout << j*2*pi*f0*cos(i*(180/theta_n)*deg2rad)*d/c << '\t' <<j <<endl; //验证通过
t_delay[i*N + j] = exp_(cuConj(tao)); //第i个角度下的N个补偿,最终得到theta_N*N矩阵
// cout << cuCreal(exp_(cuConj(tao))) << '+' << cuCimag(exp_(cuConj(tao)))<< 'i' <<'\n'; //验证通过
}
}
/* CPU计算 */
double ttt;
clock_t at, bt;
at = clock();
complexd *h_pt;
h_pt = (complexd *)malloc(theta_N*Len*sizeof(complexd));
complexd h_temp;
for(int i = 0 ; i < theta_N; i ++ )
{
for(int j = 0; j < Len ; j ++)
{
for(int k = 0 ; k <N; k ++)
{
h_temp = h_temp + t_delay[i * N + k] * sig[k * Len + j];
}
h_pt[i * Len + j] = h_temp;
h_temp.x = 0;
h_temp.y = 0;
// cout << cuCreal(h_pt[i * Len + j]) << '+' << cuCimag(h_pt[i * Len + j]) << 'i' << '\t' ;
}
// cout << endl;
}
bt = clock();
ttt = double(bt-at)/CLOCKS_PER_SEC;
cout << ttt << "s" << endl;
/* CUDA加速 */
complexd *sig_in;
complexd *time_delay;
complexd *sig_out;
res = cudaMalloc((void**)&sig_in,N*Len*sizeof(complexd));CHECK(res)
res = cudaMalloc((void**)&sig_out,theta_N*Len*sizeof(complexd));CHECK(res)
res = cudaMalloc((void**)&time_delay,theta_N*N*sizeof(complexd));CHECK(res)
res = cudaMemcpy(sig_in,sig,N*Len*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
res = cudaMemcpy(time_delay,t_delay,theta_N*N*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
dim3 threadsPerBlocks(32,32); //先尝试角度采样点个数与采样频率相同的情况
dim3 numBlocks((Len + threadsPerBlocks.x -1)/threadsPerBlocks.x,(theta_N + threadsPerBlocks.y -1)/threadsPerBlocks.y);
double tt;
clock_t a, b;
a = clock();
beamforming_nb<<<numBlocks,threadsPerBlocks>>>(sig_out,sig_in,time_delay,theta_N,Len,N);
cudaDeviceSynchronize();
b = clock();
tt = double(b-a)/CLOCKS_PER_SEC;
cout << tt << "s" << endl;
complexd *pt_sig = NULL; //定义输出列表
pt_sig = (complexd*)malloc(theta_N*Len*sizeof(complexd)); //输出信息
res = cudaMemcpy(pt_sig,sig_out,theta_N*Len*sizeof(complexd),cudaMemcpyDeviceToHost);CHECK(res)
bool is_right = true;
for(int i = 0 ; i <theta_N;i++)
{
for(int j = 0 ; j < Len;j++)
{
if(cuCreal(pt_sig[(i*Len+j)]) != cuCreal(h_pt[i * Len + j]) || cuCimag(pt_sig[(i*Len+j)]) != cuCimag(h_pt[i * Len + j]) )
{
is_right = false;
}
// cout << cuCreal(pt_sig[(i*Len+j)])<< '+' << cuCimag(pt_sig[(i*Len+j)]) << 'i' << '\t';
}
// cout << '\n';
}
printf("The result is %s!\n",is_right?"right":"false");
cout << cuCreal(pt_sig[(2399)])<< '+' << cuCimag(pt_sig[(2399)]) << 'i' << '\n';
cout << cuCreal(h_pt[(2400-1)])<< '+' << cuCimag(h_pt[(2400-1)]) << 'i' << '\n';
// -3.74895500 + 1.3897567i
cudaFree(sig_in);
cudaFree(time_delay);
cudaFree(sig_out);
free(pt_sig);
free(h_pt);
return 0;
}
在最后的几行:
cout << cuCreal(pt_sig[(2399)])<< '+' << cuCimag(pt_sig[(2399)]) << 'i' << '\n';
cout << cuCreal(h_pt[(2400-1)])<< '+' << cuCimag(h_pt[(2400-1)]) << 'i' << '\n';
我将CPU端计算的结果和GPU加速计算的结果输出,得到:
前两行是CPU端计算和GPU端计算的耗时对比,下面为计算结果单个数据对比。可见,在保证CPU端计算结果完全正确的前提下,可以判断GPU加速计算的结果存在错误。
在猜测是线程分配错误之后,我对线程块和网格大小的分配进行了调整:
1. 一开始,网格大小为:(Len+threadsPerBlocks.x-1)/threadsPerBlocks.x和(theta_N+threadsPerBlocks.y-1)/threadsPerBlocks.y,带入数值计算,分别为:(2400+32-1)/32 = 75.97,(1800+32-1)/32 = 57.22,由于dim3类型的三个参数都是int型,因此最终得到的数值为75和57,网格大小为(75,57,1)。
验算:75*32 =2400,57*32 = 1824,可以得知两个方向上的线程索引值都是大于我们最终想要计算的矩阵行列大小(2400和1800)的,且根据global函数中的if判定,可以限制线程的索引值,这一切看起来是多么的合理,但问题貌似就是出在这个地方。
2. 由于不知道BUG的具体原因,题主决定死马当活马医,尝试一下让线程索引值和想要计算的矩阵行列大小恰好一样。因此,将线程块的大小,即threadPerBlocks的值设置为(8,8),刚好能够被1800和2400除尽,1800/8 = 225,2400/8 = 300,进行编译,运行:
可以看到,单个输出是对的了,现在验证所有的输出,增加一个bool类型,代码如下:
bool is_right = true;
for(int i = 0 ; i < theta_N;i++)
{
for(int j =0; j < Len;j++)
{
if((cuCreal(pt_sig[i*Len+j]) - cuCreal(h_pt[(i*Len+j)])) > 1e-9 || (cuCimag(pt_sig[(i*Len+j)]) - cuCimag(h_pt[(i*Len+j)])) > 1e-9)
{
is_right = false;
}
}
}
printf("The result is %s.",is_right? "right":"false");
这里的代码逻辑是分别计算结果实部和虚部的差值,如果插值大于1x10-9,则认定存在误差。将该部分代码放在整个代码的结尾 ,即可输出CPU端和GPU端计算对比结果。
除修改了线程块的大小,还修改了部分编程细节,包括global函数代码段,CPU端计算代码段,最终代码如下:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <complex.h>
#include "cuComplex.h"
#include <typeinfo> //输出变量类型所需头文件
#include <time.h>
typedef cuDoubleComplex complexd;
using namespace std;
#define pi acos(-1)
#define CHECK(res) if(res!=cudaSuccess){exit(-1);}
int N = 70; //阵元个数
int Len = 2400; //采样点数
int theta_N = 1800; //角度步进数
__device__ __host__ complexd operator*(complexd a, complexd b) { return cuCmul(a,b); }
__device__ __host__ complexd operator+(complexd a, complexd b) { return cuCadd(a,b); }
__device__ __host__ complexd operator/(complexd a, complexd b) { return cuCdiv(a,b); }
__device__ __host__ complexd exp_(complexd arg)
{
complexd res; //定义一个复数
double s, c;
double e = exp(arg.x);
sincos(arg.y, &s, &c);
res.x = c * e;
res.y = s * e;
return res;
}
/* CUDA核函数 */
__global__ void beamforming_nb(complexd* sig_out, complexd* sig_in,complexd* time_delay,int theta_N,int Len,int N)
{
int row = threadIdx.y + blockDim.y * blockIdx.y;
int col = threadIdx.x + blockDim.x * blockIdx.x;
complexd temp{0,0};
complexd add_{0,0};
if (row < theta_N && col < Len)
{
for (int i = 0; i < N; i++)
{
//theta_N x Len大小的矩阵sig_out = time_delay矩阵的row行i列 X sig_in矩阵i行col列
temp = temp + time_delay[row * N + i] * sig_in[col + i * Len];
}
sig_out[row * Len + col] = temp;
temp = add_;
}
}
int main(int argc, char ** argv)
{
/* 初始参数定义 */
const double c = 1500; //介质声速
const double T = 1; //采样时长
const int FS = 2400; //采样频率
auto LEN = T*FS; //采样点数
double t[Len]; //时间长度
for(int i = 0;i<Len;i++)
{
t[i] = i/LEN;
//cout <<t[i]<<endl; //验证通过
}
const double f0 = 300; //频率为300
const double d = 0.27; //传感器间距为0.27m
const double deg2rad = pi/180; // cos是弧度
const double theta = 60; //目标方位角角度制
const double theta_rad = theta * deg2rad; //目标方位角弧度制
cudaError_t res;
/* 原始信号定义 */
complexd sig_[Len]; //原始信号列表,长度为:采样频率*采样时间
for(int i = 0;i<Len;i++)
{
complexd temp{0,2*pi*f0*t[i]};
sig_[i] = exp_(temp); //原始信号
//cout << cuCreal(sig_[i]) << '+' << cuCimag(sig_[i])<< 'i' <<'\n'; //验证通过
}
/* 加入驾驶向量 */
complexd sig[N*Len]; //定义未加入噪声信号
for(int i = 0;i<N;i++)
{
complexd steer{0,2*pi*f0*cos(theta_rad)*i*d/c}; //驾驶向量
// cout << cuCreal(exp_(steer)) << '+' << cuCimag(exp_(steer))<< 'i' << endl; //验证通过
for(int j = 0;j<Len;j++)
{
sig[i*Len+j] = sig_[j] * exp_(steer); //未加入噪声信号.N*Len
// cout << cuCreal(sig[i*Len+j]) << '+' << cuCimag(sig[i*Len+j])<< 'i' <<'\n'; //验证通过
}
}
/* 计算并加入theta_stp */
double theta_n = 1800;
complexd t_delay[theta_N*N];
for(int i = 0;i<theta_N;i++)
{
for(int j = 0 ;j<N;j++)
{
complexd tao{0,j*2*pi*f0*cos(i*(180/theta_n)*deg2rad)*d/c};
t_delay[i*N + j] = exp_(cuConj(tao)); //第i个角度下的N个补偿,最终得到theta_N*N矩阵
}
}
/* CPU计算 */
double ttt;
clock_t at, bt;
at = clock();
complexd *h_pt;
h_pt = (complexd *)malloc(theta_N*Len*sizeof(complexd));
complexd a_temp{0,0};
complexd h_temp{0,0};
for(int i = 0 ; i < theta_N; i ++ )
{
for(int j = 0; j < Len ; j ++)
{
for(int k = 0 ; k <N; k ++)
{
h_temp = h_temp + t_delay[i * N + k] * sig[k * Len + j];
}
h_pt[i * Len + j] = h_temp;
h_temp = a_temp;
}
}
bt = clock();
ttt = double(bt-at)/CLOCKS_PER_SEC;
cout << ttt << "s" << endl;
/* CUDA加速 */
complexd *sig_in;
complexd *time_delay;
complexd *sig_out;
res = cudaMalloc((void**)&sig_in,N*Len*sizeof(complexd));CHECK(res)
res = cudaMalloc((void**)&sig_out,theta_N*Len*sizeof(complexd));CHECK(res)
res = cudaMalloc((void**)&time_delay,theta_N*N*sizeof(complexd));CHECK(res)
res = cudaMemcpy(sig_in,sig,N*Len*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
res = cudaMemcpy(time_delay,t_delay,theta_N*N*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
dim3 threadsPerBlocks(8,8); //先尝试角度采样点个数与采样频率相同的情况
dim3 numBlocks((Len)/threadsPerBlocks.x,(theta_N)/threadsPerBlocks.y);
/*
Jetson Orin 模块包含以下内容:NVIDIA Ampere架构GPU,
具有多达2048个CUDA 核、多达64个Tensor核多达12个Arm A78AE CPU核
*/
double tt;
clock_t a, b;
a = clock();
beamforming_nb<<<numBlocks,threadsPerBlocks>>>(sig_out,sig_in,time_delay,theta_N,Len,N);
cudaDeviceSynchronize();
b = clock();
tt = double(b-a)/CLOCKS_PER_SEC;
cout << tt << "s" << endl;
complexd *pt_sig = NULL; //定义输出列表
pt_sig = (complexd*)malloc(theta_N*Len*sizeof(complexd)); //输出信息
res = cudaMemcpy(pt_sig,sig_out,theta_N*Len*sizeof(complexd),cudaMemcpyDeviceToHost);CHECK(res)
bool is_right = true;
for(int i = 0 ; i < theta_N;i++)
{
for(int j =0; j < Len;j++)
{
if((cuCreal(pt_sig[i*Len+j]) - cuCreal(h_pt[(i*Len+j)])) > 1e-9 || (cuCimag(pt_sig[(i*Len+j)]) - cuCimag(h_pt[(i*Len+j)])) > 1e-9)
{
is_right = false;
cout << "第" << i*Len + j <<"个是错误的。"<< endl;
cout << "GPU:" << cuCreal(pt_sig[(i*Len+j)])<< '+' << cuCimag(pt_sig[(i*Len+j)]) << 'i' << '\n';
cout << "CPU:" << cuCreal(h_pt[(i*Len+j)])<< '+' << cuCimag(h_pt[(i*Len+j)]) << 'i' << '\n';
}
}
}
printf("The result is %s.\n",is_right? "right":"false");
cout << cuCreal(pt_sig[(2399)])<< '+' << cuCimag(pt_sig[(2399)]) << 'i' << '\n';
cout << cuCreal(h_pt[(2400-1)])<< '+' << cuCimag(h_pt[(2400-1)]) << 'i' << '\n';
cudaFree(sig_in);
cudaFree(time_delay);
cudaFree(sig_out);
free(pt_sig);
free(h_pt);
return 0;
}
输出结果:
计算结果正确!
总结
1. 以目前的认知推断,使用GPU进行加速计算时,使用的线程索引数最好与需要计算的数据维度(一维长度、二维长宽)相等,这样才能保证计算结果的正确。
更新:
1. 在进一步对CUDA进行学习后,发现CUDA编写出现错误的主要原因一般都是线程块和网格的大小分配不正确,其大小分配应符合以下原则:
1)线程块大小、网格大小定义要符合dim3变量准则;
2)网格和线程块大小要符合设备状况,不能超过限制。
只要遵循这两个基本原则,按照正常语言逻辑编写的CUDA代码均能正常运行。