错误记录——CUDA程序运行结果不一致（线程块、网格大小设置问题）

大江东第一深情

已于 2023-08-29 22:22:15 修改

阅读量937

点赞数

文章标签：算法 linux ubuntu 学习矩阵

于 2023-06-01 17:11:56 首次发布

本文链接：https://blog.csdn.net/Hughie999/article/details/130981182

版权

上个星期在写常规窄带波束形成的CUDA加速代码时，发现每次CUDA程序运行的结果都不一致，检查了编程逻辑和方法，却找不到相关问题，一直到这周突然灵光乍现，感觉可能是线程分配的问题，一检查果然，解决了bug但是其深层次的原因还没有弄明白，这里先进行一个记录，以后深入问题再补充。

这是我一开始的代码：

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <complex.h>
#include "cuComplex.h"
#include <typeinfo>   //输出变量类型所需头文件
#include <time.h>

typedef cuDoubleComplex complexd;
using namespace std;
#define pi acos(-1)
#define CHECK(res) if(res!=cudaSuccess){exit(-1);}

int  N = 70;  //阵元个数
int Len = 2400;   //采样点数
int theta_N = 1800;  //角度步进数


__device__ __host__ complexd operator*(complexd a, complexd b) { return cuCmul(a,b); } 
__device__ __host__ complexd operator+(complexd a, complexd b) { return cuCadd(a,b); } 
__device__ __host__ complexd operator/(complexd a, complexd b) { return cuCdiv(a,b); } 
__device__ __host__ complexd exp_(complexd arg)
{
   complexd res;    //定义一个复数exp
   double s, c;     
   double e = exp(arg.x);   
   sincos(arg.y, &s, &c);
   res.x = c * e;
   res.y = s * e;
   return res;
}

/*  CUDA核函数  */
__global__ void beamforming_nb(complexd* sig_out, complexd* sig_in,complexd* time_delay,int theta_N,int Len,int N)
{
    int row = threadIdx.y + blockDim.y * blockIdx.y;
    int col = threadIdx.x + blockDim.x * blockIdx.x;
    complexd temp;
    complexd add_;
    if (row < theta_N && col < Len) 
    {
        for (int i = 0; i < N; i++) 
        {
            //theta_N x Len大小的矩阵sig_out = time_delay矩阵的row行i列 X sig_in矩阵i行col列
            add_ = time_delay[row * N + i] * sig_in[col + i * Len];
            __syncthreads();
            temp = temp + add_;  
            __syncthreads();
        }
        __syncthreads();
        sig_out[row * Len + col] = temp;
        __syncthreads();
        temp.x = 0;
        temp.y = 0;
        __syncthreads();
    }
}

 
int main(int argc, char ** argv)
{
    /*  初始参数定义  */
    const double c = 1500;  //介质声速
    //const int N = 70;     //传感器数量
    const double T = 1;     //采样时长
    const int FS = 2400;    //采样频率
    auto LEN = T*FS;        //采样点数
    double t[Len];          //时间长度
    for(int i = 0;i<Len;i++)
    {
    	t[i] = i/LEN;
        //cout <<t[i]<<endl;    //验证通过
    }
    const double f0 = 300;  //频率为300
    const double d = 0.27;  //传感器间距为0.27m
    const double deg2rad = pi/180;      // cos是弧度
    const double theta = 60;            //目标方位角角度制
    const double theta_rad = theta * deg2rad;  //目标方位角弧度制
	cudaError_t res;

    /*  原始信号定义  */
    complexd sig_[Len];      //原始信号列表，长度为：采样频率*采样时间

    for(int i = 0;i<Len;i++)
    {
        complexd temp{0,2*pi*f0*t[i]};
        sig_[i] = exp_(temp);      //原始信号
        //cout << cuCreal(sig_[i]) << '+' << cuCimag(sig_[i])<< 'i' <<'\n';     //验证通过
    }
     
    /*  加入驾驶向量  */
    complexd sig[N*Len];         //定义未加入噪声信号                   
 
    for(int i = 0;i<N;i++)
    {
        complexd steer{0,2*pi*f0*cos(theta_rad)*i*d/c};  //驾驶向量
        // cout << cuCreal(exp_(steer)) << '+' << cuCimag(exp_(steer))<< 'i' << endl;   //验证通过
        for(int j = 0;j<Len;j++)
        {
            sig[i*Len+j] = sig_[j] * exp_(steer);     //未加入噪声信号.N*Len
            // cout << cuCreal(sig[i*Len+j]) << '+' << cuCimag(sig[i*Len+j])<< 'i' <<'\n';  //验证通过
        }
    }

    /*  加入噪声   */ 


    /*  计算并加入theta_stp  */
    double theta_n = 1800;
    complexd t_delay[theta_N*N];
    for(int i = 0;i<theta_N;i++)
    {
        for(int j = 0 ;j<N;j++)
        {
            complexd tao{0,j*2*pi*f0*cos(i*(180/theta_n)*deg2rad)*d/c};
            // cout << j*2*pi*f0*cos(i*(180/theta_n)*deg2rad)*d/c << '\t' <<j <<endl;   //验证通过
            t_delay[i*N + j] = exp_(cuConj(tao)); //第i个角度下的N个补偿，最终得到theta_N*N矩阵
            // cout << cuCreal(exp_(cuConj(tao))) << '+' << cuCimag(exp_(cuConj(tao)))<< 'i' <<'\n';  //验证通过
        }
    }

    /*  CPU计算    */
    double ttt;  
    clock_t at, bt;
   	at = clock();
    complexd *h_pt;
    h_pt = (complexd *)malloc(theta_N*Len*sizeof(complexd));
    complexd h_temp;
    for(int i = 0 ; i < theta_N; i ++ )
    {
        for(int j = 0; j < Len ;  j ++)
        {
            for(int k = 0 ; k <N; k ++)
            {
                h_temp = h_temp + t_delay[i * N + k] * sig[k * Len + j];
            }
            h_pt[i * Len + j] = h_temp;
            h_temp.x = 0;
            h_temp.y = 0;
            // cout << cuCreal(h_pt[i * Len + j]) << '+' << cuCimag(h_pt[i * Len + j]) << 'i' << '\t' ;
        }
        // cout << endl;
    }
    bt = clock();
    ttt = double(bt-at)/CLOCKS_PER_SEC;
    cout << ttt << "s" << endl;


    
    /*  CUDA加速  */
    complexd *sig_in;
    complexd *time_delay;
    complexd *sig_out;


    res = cudaMalloc((void**)&sig_in,N*Len*sizeof(complexd));CHECK(res)
    res = cudaMalloc((void**)&sig_out,theta_N*Len*sizeof(complexd));CHECK(res)
    res = cudaMalloc((void**)&time_delay,theta_N*N*sizeof(complexd));CHECK(res)

    res = cudaMemcpy(sig_in,sig,N*Len*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
    res = cudaMemcpy(time_delay,t_delay,theta_N*N*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)

    dim3 threadsPerBlocks(32,32);  //先尝试角度采样点个数与采样频率相同的情况
    dim3 numBlocks((Len + threadsPerBlocks.x -1)/threadsPerBlocks.x,(theta_N + threadsPerBlocks.y -1)/threadsPerBlocks.y);

    double tt;  
    clock_t a, b;
   	a = clock();
    beamforming_nb<<<numBlocks,threadsPerBlocks>>>(sig_out,sig_in,time_delay,theta_N,Len,N);
	cudaDeviceSynchronize();
	b = clock();
    tt = double(b-a)/CLOCKS_PER_SEC;
    cout << tt << "s" << endl;


    complexd *pt_sig = NULL;  //定义输出列表

    pt_sig = (complexd*)malloc(theta_N*Len*sizeof(complexd));   //输出信息
    res = cudaMemcpy(pt_sig,sig_out,theta_N*Len*sizeof(complexd),cudaMemcpyDeviceToHost);CHECK(res)

    bool is_right = true;
    for(int i = 0 ; i <theta_N;i++)
    {
        for(int j = 0 ; j < Len;j++)
        {
            if(cuCreal(pt_sig[(i*Len+j)]) != cuCreal(h_pt[i * Len + j]) || cuCimag(pt_sig[(i*Len+j)]) != cuCimag(h_pt[i * Len + j]) )
            {
                is_right = false;
            }
            // cout << cuCreal(pt_sig[(i*Len+j)])<< '+' << cuCimag(pt_sig[(i*Len+j)]) << 'i' << '\t';
            
        }
        // cout << '\n';
    }

    printf("The result is %s!\n",is_right?"right":"false");
    cout << cuCreal(pt_sig[(2399)])<< '+' << cuCimag(pt_sig[(2399)]) << 'i' << '\n';
    cout << cuCreal(h_pt[(2400-1)])<< '+' << cuCimag(h_pt[(2400-1)]) << 'i' << '\n';
    // -3.74895500 + 1.3897567i

    cudaFree(sig_in);
    cudaFree(time_delay);
    cudaFree(sig_out);
    free(pt_sig);
    free(h_pt);
    return 0;
}

在最后的几行：

    cout << cuCreal(pt_sig[(2399)])<< '+' << cuCimag(pt_sig[(2399)]) << 'i' << '\n';
    cout << cuCreal(h_pt[(2400-1)])<< '+' << cuCimag(h_pt[(2400-1)]) << 'i' << '\n';

我将CPU端计算的结果和GPU加速计算的结果输出，得到：

前两行是CPU端计算和GPU端计算的耗时对比，下面为计算结果单个数据对比。可见，在保证CPU端计算结果完全正确的前提下，可以判断GPU加速计算的结果存在错误。

在猜测是线程分配错误之后，我对线程块和网格大小的分配进行了调整：

1. 一开始，网格大小为：(Len+threadsPerBlocks.x-1)/threadsPerBlocks.x和(theta_N+threadsPerBlocks.y-1)/threadsPerBlocks.y，带入数值计算，分别为：（2400+32-1)/32 = 75.97，(1800+32-1)/32 = 57.22，由于dim3类型的三个参数都是int型，因此最终得到的数值为75和57，网格大小为（75，57，1）。

验算：75*32 =2400，57*32 = 1824，可以得知两个方向上的线程索引值都是大于我们最终想要计算的矩阵行列大小（2400和1800）的，且根据global函数中的if判定，可以限制线程的索引值，这一切看起来是多么的合理，但问题貌似就是出在这个地方。

2. 由于不知道BUG的具体原因，题主决定死马当活马医，尝试一下让线程索引值和想要计算的矩阵行列大小恰好一样。因此，将线程块的大小，即threadPerBlocks的值设置为（8，8），刚好能够被1800和2400除尽，1800/8 = 225，2400/8 = 300，进行编译，运行：

可以看到，单个输出是对的了，现在验证所有的输出，增加一个bool类型，代码如下：

    bool is_right = true;
    for(int i = 0 ; i < theta_N;i++)
    {
        for(int j =0; j < Len;j++)
        {
            if((cuCreal(pt_sig[i*Len+j]) - cuCreal(h_pt[(i*Len+j)])) > 1e-9 || (cuCimag(pt_sig[(i*Len+j)]) - cuCimag(h_pt[(i*Len+j)])) > 1e-9)
            {
                is_right = false;
            }
        }
    }
    printf("The result is %s.",is_right? "right":"false");

这里的代码逻辑是分别计算结果实部和虚部的差值，如果插值大于1x10-9，则认定存在误差。将该部分代码放在整个代码的结尾，即可输出CPU端和GPU端计算对比结果。

除修改了线程块的大小，还修改了部分编程细节，包括global函数代码段，CPU端计算代码段，最终代码如下：

 
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include <iostream>
#include <math.h>
#include <stdio.h>
#include <complex.h>
#include "cuComplex.h"
#include <typeinfo>   //输出变量类型所需头文件
#include <time.h>
 
typedef cuDoubleComplex complexd;
using namespace std;
#define pi acos(-1)
#define CHECK(res) if(res!=cudaSuccess){exit(-1);}
 
int  N = 70;  //阵元个数
int Len = 2400;   //采样点数
int theta_N = 1800;  //角度步进数
 
 
__device__ __host__ complexd operator*(complexd a, complexd b) { return cuCmul(a,b); } 
__device__ __host__ complexd operator+(complexd a, complexd b) { return cuCadd(a,b); } 
__device__ __host__ complexd operator/(complexd a, complexd b) { return cuCdiv(a,b); } 
 
__device__ __host__ complexd exp_(complexd arg)
{
   complexd res;    //定义一个复数
   double s, c;     
   double e = exp(arg.x);   
   sincos(arg.y, &s, &c);
   res.x = c * e;
   res.y = s * e;
   return res;
}
 
/*  CUDA核函数  */
__global__ void beamforming_nb(complexd* sig_out, complexd* sig_in,complexd* time_delay,int theta_N,int Len,int N)
{
    int row = threadIdx.y + blockDim.y * blockIdx.y;
    int col = threadIdx.x + blockDim.x * blockIdx.x;
    complexd temp{0,0};
    complexd add_{0,0};
    if (row < theta_N && col < Len) 
    {
        for (int i = 0; i < N; i++) 
        {
            //theta_N x Len大小的矩阵sig_out = time_delay矩阵的row行i列 X sig_in矩阵i行col列
            temp = temp +  time_delay[row * N + i] * sig_in[col + i * Len];
        }
        sig_out[row * Len + col] = temp;
        temp = add_;
    }
 
}
 
 
int main(int argc, char ** argv)
{
    /*  初始参数定义  */
    const double c = 1500;  //介质声速
    const double T = 1;     //采样时长
    const int FS = 2400;    //采样频率
    auto LEN = T*FS;        //采样点数
    double t[Len];          //时间长度
    for(int i = 0;i<Len;i++)
    {
        t[i] = i/LEN;
        //cout <<t[i]<<endl;    //验证通过
    }
    const double f0 = 300;  //频率为300
    const double d = 0.27;  //传感器间距为0.27m
    const double deg2rad = pi/180;      // cos是弧度
    const double theta = 60;            //目标方位角角度制
    const double theta_rad = theta * deg2rad;  //目标方位角弧度制
    cudaError_t res;
 
    /*  原始信号定义  */
    complexd sig_[Len];      //原始信号列表，长度为：采样频率*采样时间
 
    for(int i = 0;i<Len;i++)
    {
        complexd temp{0,2*pi*f0*t[i]};
        sig_[i] = exp_(temp);      //原始信号
        //cout << cuCreal(sig_[i]) << '+' << cuCimag(sig_[i])<< 'i' <<'\n';     //验证通过
    }
     
    /*  加入驾驶向量  */
    complexd sig[N*Len];         //定义未加入噪声信号                   
 
    for(int i = 0;i<N;i++)
    {
        complexd steer{0,2*pi*f0*cos(theta_rad)*i*d/c};  //驾驶向量
        // cout << cuCreal(exp_(steer)) << '+' << cuCimag(exp_(steer))<< 'i' << endl;   //验证通过
        for(int j = 0;j<Len;j++)
        {
            sig[i*Len+j] = sig_[j] * exp_(steer);     //未加入噪声信号.N*Len
            // cout << cuCreal(sig[i*Len+j]) << '+' << cuCimag(sig[i*Len+j])<< 'i' <<'\n';  //验证通过
        }
    }
 
    /*  计算并加入theta_stp  */
    double theta_n = 1800;
    complexd t_delay[theta_N*N];
    for(int i = 0;i<theta_N;i++)
    {
        for(int j = 0 ;j<N;j++)
        {
            complexd tao{0,j*2*pi*f0*cos(i*(180/theta_n)*deg2rad)*d/c};
            t_delay[i*N + j] = exp_(cuConj(tao)); //第i个角度下的N个补偿，最终得到theta_N*N矩阵
        }
    }
 
    /*  CPU计算    */
    double ttt;  
    clock_t at, bt;
    at = clock();
    complexd *h_pt;
    h_pt = (complexd *)malloc(theta_N*Len*sizeof(complexd));
    complexd a_temp{0,0};
    complexd h_temp{0,0};
    for(int i = 0 ; i < theta_N; i ++ )
    {
        for(int j = 0; j < Len ;  j ++)
        {
            for(int k = 0 ; k <N; k ++)
             {
                h_temp = h_temp + t_delay[i * N + k] * sig[k * Len + j];
            }
            h_pt[i * Len + j] = h_temp;
            h_temp = a_temp;
         }
 
     }
    bt = clock();
    ttt = double(bt-at)/CLOCKS_PER_SEC;
    cout << ttt << "s" << endl;
 
 
    
    /*  CUDA加速  */
    complexd *sig_in;
    complexd *time_delay;
    complexd *sig_out;
 
 
    res = cudaMalloc((void**)&sig_in,N*Len*sizeof(complexd));CHECK(res)
    res = cudaMalloc((void**)&sig_out,theta_N*Len*sizeof(complexd));CHECK(res)
    res = cudaMalloc((void**)&time_delay,theta_N*N*sizeof(complexd));CHECK(res)
 
    res = cudaMemcpy(sig_in,sig,N*Len*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
    res = cudaMemcpy(time_delay,t_delay,theta_N*N*sizeof(complexd),cudaMemcpyHostToDevice);CHECK(res)
 
    dim3 threadsPerBlocks(8,8);  //先尝试角度采样点个数与采样频率相同的情况
    dim3 numBlocks((Len)/threadsPerBlocks.x,(theta_N)/threadsPerBlocks.y);
 
    /*
    Jetson Orin 模块包含以下内容：NVIDIA Ampere架构GPU，
    具有多达2048个CUDA 核、多达64个Tensor核多达12个Arm A78AE CPU核
    */
 
    double tt;  
    clock_t a, b;
    a = clock();
    beamforming_nb<<<numBlocks,threadsPerBlocks>>>(sig_out,sig_in,time_delay,theta_N,Len,N);
    cudaDeviceSynchronize();
    b = clock();
    tt = double(b-a)/CLOCKS_PER_SEC;
    cout << tt << "s" << endl;
 
 
    complexd *pt_sig = NULL;  //定义输出列表
 
    pt_sig = (complexd*)malloc(theta_N*Len*sizeof(complexd));   //输出信息
    res = cudaMemcpy(pt_sig,sig_out,theta_N*Len*sizeof(complexd),cudaMemcpyDeviceToHost);CHECK(res)
 
    bool is_right = true;
    for(int i = 0 ; i < theta_N;i++)
    {
        for(int j =0; j < Len;j++)
        {
            if((cuCreal(pt_sig[i*Len+j]) - cuCreal(h_pt[(i*Len+j)])) > 1e-9 || (cuCimag(pt_sig[(i*Len+j)]) - cuCimag(h_pt[(i*Len+j)])) > 1e-9)
            {
                is_right = false;
                cout << "第" << i*Len + j <<"个是错误的。"<< endl;
                cout << "GPU:" << cuCreal(pt_sig[(i*Len+j)])<< '+' << cuCimag(pt_sig[(i*Len+j)]) << 'i' << '\n'; 
                cout << "CPU:" << cuCreal(h_pt[(i*Len+j)])<< '+' << cuCimag(h_pt[(i*Len+j)]) << 'i' << '\n';
            }
        }
    }
    printf("The result is %s.\n",is_right? "right":"false");
 
 
 
    cout << cuCreal(pt_sig[(2399)])<< '+' << cuCimag(pt_sig[(2399)]) << 'i' << '\n';
    cout << cuCreal(h_pt[(2400-1)])<< '+' << cuCimag(h_pt[(2400-1)]) << 'i' << '\n';

 
    cudaFree(sig_in);
    cudaFree(time_delay);
    cudaFree(sig_out);
    free(pt_sig);
    free(h_pt);
    return 0;
}

输出结果：