c语言中cuda核函数,CUDA核函数计算结果与C++循环计算结果又差别，求助。

最新推荐文章于 2023-07-08 21:39:54 发布

老卫15034297801

最新推荐文章于 2023-07-08 21:39:54 发布

阅读量336

点赞数

文章标签： c语言中cuda核函数

使用CUDA时间不长，遇到了棘手的问题，希望高手指点，谢谢。

用CUDA核函数计算的结果与用C++循环的结果有差别，主要是float最后2位。程序中，h_spec

数组用于存储C++循环的计算结果，d_spec用于存储核函数计算结果，将d_spec拷贝到数组h_d_spec，比较h_spec与h_d_spec。当h_spec与h_d_spec均有16641(129*129)个元素时，h_spec与h_d_spec中不相等的元素数为6723。不相等的元素差值很小，例如：

h_d_spec[0] = 4.3036454e-008, h_spec[0] = 4.3036462e-008, 差值: -7.1054274e-015

h_d_spec[1] = 4.5101640e-008, h_spec[1] = 4.5101658e-008, 差值: -1.7763568e-014

不知道这种错误产生的原因到底是什么。原因一，可能是程序写得有问题；原因二，CPU和GPU浮点精度有差别。

程序所用GPU: Geforce GTX 460, Geforce GT430。程序代码如下：

// complex_test.cpp

#include "stdafx.h"

#include

#include "iostream"

#include "stdlib.h"

#define _PI 3.14159265358979323846

#define _PI2 _PI*2.0

#define _GRAVITY 9.81

#define _GRAVITY2 96.2361

extern "C" void DEVICE_spec_func(float * spec, float A, float L, float Speed, float2 Dir_vector, int dim, float damping);

float spec_func(float A,

float2 Dir,

float Speed,

float2 K,

float reflDamping)

{

float k2 = K.x * K.x + K.y * K.y;

float Dir_len2 = Dir.x * Dir.x + Dir.y *Dir.y;

if (k2 == 0.f)

return 0.f;

float k4 = k2 * k2;

float KdotW =K.x * Dir.x + K.y * Dir.y;

if (KdotW < 0.f)

return 0.0f;

float KdotWhat = KdotW*KdotW/(k2*Dir_len2);

float Speed4 = pow(Speed, 4.0f);

float eterm = exp( -1.0f * _GRAVITY2 / (k2*Speed4) ) / k4;

float specResult = A * eterm * KdotWhat ;

return specResult;

}

void test_spec()

{

float L = 256.0f;

float speed = 20.0f;

float2 dir_vector;

dir_vector.x = cos(_PI/4);

dir_vector.x = sin(_PI/4);

int _N = 128;

float _A = pow(_PI2/L, 2.0) * 3.48/1000.0;

float2 K;

float * h_spec = (float *) malloc((_N+1)*(_N+1)*sizeof(float)); //host array

float * d_spec = NULL; //device array

cudaMalloc((void**)&d_spec,(_N+1)*(_N+1)*sizeof(float));

float * h_d_spec = (float *) malloc((_N+1)*(_N+1)*sizeof(float)); //copy device array to the array

float * differ = (float *) malloc((_N+1)*(_N+1)*sizeof(float)); //difference

int num = 0;

int h_nonezero_num = 0;

int d_nonezero_num = 0;

int h_d_zero = 0;

for(int y=0; y<=_N; y++)

{

K.y = _PI2 / L * (float)(-_N/2 + y);

for(int x=0; x<=_N; x++)

{

K.x = _PI2 / L * (float)(-_N/2 + x);

int offset = y * (_N+1) + x;

h_spec[offset] = spec_func(_A, dir_vector, speed, K, 0.0);

if(h_spec[offset]!=0.0)

{

h_nonezero_num+=1;

}

DEVICE_spec_func(d_spec, _A, L, speed, dir_vector, _N+1, 0.0f);

cudaMemcpy(h_d_spec, d_spec, (_N+1)*(_N+1)*sizeof(float), cudaMemcpyDeviceToHost);

for(int y=0; y<=_N; y++)

{

for(int x=0; x<=_N; x++)

{

int offset = y * (_N+1) + x;

differ[offset] = h_d_spec[offset] - h_spec[offset];

if(h_d_spec[offset]!=0.0)

{

d_nonezero_num += 1;

}

if(differ[offset] != 0.0)

{

num += 1;

}

if(h_spec[offset]!=0.0 && h_d_spec[offset]==0.0 )

{

h_d_zero += 1;

}

printf("Not equal: %d \n", num);

}

int _tmain(int argc, _TCHAR* argv[])

{

test_spec();

return 0;

}

//kernel.cu

#include "cuda_runtime.h"

#include "device_functions_decls.h"

#include "cufft.h"

#include "cufftw.h"

#include "curand.h"

//#include "math.h"

#define _PI 3.14159265358979323846

#define _PI2 _PI*2.0

#define _GRAVITY 9.81

#define _GRAVITY2 96.2361

__device__ float d_spec_func(const float A,

const float2 Dir,

const float Speed,

const float2 K,

const float reflDamping)

{

float k2 = K.x * K.x + K.y * K.y;

float Dir_len2 = Dir.x * Dir.x + Dir.y *Dir.y;

if (k2 == 0.f)

return 0.f;

float k4 = k2 * k2;

float KdotW =K.x * Dir.x + K.y * Dir.y;

if (KdotW < 0.f)

return 0.0f;

float KdotWhat = KdotW*KdotW/(k2*Dir_len2);

float Speed4 = pow(Speed, 4.0f);

float eterm = exp( -1.0f * _GRAVITY2 / (k2*Speed4) ) / k4;

float specResult = A * eterm * KdotWhat ;

//float specResult = KdotWhat;

return specResult;

}

__global__ void kernel_spec(float * spec,

float A,

float L,

float Speed,

float2 Dir_vector,

int dim,

float damping)

{

int x = threadIdx.x + blockIdx.x * blockDim.x;

int y = threadIdx.y + blockIdx.y * blockDim.y;

int offset = x + y * dim;

float2 K;

K.x = _PI2 / L * (float)(-(dim-1)/2 + x);

K.y = _PI2 / L * (float)(-(dim-1)/2 + y);

spec[offset] = d_spec_func(A, Dir_vector, Speed, K, damping);

}

extern "C"

void DEVICE_spec_func(float * spec,

float A,

float L,

float Speed,

float2 Dir_vector,

int dim,

float damping)

{

dim3 threads(1,1,1);

dim3 blocks(dim, dim, 1);

kernel_spec<<>>(spec, A, L, Speed, Dir_vector, dim, damping);

}

老卫15034297801

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
c语言中cuda核函数,CUDA核函数计算结果与C++循环计算结果又差别，求助。

使用CUDA时间不长，遇到了棘手的问题，希望高手指点，谢谢。用CUDA核函数计算的结果与用C++循环的结果有差别，主要是float最后2位。程序中，h_spec数组用于存储C++循环的计算结果，d_spec用于存储核函数计算结果，将d_spec拷贝到数组h_d_spec，比较h_spec与h_d_spec。当h_spec与h_d_spec均有16641(129*129)个元素时，h_spec与h_...
复制链接

扫一扫