使用CUDA时间不长,遇到了棘手的问题,希望高手指点,谢谢。
用CUDA核函数计算的结果与用C++循环的结果有差别,主要是float最后2位。程序中,h_spec
数组用于存储C++循环的计算结果,d_spec用于存储核函数计算结果,将d_spec拷贝到数组h_d_spec,比较h_spec与h_d_spec。当h_spec与h_d_spec均有16641(129*129)个元素时,h_spec与h_d_spec中不相等的元素数为6723。不相等的元素差值很小,例如:
h_d_spec[0] = 4.3036454e-008, h_spec[0] = 4.3036462e-008, 差值: -7.1054274e-015
h_d_spec[1] = 4.5101640e-008, h_spec[1] = 4.5101658e-008, 差值: -1.7763568e-014
不知道这种错误产生的原因到底是什么。原因一,可能是程序写得有问题;原因二,CPU和GPU浮点精度有差别。
程序所用GPU: Geforce GTX 460, Geforce GT430。程序代码如下:
// complex_test.cpp
#include "stdafx.h"
#include
#include
#include "iostream"
#include "stdlib.h"
#define _PI 3.14159265358979323846
#define _PI2 _PI*2.0
#define _GRAVITY 9.81
#define _GRAVITY2 96.2361
extern "C" void DEVICE_spec_func(float * spec, float A, float L, float Speed, float2 Dir_vector, int dim, float damping);
float spec_func(float A,
float2 Dir,
float Speed,
float2 K,
float reflDamping)
{
float k2 = K.x * K.x + K.y * K.y;
float Dir_len2 = Dir.x * Dir.x + Dir.y *Dir.y;
if (k2 == 0.f)
return 0.f;
float k4 = k2 * k2;
float KdotW =K.x * Dir.x + K.y * Dir.y;
if (KdotW < 0.f)
return 0.0f;
float KdotWhat = KdotW*KdotW/(k2*Dir_len2);
float Speed4 = pow(Speed, 4.0f);
float eterm = exp( -1.0f * _GRAVITY2 / (k2*Speed4) ) / k4;
float specResult = A * eterm * KdotWhat ;
return specResult;
}
void test_spec()
{
float L = 256.0f;
float speed = 20.0f;
float2 dir_vector;
dir_vector.x = cos(_PI/4);
dir_vector.x = sin(_PI/4);
int _N = 128;
float _A = pow(_PI2/L, 2.0) * 3.48/1000.0;
float2 K;
float * h_spec = (float *) malloc((_N+1)*(_N+1)*sizeof(float)); //host array
float * d_spec = NULL; //device array
cudaMalloc((void**)&d_spec,(_N+1)*(_N+1)*sizeof(float));
float * h_d_spec = (float *) malloc((_N+1)*(_N+1)*sizeof(float)); //copy device array to the array
float * differ = (float *) malloc((_N+1)*(_N+1)*sizeof(float)); //difference
int num = 0;
int h_nonezero_num = 0;
int d_nonezero_num = 0;
int h_d_zero = 0;
for(int y=0; y<=_N; y++)
{
K.y = _PI2 / L * (float)(-_N/2 + y);
for(int x=0; x<=_N; x++)
{
K.x = _PI2 / L * (float)(-_N/2 + x);
int offset = y * (_N+1) + x;
h_spec[offset] = spec_func(_A, dir_vector, speed, K, 0.0);
if(h_spec[offset]!=0.0)
{
h_nonezero_num+=1;
}
}
}
DEVICE_spec_func(d_spec, _A, L, speed, dir_vector, _N+1, 0.0f);
cudaMemcpy(h_d_spec, d_spec, (_N+1)*(_N+1)*sizeof(float), cudaMemcpyDeviceToHost);
for(int y=0; y<=_N; y++)
{
for(int x=0; x<=_N; x++)
{
int offset = y * (_N+1) + x;
differ[offset] = h_d_spec[offset] - h_spec[offset];
if(h_d_spec[offset]!=0.0)
{
d_nonezero_num += 1;
}
if(differ[offset] != 0.0)
{
num += 1;
}
if(h_spec[offset]!=0.0 && h_d_spec[offset]==0.0 )
{
h_d_zero += 1;
}
}
}
printf("Not equal: %d \n", num);
}
int _tmain(int argc, _TCHAR* argv[])
{
test_spec();
return 0;
}
//kernel.cu
#include "cuda_runtime.h"
#include "device_functions_decls.h"
#include "cufft.h"
#include "cufftw.h"
#include "curand.h"
//#include "math.h"
#define _PI 3.14159265358979323846
#define _PI2 _PI*2.0
#define _GRAVITY 9.81
#define _GRAVITY2 96.2361
__device__ float d_spec_func(const float A,
const float2 Dir,
const float Speed,
const float2 K,
const float reflDamping)
{
float k2 = K.x * K.x + K.y * K.y;
float Dir_len2 = Dir.x * Dir.x + Dir.y *Dir.y;
if (k2 == 0.f)
return 0.f;
float k4 = k2 * k2;
float KdotW =K.x * Dir.x + K.y * Dir.y;
if (KdotW < 0.f)
return 0.0f;
float KdotWhat = KdotW*KdotW/(k2*Dir_len2);
float Speed4 = pow(Speed, 4.0f);
float eterm = exp( -1.0f * _GRAVITY2 / (k2*Speed4) ) / k4;
float specResult = A * eterm * KdotWhat ;
//float specResult = KdotWhat;
return specResult;
}
__global__ void kernel_spec(float * spec,
float A,
float L,
float Speed,
float2 Dir_vector,
int dim,
float damping)
{
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * dim;
float2 K;
K.x = _PI2 / L * (float)(-(dim-1)/2 + x);
K.y = _PI2 / L * (float)(-(dim-1)/2 + y);
spec[offset] = d_spec_func(A, Dir_vector, Speed, K, damping);
}
extern "C"
void DEVICE_spec_func(float * spec,
float A,
float L,
float Speed,
float2 Dir_vector,
int dim,
float damping)
{
dim3 threads(1,1,1);
dim3 blocks(dim, dim, 1);
kernel_spec<<>>(spec, A, L, Speed, Dir_vector, dim, damping);
}