Problem description
While compiling my own reduce code with nvcc 10.0, I unexpectedly found that it produces the wrong result. Here is my reduce code:
#include "stdlib.h"
#include "stdio.h"
#define block_size_x 1024
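//- init_arr fills arr with 0, 1, -2, 3, -4, 5, ...: each pair of elements sums to 1, so the expected total over nElem elements is nElem / 2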
void init_arr(float* arr, int nElem)
{
float a = -1.0f;
for(int i = 0; i < nElem; i++)
{
arr[i] = a * i;
a = a * -1;
}
}
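//- kernel1: per-block reduction in shared memory; thread 0 of each block writes the block's partial sum back to arr[blockIdx.x * blockDim.x]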
__global__ void kernel1(float* arr, int nElem)
{
int gid = blockIdx.x * blockDim.x + threadIdx.x;
__shared__ float tmp[block_size_x];
if(gid < nElem)
{
tmp[threadIdx.x] = arr[gid];
}
else
{
tmp[threadIdx.x] = 0.0;
}
__syncthreads();
//- loop unrolling
if(blockDim.x > 512)
{
if(threadIdx.x < 512 && threadIdx.x + 512 < blockDim.x)
{
tmp[threadIdx.x] += tmp[threadIdx.x + 512];
}
}
__syncthreads();
if(blockDim.x > 256)
{
if(threadIdx.x < 256 && threadIdx.x + 256 < blockDim.x)
{
tmp[threadIdx.x] += tmp[threadIdx.x + 256];
}
}
__syncthreads();
if(blockDim.x > 128)
{
if(threadIdx.x < 128 && threadIdx.x + 128 < blockDim.x)
{
tmp[threadIdx.x] += tmp[threadIdx.x + 128];
}
}
__syncthreads();
if(blockDim.x > 64)
{
if(threadIdx.x < 64 && threadIdx.x + 64 < blockDim.x)
{
tmp[threadIdx.x] += tmp[threadIdx.x + 64];
}
}
__syncthreads();
if(blockDim.x > 32)
{
if(threadIdx.x < 32 && threadIdx.x + 32 < blockDim.x)
{
tmp[threadIdx.x] += tmp[threadIdx.x + 32];
}
}
__syncthreads();
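//- remaining 32 elements: reduced by threads 0-31 only, with no further __syncthreads() between the steps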
if(threadIdx.x >=32)
return;
tmp[threadIdx.x] += tmp[threadIdx.x + 16];
tmp[threadIdx.x] += tmp[threadIdx.x + 8];
tmp[threadIdx.x] += tmp[threadIdx.x + 4];
tmp[threadIdx.x] += tmp[threadIdx.x + 2];
tmp[threadIdx.x] += tmp[threadIdx.x + 1];
if(threadIdx.x == 0)
arr[gid] = tmp[0];
}
int main(int argc, char** argv)
{
float *h_arr, *d_arr;
int nElem = 1<<14;
h_arr = (float*)malloc(nElem * sizeof(float));
cudaMalloc((void**)&d_arr, nElem * sizeof(float));
init_arr(h_arr, nElem);
cudaMemcpy(d_arr, h_arr, nElem * sizeof(float), cudaMemcpyHostToDevice);
dim3 block(block_size_x);
dim3 grid((nElem + block.x - 1) / block.x);
kernel1<<<grid, block>>>(d_arr, nElem);
cudaMemcpy(h_arr, d_arr, nElem * sizeof(float), cudaMemcpyDeviceToHost);
float result = 0;
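//- each block's partial sum sits at the first global index of that block, so accumulate those entries on the host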
for(int i = 0; i < grid.x; i++)
{
result += h_arr[i * block.x];
}
printf("theory answer is %d sum is %0.0f\n", nElem / 2, result);
return 0;
}
The compile command and the run result:
[mmhe@k057 reduce]$ nvcc -arch=sm_70 reduce1.cu -o test
[mmhe@k057 reduce]$ ./test
theory answer is 8192 sum is -16759296
I checked the code over and over without spotting the problem, so I printed the whole array after every thread-block synchronization. Strangely, once I added print segments after each of the last few reduce steps, the program unexpectedly produced the correct result, and as soon as I commented them out, the computation went wrong again.
if(threadIdx.x >=32)
return;
tmp[threadIdx.x] += tmp[threadIdx.x + 16];
//- print segment for the tmp array inserted here
tmp[threadIdx.x] += tmp[threadIdx.x + 8];
//- print segment for the tmp array inserted here
tmp[threadIdx.x] += tmp[threadIdx.x + 4];
//- print segment for the tmp array inserted here
tmp[threadIdx.x] += tmp[threadIdx.x + 2];
//- print segment for the tmp array inserted here
tmp[threadIdx.x] += tmp[threadIdx.x + 1];
//- print segment for the tmp array inserted here
}
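To make the placeholder comments above concrete, the kind of print segment meant there is sketched below (thread 0 dumping the first entries of tmp with device-side printf); this is only an illustration, not necessarily the exact code that was inserted:
//- illustration only: dump the first 32 entries of tmp from thread 0 of the block
if(threadIdx.x == 0)
{
for(int i = 0; i < 32; i++)
{
printf("block %d, tmp[%d] = %f\n", (int)blockIdx.x, i, tmp[i]);
}
}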
It suddenly occurred to me that this might be nvcc's automatic optimization breaking the logic of the code. With this guess in mind, I recompiled in debug mode and ran it again, and indeed it now produced the correct result.
[mmhe@k057 reduce]$ nvcc -g -G -arch=sm_70 reduce1.cu -o test
[mmhe@k057 reduce]$ ./test
theory answer is 8192 sum is 8192
This was a reminder to me that nvcc's automatic optimization may introduce logic errors into a piece of code.