下面的程序是我在VS2010下做的,看起来是正确的,但我仔细看运行结果的数据将CPU上执行的结果和GPU上的比较了一下,发现GPU上的程序运行出错了,计算结果不正确,希望哪们大侠给看看,这段程序中存在什么问题。
/*矩阵加法,程序运行通过,运行不稳定,时好时坏,另外效率不高,GPU执行速度很慢 */
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<sdkHelper.h>
#include<cutil_inline.h>//用safecall时要包含该头文件
//Device code
#define N 8
void output(float *arr,int width,int height);
__global__ void MatAdd(const float *A, const float *B, float *C, size_t pitch, int height, int width)
{
int i = blockIdx.y * blockDim.y + threadIdx.y;
int j = blockIdx.x * blockDim.x + threadIdx.x;
if(i < height && j < width)
// C[i * pitch/ sizeof(float) + j] = A[i * pitch / sizeof(float) + j] + B[i * pitch / sizeof(float) + j];
C[i*width+j]=A[i*width+j]+B[i*width+j];
}
//Host code
int main()
{
float *h_A,*h_B,*h_C;
int width=N,height=N,i,j;
size_t size=sizeof(float)*width*height;
size_t pitch;
StopWatchInterface *timer = NULL;
//为CPU上的矩阵数组申请内存空间
h_A = (float*)malloc(size);
h_B = (float*)malloc(size);
h_C = (float*)malloc(size);
//初始化数据
for(i=0;i<width;i++)
for(j=0;j<height;j++)
{
h_A[i*width+j]=i+j;
h_B[i*width+j]=2*i+3*j;
h_C[i*width+j]=0;
}
printf("输出A矩阵的前5*5小矩阵块\n");
output(h_A,width,height);
printf("输出B矩阵的前5*5小矩阵块\n");
output(h_B,width,height);
printf("输出C矩阵的前5*5小矩阵块\n");
output(h_C,width,height);
//在CPU上计算两矩阵的加法
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
for(i=0;i<width;i++)
{
for(j=0;j<height;j++)
{
h_C[i*width+j]=h_A[i*width+j]+h_B[i*width+j];
}
}
sdkStopTimer(&timer);
printf("\nCPU Processing time %f(ms)\n",sdkGetTimerValue(&timer));
printf("输出C矩阵的前5*5小矩阵块\n");
output(h_C,width,height);
//定义设备数组指针
float *d_A,*d_B,*d_C;
//为GPU上的矩阵数组申请显存空间
cutilSafeCall(cudaMallocPitch((void**) &d_A, &pitch, sizeof(float) * width, height));
cutilSafeCall(cudaMallocPitch((void**) &d_B, &pitch, sizeof(float) * width, height));
cutilSafeCall(cudaMallocPitch((void**) &d_C, &pitch, sizeof(float) * width, height));
//将CPU上的矩阵数组cpu_A、cpu_B分别拷贝到GPU上的矩阵数组gpu_A、gpu_B中
cutilSafeCall( cudaMemcpy2D( d_A, pitch, h_A, sizeof(float) * width, sizeof(float) * width, height, cudaMemcpyHostToDevice));
cutilSafeCall( cudaMemcpy2D( d_B, pitch, h_B, sizeof(float) * width, sizeof(float) * width, height, cudaMemcpyHostToDevice));
//调用核函数
int threadsPerBlock=width;
int blocksPerGrid=(width*height+threadsPerBlock-1)/threadsPerBlock;
sdkStartTimer(&timer);
MatAdd<<<blocksPerGrid,threadsPerBlock>>>(d_A,d_B,d_C,pitch,width,height);
sdkStopTimer(&timer);
printf("\nGPU Processing time %f(ms)\n",sdkGetTimerValue(&timer));
//将显存数组gpu_C拷贝会内存数组cpu_C
cutilSafeCall( cudaMemcpy2D( h_C, sizeof(float) * width, d_C, pitch, sizeof(float) * width, height,cudaMemcpyDeviceToHost));
printf("调用GPU执行后\n输出结果矩阵C的前5*5数据\n");
output(h_C,width,height);
getchar();
//释放内存空间
free(h_A);
free(h_B);
free(h_C);
//释放显存空间
cutilSafeCall( cudaFree(d_A));
cutilSafeCall( cudaFree(d_B));
cutilSafeCall( cudaFree(d_C));
return 0;
}
void output(float *arr,int width,int height)
{
int i,j;
for(i=0;i<5;i++)
{
for(j=0;j<5;j++)
{
printf("%-8.1f",*(arr+i*width+j));
}
printf("\n");
}
}