VS2010下矩阵加法

最新推荐文章于 2024-06-21 11:25:43 发布

wufenxia

最新推荐文章于 2024-06-21 11:25:43 发布

阅读量2.3k

点赞数

分类专栏： GPU编程

本文链接：https://blog.csdn.net/wufenxia/article/details/8279873

版权

GPU编程专栏收录该内容

8 篇文章 0 订阅

订阅专栏

下面的程序是我在VS2010下做的，看起来是正确的，但我仔细看运行结果的数据将CPU上执行的结果和GPU上的比较了一下，发现GPU上的程序运行出错了，计算结果不正确，希望哪们大侠给看看，这段程序中存在什么问题。

/*矩阵加法，程序运行通过,运行不稳定，时好时坏，另外效率不高，GPU执行速度很慢 */
#include<stdio.h>
#include<stdlib.h>
#include<cuda_runtime.h>
#include<sdkHelper.h>
#include<cutil_inline.h>//用safecall时要包含该头文件
//Device code

#define N 8
void output(float *arr,int width,int height);

__global__ void MatAdd(const float *A, const float *B, float *C, size_t pitch, int height, int width)
{
       int i = blockIdx.y * blockDim.y + threadIdx.y;
       int j = blockIdx.x * blockDim.x + threadIdx.x;

       if(i < height && j < width)
      // C[i * pitch/ sizeof(float) + j] = A[i * pitch / sizeof(float) + j] + B[i * pitch / sizeof(float) + j];
       C[i*width+j]=A[i*width+j]+B[i*width+j];
}

//Host code
int main()
{
float *h_A,*h_B,*h_C;
int width=N,height=N,i,j;
size_t size=sizeof(float)*width*height;
size_t pitch;
StopWatchInterface *timer = NULL;

//为CPU上的矩阵数组申请内存空间
     h_A = (float*)malloc(size);
     h_B = (float*)malloc(size);
     h_C = (float*)malloc(size);

//初始化数据
for(i=0;i<width;i++)
  for(j=0;j<height;j++)
  {
   h_A[i*width+j]=i+j;
   h_B[i*width+j]=2*i+3*j;
   h_C[i*width+j]=0;
  }

printf("输出A矩阵的前5*5小矩阵块\n");
output(h_A,width,height);
printf("输出B矩阵的前5*5小矩阵块\n");
output(h_B,width,height);
printf("输出C矩阵的前5*5小矩阵块\n");
output(h_C,width,height);
//在CPU上计算两矩阵的加法
sdkCreateTimer(&timer);
sdkStartTimer(&timer);
for(i=0;i<width;i++)
{
  for(j=0;j<height;j++)
  {
   h_C[i*width+j]=h_A[i*width+j]+h_B[i*width+j];
  }
}
sdkStopTimer(&timer);
printf("\nCPU Processing time %f(ms)\n",sdkGetTimerValue(&timer));

printf("输出C矩阵的前5*5小矩阵块\n");
output(h_C,width,height);
//定义设备数组指针
float *d_A,*d_B,*d_C;
//为GPU上的矩阵数组申请显存空间
cutilSafeCall(cudaMallocPitch((void**) &d_A, &pitch, sizeof(float) * width, height));
    cutilSafeCall(cudaMallocPitch((void**) &d_B, &pitch, sizeof(float) * width, height));
    cutilSafeCall(cudaMallocPitch((void**) &d_C, &pitch, sizeof(float) * width, height));

//将CPU上的矩阵数组cpu_A、cpu_B分别拷贝到GPU上的矩阵数组gpu_A、gpu_B中
    cutilSafeCall( cudaMemcpy2D( d_A, pitch, h_A, sizeof(float) * width, sizeof(float) * width, height, cudaMemcpyHostToDevice));
    cutilSafeCall( cudaMemcpy2D( d_B, pitch, h_B, sizeof(float) * width, sizeof(float) * width, height, cudaMemcpyHostToDevice));

//调用核函数
int threadsPerBlock=width;
int blocksPerGrid=(width*height+threadsPerBlock-1)/threadsPerBlock;
sdkStartTimer(&timer);
MatAdd<<<blocksPerGrid,threadsPerBlock>>>(d_A,d_B,d_C,pitch,width,height);
sdkStopTimer(&timer);
printf("\nGPU Processing time %f(ms)\n",sdkGetTimerValue(&timer));
//将显存数组gpu_C拷贝会内存数组cpu_C
    cutilSafeCall( cudaMemcpy2D( h_C, sizeof(float) * width, d_C, pitch, sizeof(float) * width, height,cudaMemcpyDeviceToHost));

printf("调用GPU执行后\n输出结果矩阵C的前5*5数据\n");
output(h_C,width,height);
getchar();
//释放内存空间
free(h_A);
free(h_B);
free(h_C);
//释放显存空间
    cutilSafeCall( cudaFree(d_A));
    cutilSafeCall( cudaFree(d_B));
    cutilSafeCall( cudaFree(d_C));

return 0;
}
void output(float *arr,int width,int height)
{
int i,j;
for(i=0;i<5;i++)
{
  for(j=0;j<5;j++)
  {
   printf("%-8.1f",*(arr+i*width+j));
  }
  printf("\n");
}
}