这几天一直在调试 CUDA 下的矩阵相加程序,但一直运行出错,最后才发现是我的 NVIDIA 显卡(计算能力低于 1.3)不支持 double 类型,才导致了错误。
下面是我的代码:
#include "stdio.h"
#include "stdlib.h"
#include "cuda_runtime.h"
#include "cutil_inline.h"
// Element-wise matrix addition: D_C = D_A + D_B over a height x weidth matrix.
// Expects a 2D launch where the x dimension covers columns and the y dimension
// covers rows. All three device arrays must come from cudaMallocPitch with the
// same `pitch` (bytes between the starts of consecutive rows).
__global__ void cal(double *D_A, double *D_B, double *D_C, int weidth, int height, size_t pitch)
{
    int col = threadIdx.x + blockDim.x * blockIdx.x;
    int row = threadIdx.y + blockDim.y * blockIdx.y;
    // Guard against threads past the matrix edge (grid may overshoot).
    if (col < weidth && row < height) {
        // Index pitched memory by byte offset, as the CUDA docs recommend.
        // Unlike `row * pitch / sizeof(double) + col`, this needs no
        // assumption that pitch is a multiple of sizeof(double).
        double *rowA = (double *)((char *)D_A + row * pitch);
        double *rowB = (double *)((char *)D_B + row * pitch);
        double *rowC = (double *)((char *)D_C + row * pitch);
        rowC[col] = rowA[col] + rowB[col];
    }
}
// Abort with a readable message when a CUDA API call fails. Kernel launches
// report nothing directly, so this is also used with cudaGetLastError() and
// cudaDeviceSynchronize() after the launch.
static void checkCuda(cudaError_t err, const char *what)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
        exit(1);
    }
}

// Adds two height x weidth double matrices on the GPU via pitched memory.
int main()
{
    int height = 10;  // rows
    int weidth = 3;   // columns (sic: "width")

    // Flat row-major host buffers; element (i, j) lives at [i * weidth + j].
    double *H_A = (double *)malloc(sizeof(double) * weidth * height);
    double *H_B = (double *)malloc(sizeof(double) * weidth * height);
    double *H_C = (double *)malloc(sizeof(double) * weidth * height);
    if (H_A == NULL || H_B == NULL || H_C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Initialize: A[i][j] = i + j, B = all ones, C = zeros; echo A and B.
    printf("start input H_A\n");
    for (int i = 0; i < height; i++) {
        for (int j = 0; j < weidth; j++) {
            H_A[i * weidth + j] = i + j;
            H_B[i * weidth + j] = 1;
            H_C[i * weidth + j] = 0;
            printf("%f %f ", H_B[i * weidth + j], H_A[i * weidth + j]);
        }
        printf("\n");
    }
    printf("finish input\n");

    // Pitched device allocations: rows are padded to `pitch` bytes so each
    // row start is aligned. All three calls use the same row byte-width, so
    // the same pitch comes back each time.
    size_t pitch;
    double *D_A, *D_B, *D_C;
    checkCuda(cudaMallocPitch((void **)&D_A, &pitch, weidth * sizeof(double), height),
              "cudaMallocPitch D_A");
    checkCuda(cudaMallocPitch((void **)&D_B, &pitch, weidth * sizeof(double), height),
              "cudaMallocPitch D_B");
    checkCuda(cudaMallocPitch((void **)&D_C, &pitch, weidth * sizeof(double), height),
              "cudaMallocPitch D_C");
    printf("pitch is %zu\n", pitch);  // pitch is size_t: %zu, not %d

    // Repack the tightly-packed host rows into the pitched device rows.
    checkCuda(cudaMemcpy2D(D_A, pitch, H_A, weidth * sizeof(double),
                           weidth * sizeof(double), height, cudaMemcpyHostToDevice),
              "copy H_A to device");
    checkCuda(cudaMemcpy2D(D_B, pitch, H_B, weidth * sizeof(double),
                           weidth * sizeof(double), height, cudaMemcpyHostToDevice),
              "copy H_B to device");

    // One thread per element; round the grid up so any width/height works
    // (the kernel bounds-checks the overshoot).
    dim3 dimBlock(16, 16, 1);
    dim3 dimGrid((weidth + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y, 1);
    cal<<<dimGrid, dimBlock>>>(D_A, D_B, D_C, weidth, height, pitch);
    // Surface launch-configuration errors and asynchronous execution errors
    // explicitly -- without this, a failed kernel fails silently.
    checkCuda(cudaGetLastError(), "kernel launch");
    checkCuda(cudaDeviceSynchronize(), "kernel execution");

    checkCuda(cudaMemcpy2D(H_C, sizeof(double) * weidth, D_C, pitch,
                           weidth * sizeof(double), height, cudaMemcpyDeviceToHost),
              "copy D_C to host");

    for (int i = 0; i < height; i++) {
        for (int j = 0; j < weidth; j++)
            printf("%lf ", H_C[i * weidth + j]);
        printf("\n");
    }

    free(H_A);
    free(H_B);
    free(H_C);
    checkCuda(cudaFree(D_A), "cudaFree D_A");
    checkCuda(cudaFree(D_B), "cudaFree D_B");
    checkCuda(cudaFree(D_C), "cudaFree D_C");
    return 0;
}
以上是我的源代码:利用 malloc 申请一块连续的存储空间,自己把它当作二维数组使用,读取时也由自己计算下标来定位元素。
但是结果一直都不对,我开始一点点排除:首先排除了内存与设备显存之间传递数据出错的问题。需要注意的是,GPU 内核中不能对 pitched 内存直接用二维下标读取,必须自己用 pitch 计算偏移;另外,要拷贝进 GPU 的主机数组也不能用指针的指针来定义——指针数组里保存的是主机内存地址,显然不是 GPU 内的地址。也就是说,二维数组不能定义成 double **array 的形式(array 指向一个 double *[height] 的指针数组,其中每个元素再指向一行一维数组),否则拷贝到 GPU 时就会拷贝出错。
在排除完数据传递的错误之后,我开始怀疑是不是 GPU 显存分配出了错。
然后我从网上搜到了以下代码
/**************************************************************************
*矩阵相加的例子
***************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cutil_inline.h>
/************************************************************************
* Init CUDA
************************************************************************/
#if __DEVICE_EMULATION__
// Emulation build: no physical device is required.
bool InitCUDA(void) { return true; }
#else
// Selects the first CUDA device with compute capability >= 1.0 and makes it
// current. Returns false (with a message on stderr) when no usable device
// exists. Note: the original printed "/n" -- a literal slash-n -- instead of
// the newline "\n"; fixed throughout.
bool InitCUDA(void)
{
    int count = 0;
    int i = 0;
    cudaGetDeviceCount(&count);
    if (count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;  // found a real (non-emulated) CUDA device
            }
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA.\n");
        return false;
    }
    cudaSetDevice(i);
    printf("CUDA initialized.\n");
    return true;
}
#endif
/*****************************************************************************************************
*kernel函数,矩阵相加
******************************************************************************************************/
// Element-wise addition c = a + b over a height x width float matrix stored
// in pitched device memory (consecutive rows are `pitch` bytes apart).
// Launch with a 2D grid/block where x indexes columns and y indexes rows.
__global__ void myKernel(const float *a, const float *b, float *c, size_t pitch, int height, int width)
{
    int i = blockIdx.y * blockDim.y + threadIdx.y;  // row
    int j = blockIdx.x * blockDim.x + threadIdx.x;  // column
    if (i < height && j < width) {
        // Byte-offset indexing (the form the CUDA docs recommend) is valid
        // for any pitch, without assuming pitch % sizeof(float) == 0.
        const float *rowA = (const float *)((const char *)a + i * pitch);
        const float *rowB = (const float *)((const char *)b + i * pitch);
        float *rowC = (float *)((char *)c + i * pitch);
        rowC[j] = rowA[j] + rowB[j];
    }
}
// Driver: builds two 10x3 float matrices on the host, adds them on the GPU
// through pitched device memory, and prints the result.
int main(int argc, char* argv[])
{
    // if(!InitCUDA())
    //     return 0;

    // Device matrices, allocated with cudaMallocPitch below.
    float *gpu_A;
    float *gpu_B;
    float *gpu_C;
    int width = 3;    // matrix width (columns)
    int height = 10;  // matrix height (rows)
    size_t pitch;     // bytes per device row, >= width * sizeof(float)

    // Host matrices as flat row-major heap buffers: (r, c) -> [r * width + c].
    // The original declared stack VLAs (float cpu_A[height][width]) yet still
    // printed them with flat indices (passing a row pointer to %f) and handed
    // them to free() -- both undefined behavior. Heap buffers fix both.
    float *cpu_A = (float *)malloc(sizeof(float) * width * height);
    float *cpu_B = (float *)malloc(sizeof(float) * width * height);
    float *cpu_C = (float *)malloc(sizeof(float) * width * height);
    if (cpu_A == NULL || cpu_B == NULL || cpu_C == NULL) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }

    // Pitched device allocations; all three share the same row byte-width,
    // so each call reports the same pitch.
    cutilSafeCall( cudaMallocPitch((void**) &gpu_A, &pitch, sizeof(float) * width, height));
    cutilSafeCall( cudaMallocPitch((void**) &gpu_B, &pitch, sizeof(float) * width, height));
    cutilSafeCall( cudaMallocPitch((void**) &gpu_C, &pitch, sizeof(float) * width, height));
    printf("The pitch is: %zu\n", pitch);  // pitch is size_t: %zu, not %d

    // Initialize host data: A[r][c] = r*c, B[r][c] = r+c, C = 0.
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
            cpu_A[r * width + c] = (float)(r * c);
            cpu_B[r * width + c] = (float)(r + c);
            cpu_C[r * width + c] = 0.0f;
        }
    }

    // Print the host matrices (the original used "/n" and "/t" -- literal
    // slashes -- instead of "\n" and "\t").
    printf("\nCPU_A DATA\n");
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c)
            printf("%f\t", cpu_A[r * width + c]);
        printf("\n");
    }
    printf("\nCPU_B DATA\n");
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c)
            printf("%f\t", cpu_B[r * width + c]);
        printf("\n");
    }
    printf("\nCPU_C DATA\n");
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c)
            printf("%f\t", cpu_C[r * width + c]);
        printf("\n");
    }

    // Copy A and B to the device, repacking tight host rows into pitched rows.
    cutilSafeCall( cudaMemcpy2D( gpu_A, pitch, cpu_A, sizeof(float) * width, sizeof(float) * width, height, cudaMemcpyHostToDevice));
    cutilSafeCall( cudaMemcpy2D( gpu_B, pitch, cpu_B, sizeof(float) * width, sizeof(float) * width, height, cudaMemcpyHostToDevice));

    // One thread per element: x spans the columns, y spans the rows.
    dim3 Db(width, 1, 1);
    dim3 Dg(1, height, 1);
    myKernel<<<Dg, Db, 0>>>(gpu_A, gpu_B, gpu_C, pitch, height, width);
    // Launches return no status themselves; pick up configuration errors here.
    cutilSafeCall( cudaGetLastError());

    // Copy the result back; cudaMemcpy2D blocks until the kernel finishes.
    cutilSafeCall( cudaMemcpy2D( cpu_C, sizeof(float) * width, gpu_C, pitch, sizeof(float) * width, height, cudaMemcpyDeviceToHost));

    printf("\nAfter change CPU_C DATA\n");
    for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c)
            printf("%f\t", cpu_C[r * width + c]);
        printf("\n");
    }

    // Release host memory (now legitimately malloc'd) and device memory.
    free(cpu_A);
    free(cpu_B);
    free(cpu_C);
    cutilSafeCall( cudaFree(gpu_A));
    cutilSafeCall( cudaFree(gpu_B));
    cutilSafeCall( cudaFree(gpu_C));

    // CUT_EXIT(argc, argv);
    return 0;
}