为什么用了CUDA的程序还没有不用CUDA的程序快?

        最近学长要偶学CUDA,说这个有前途,偶就学了,到现在学了有四五天吧,这两天用CUDA写了一个矩阵同向量相乘,也就是重复做点积的程序,并且与没有用CUDA的程序的运行结果进行对照,发现没用CUDA的反而跑得更快!!!

        以下是代码,这个是用了CUDA的       

#include<afxwin.h>
#include "../common/book.h"
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cstring>

const int N = 8192;
const int threadsPerBlock = 256;
const int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock ;


__global__ void dot( float *a, float *b, float *c ) {
 __shared__ float cache[threadsPerBlock];
 int tid = threadIdx.x + blockIdx.x * blockDim.x;
 int cacheIndex = threadIdx.x;

 float   temp = 0;
 while (tid < N) {
  temp += a[tid] * b[tid];
  tid += blockDim.x * gridDim.x;
 }

 // set the cache values
 cache[cacheIndex] = temp;

 // synchronize threads in this block
 __syncthreads();

 // for reductions, threadsPerBlock must be a power of 2
 // because of the following code
 int i = blockDim.x/2;
 while (i != 0) {
  if (cacheIndex < i)
   cache[cacheIndex] += cache[cacheIndex + i];
  __syncthreads();
  i /= 2;
 }

 if (cacheIndex == 0)
  c[blockIdx.x] = cache[0];
}


int main( void ) {

 clock_t startq, finish;
 startq = clock(); 
 double duration;

 cudaEvent_t     start, stop;
 HANDLE_ERROR( cudaEventCreate( &start ) );
 HANDLE_ERROR( cudaEventCreate( &stop ) );
 HANDLE_ERROR( cudaEventRecord( start, 0 ) );

 
 float   *a,*b,c, *partial_c;
 float   *dev_a, *dev_b, *dev_partial_c;

 //allocate memory on the cpu side
 a = (float*)malloc( N*sizeof(float) );
 b = (float*)malloc( N*sizeof(float) );
 partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

 // allocate the memory on the GPU
 HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
  N*sizeof(float) ) );
 HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
  N*sizeof(float) ) );
 HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
  blocksPerGrid*sizeof(float) ) );

 // fill in the host memory with data
 int numread;
 int numwritten;
 FILE *fp1, *fp2;
 FILE *stream;
 stream=fopen("e:\\test.txt","w");
 char list1[15000];
 char list2[15000];
 //char list[15000];
 CString s;
 CString temp;
 //float results[15000];
 if( fopen_s( &fp1, "e:\\data", "r+t" ) == 0 )
  {numread = fread( list1, sizeof( char ), 8192, fp1 );
 printf( "Number of items read data = %d\n", numread );
 //printf("Contents of buffer = %.100s\n",list1);
 fclose(fp1);
 }
 else printf( "File data could not be opened\n" );

 for(int i=0;i<N;i++)
 {a[i]=(FLOAT)(list1[i]-48);}

 for(int i=0; i<N;i++)
 {

 
 if( fopen_s( &fp2, "e:\\data8192mul8192", "r+t" ) == 0 )
 {  
  fseek(fp2,i*sizeof(char)*8192L,0);
  numread = fread( list2, sizeof( char ), 8192, fp2 );
     printf( "Number of items read datab= %d\n", numread );
     fclose(fp2);
 }
 else printf( "File data could not be opened\n" );

 //float f=(FLOAT)(list1[0]-48);
 //a[0]=(float)(list1[0]-48);
 //printf("a= %f\n",a[0]);
 //char cd[1];
 //cd[0]=list1[0];
 //float f=atof(&cd[0]);

 //b[0]=(float)list2[0];
  //printf(list1[i]);
  //a[i]=(FLOAT)(list1[i]-48);
  //printf("a= %f\n",a[i]);
 for(int i=0;i<N;i++)
 {b[i]=(FLOAT)(list2[i]-48);}
 
 
 // copy the arrays 'a' and 'b' to the GPU
 HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
  cudaMemcpyHostToDevice ) );
 HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
  cudaMemcpyHostToDevice ) );

 dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
  dev_partial_c );

 // copy the array 'c' back from the GPU to the CPU
 HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
  blocksPerGrid*sizeof(float),
  cudaMemcpyDeviceToHost ) );

 // finish up on the CPU side
 c = 0;
 for (int i=0; i<blocksPerGrid; i++) {
  c += partial_c[i];
 }

 printf("c= %f\n",c);
 //results[i]=c;
 temp.Format("%f\n",c);
  s+=temp;

 //fprintf(stream,"%5d",c);

 };

 LPCTSTR results =s;

 //FILE *stream;
 if ( fopen_s( &stream, "e:\\results.txt", "w+t" ) == 0 )
 {
  numwritten = fwrite( results, sizeof( char ), 8192, stream );
     printf( "Wrote %d items\n", numwritten );
     fclose( stream );
 }
 else
  printf( "Problem opening the file\n" );

 

 HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
 HANDLE_ERROR( cudaEventSynchronize( stop ) );
 float   elapsedTime;
 HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
  start, stop ) );
 printf( "Time to generate:  %3.1f ms\n", elapsedTime );

 HANDLE_ERROR( cudaEventDestroy( start ) );
 HANDLE_ERROR( cudaEventDestroy( stop ) );


 finish = clock(); 
    duration = (double)(finish - startq) / CLOCKS_PER_SEC;
 printf( "%f seconds\n", duration ); 


 // free memory on the gpu side
 HANDLE_ERROR( cudaFree( dev_a ) );
 HANDLE_ERROR( cudaFree( dev_b ) );
 HANDLE_ERROR( cudaFree( dev_partial_c ) );

 // free memory on the cpu side
 free( a );
 free( b );
 free( partial_c );

}

这个程序运行起来要十秒钟左右

以下

 是没有用CUDA的,运行起来只要四秒,比用了CUDA的要短不少啊

#include<afxwin.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>
#include <../common/book.h>

const int N = 8192;

float dot( float *a, float *b)
{
    float c=0 ;
 int j ;
 for ( j=0;j<N;j++)
 {c += a[j]*b[j];}
 return c;
}


int main( void ) {

 clock_t start,finish;
 start=clock(); 
 double duration;
 float   *a,*b,c=0;
 //allocate memory on the cpu side
 a = (float*)malloc( N*sizeof(float) );
 b = (float*)malloc( N*sizeof(float) );
 // fill in the host memory with data
 int numread;
 int numwritten;
 FILE *fp1, *fp2;
 FILE *stream;
 stream=fopen("e:\\test.txt","w");
 char list1[15000];
 char list2[15000];
 CString s;
 CString temp;
    {
        if( fopen_s( &fp1, "e:\\data", "r+t" ) == 0 )
            {
       numread = fread( list1, sizeof( char ), 8192, fp1 );
       numread=0;
                //printf( "Number of items read data = %d\n", numread );
                fclose(fp1);
            }
        else printf( "File data could not be opened\n" );
    }

 for(int i=0;i<N;i++)
     {
    a[i]=(FLOAT)(list1[i]-48);
     }

 for(int i=0; i<N;i++)
   {
  {
   if( fopen_s( &fp2, "e:\\data8192mul8192", "r+t" ) == 0 )
       {  
        fseek(fp2,i*sizeof(char)*8192L,0);
        numread = fread( list2, sizeof( char ), 8192, fp2 );
     numread =0;
        //printf( "Number of items read datab= %d\n", numread );
        fclose(fp2);
       }
      else printf( "File data could not be opened\n" );
  }

  for(int i=0;i<N;i++)
  {b[i]=(FLOAT)(list2[i]-48);}

  c=dot(a,b );
  
     //printf("c= %f\n",c);
     temp.Format("%f\n",c);
     s+=temp;
    };
     LPCTSTR results =s;
        {
  
         if ( fopen_s( &stream, "e:\\resultscpuversion.txt", "w+t" ) == 0 )
          {
            numwritten = fwrite( results, sizeof( char ), 8192, stream );
      numwritten = 0;
            //printf( "Wrote %d items\n", numwritten );
            fclose( stream );
       }
         else   printf( "Problem opening the file\n" );
    }

    finish = clock(); 
    duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "%2.1f\n", duration );
// free memory on the cpu side

free( a );
free( b );
}

这个没用CUDA的程序只用四秒,比没用CUDA的要短6秒,为什么有显卡加速的还没有cpu快呢?大家帮忙分析下。我上面所读入的数据都是连续存储的char型的1,

唉,这个希望一定CUDA要快些啊!

 

 

 

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值