为什么用了CUDA的程序还没有不用CUDA的程序快？

最新推荐文章于 2024-05-21 20:12:30 发布

chenxiang11011

最新推荐文章于 2024-05-21 20:12:30 发布

阅读量1k

点赞数 1

文章标签： cuda float list fp stream file

本文链接：https://blog.csdn.net/chenxiang11011/article/details/6615490

版权

最近学长要偶学CUDA，说这个有前途，偶就学了，到现在学了有四五天吧，这两天用CUDA写了一个矩阵同向量相乘，也就是重复做点积的程序，并且与没有用CUDA的程序的运行结果进行对照，发现没用CUDA的反而跑得更快！！！

以下是代码，这个是用了CUDA的

#include<afxwin.h>
#include "../common/book.h"
#include <math.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>
#include <cstring>

const int N = 8192;
const int threadsPerBlock = 256;
const int blocksPerGrid = (N+threadsPerBlock-1) / threadsPerBlock ;

__global__ void dot( float *a, float *b, float *c ) {
__shared__ float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;

float   temp = 0;
while (tid < N) {
  temp += a[tid] * b[tid];
  tid += blockDim.x * gridDim.x;
}

// set the cache values
cache[cacheIndex] = temp;

// synchronize threads in this block
__syncthreads();

// for reductions, threadsPerBlock must be a power of 2
// because of the following code
int i = blockDim.x/2;
while (i != 0) {
  if (cacheIndex < i)
   cache[cacheIndex] += cache[cacheIndex + i];
  __syncthreads();
  i /= 2;
}

if (cacheIndex == 0)
c[blockIdx.x] = cache[0];
}

int main( void ) {

clock_t startq, finish;
startq = clock();
double duration;

cudaEvent_t start, stop;
HANDLE_ERROR( cudaEventCreate( &start ) );
HANDLE_ERROR( cudaEventCreate( &stop ) );
HANDLE_ERROR( cudaEventRecord( start, 0 ) );

float *a,*b,c, *partial_c;
float *dev_a, *dev_b, *dev_partial_c;

//allocate memory on the cpu side
a = (float*)malloc( N*sizeof(float) );
b = (float*)malloc( N*sizeof(float) );
partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );

// allocate the memory on the GPU
HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
  N*sizeof(float) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
  N*sizeof(float) ) );
HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
  blocksPerGrid*sizeof(float) ) );

// fill in the host memory with data
int numread;
int numwritten;
FILE *fp1, *fp2;
FILE *stream;
stream=fopen("e:\\test.txt","w");
char list1[15000];
char list2[15000];
//char list[15000];
CString s;
CString temp;
//float results[15000];
if( fopen_s( &fp1, "e:\\data", "r+t" ) == 0 )
{numread = fread( list1, sizeof( char ), 8192, fp1 );
printf( "Number of items read data = %d\n", numread );
//printf("Contents of buffer = %.100s\n",list1);
fclose(fp1);
}
else printf( "File data could not be opened\n" );

for(int i=0;i<N;i++)
{a[i]=(FLOAT)(list1[i]-48);}

for(int i=0; i<N;i++)
{

if( fopen_s( &fp2, "e:\\data8192mul8192", "r+t" ) == 0 )
{
  fseek(fp2,i*sizeof(char)*8192L,0);
  numread = fread( list2, sizeof( char ), 8192, fp2 );
     printf( "Number of items read datab= %d\n", numread );
     fclose(fp2);
}
else printf( "File data could not be opened\n" );

//float f=(FLOAT)(list1[0]-48);
//a[0]=(float)(list1[0]-48);
//printf("a= %f\n",a[0]);
//char cd[1];
//cd[0]=list1[0];
//float f=atof(&cd[0]);

//b[0]=(float)list2[0];
  //printf(list1[i]);
  //a[i]=(FLOAT)(list1[i]-48);
  //printf("a= %f\n",a[i]);
for(int i=0;i<N;i++)
{b[i]=(FLOAT)(list2[i]-48);}

// copy the arrays 'a' and 'b' to the GPU
HANDLE_ERROR( cudaMemcpy( dev_a, a, N*sizeof(float),
  cudaMemcpyHostToDevice ) );
HANDLE_ERROR( cudaMemcpy( dev_b, b, N*sizeof(float),
  cudaMemcpyHostToDevice ) );

dot<<<blocksPerGrid,threadsPerBlock>>>( dev_a, dev_b,
dev_partial_c );

// copy the array 'c' back from the GPU to the CPU
HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
blocksPerGrid*sizeof(float),
cudaMemcpyDeviceToHost ) );

// finish up on the CPU side
c = 0;
for (int i=0; i<blocksPerGrid; i++) {
c += partial_c[i];
}

printf("c= %f\n",c);
//results[i]=c;
temp.Format("%f\n",c);
s+=temp;

//fprintf(stream,"%5d",c);

};

LPCTSTR results =s;

//FILE *stream;
if ( fopen_s( &stream, "e:\\results.txt", "w+t" ) == 0 )
{
  numwritten = fwrite( results, sizeof( char ), 8192, stream );
     printf( "Wrote %d items\n", numwritten );
     fclose( stream );
}
else
  printf( "Problem opening the file\n" );

HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( stop ) );
float elapsedTime;
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
start, stop ) );
printf( "Time to generate: %3.1f ms\n", elapsedTime );

HANDLE_ERROR( cudaEventDestroy( start ) );
HANDLE_ERROR( cudaEventDestroy( stop ) );

finish = clock();
duration = (double)(finish - startq) / CLOCKS_PER_SEC;
printf( "%f seconds\n", duration );

// free memory on the gpu side
HANDLE_ERROR( cudaFree( dev_a ) );
HANDLE_ERROR( cudaFree( dev_b ) );
HANDLE_ERROR( cudaFree( dev_partial_c ) );

// free memory on the cpu side
free( a );
free( b );
free( partial_c );

}

这个程序运行起来要十秒钟左右

以下

是没有用CUDA的，运行起来只要四秒，比用了CUDA的要短不少啊

#include<afxwin.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <time.h>
#include <../common/book.h>

const int N = 8192;

float dot( float *a, float *b)
{
float c=0 ;
int j ;
for ( j=0;j<N;j++)
{c += a[j]*b[j];}
return c;
}

int main( void ) {

clock_t start,finish;
start=clock();
double duration;
float   *a,*b,c=0;
//allocate memory on the cpu side
a = (float*)malloc( N*sizeof(float) );
b = (float*)malloc( N*sizeof(float) );
// fill in the host memory with data
int numread;
int numwritten;
FILE *fp1, *fp2;
FILE *stream;
stream=fopen("e:\\test.txt","w");
char list1[15000];
char list2[15000];
CString s;
CString temp;
    {
        if( fopen_s( &fp1, "e:\\data", "r+t" ) == 0 )
            {
       numread = fread( list1, sizeof( char ), 8192, fp1 );
       numread=0;
                //printf( "Number of items read data = %d\n", numread );
                fclose(fp1);
            }
        else printf( "File data could not be opened\n" );
    }

for(int i=0;i<N;i++)
     {
    a[i]=(FLOAT)(list1[i]-48);
     }

for(int i=0; i<N;i++)
   {
  {
   if( fopen_s( &fp2, "e:\\data8192mul8192", "r+t" ) == 0 )
       {
        fseek(fp2,i*sizeof(char)*8192L,0);
        numread = fread( list2, sizeof( char ), 8192, fp2 );
     numread =0;
        //printf( "Number of items read datab= %d\n", numread );
        fclose(fp2);
       }
      else printf( "File data could not be opened\n" );
  }

for(int i=0;i<N;i++)
{b[i]=(FLOAT)(list2[i]-48);}

  c=dot(a,b );

     //printf("c= %f\n",c);
     temp.Format("%f\n",c);
     s+=temp;
    };
     LPCTSTR results =s;
        {

         if ( fopen_s( &stream, "e:\\resultscpuversion.txt", "w+t" ) == 0 )
          {
            numwritten = fwrite( results, sizeof( char ), 8192, stream );
      numwritten = 0;
            //printf( "Wrote %d items\n", numwritten );
            fclose( stream );
       }
         else   printf( "Problem opening the file\n" );
    }

    finish = clock();
    duration = (double)(finish - start) / CLOCKS_PER_SEC;
    printf( "%2.1f\n", duration );
// free memory on the cpu side

free( a );
free( b );
}

这个没用CUDA的程序只用四秒，比没用CUDA的要短6秒，为什么有显卡加速的还没有cpu快呢？大家帮忙分析下。我上面所读入的数据都是连续存储的char型的1，

唉，这个希望一定CUDA要快些啊！

chenxiang11011

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
为什么用了CUDA的程序还没有不用CUDA的程序快？

最近学长要偶学CUDA，说这个有前途，偶就学了，到现在学了有四五天吧，这两天用CUDA写了一个矩阵同向量相乘，也就是重复做点积的程序，并且与没有用CUDA的程序的运行结果进行对照，发现没用CUDA的反而跑得更快！！！以下是代码，这个是用了CUDA的
复制链接

扫一扫