CUDA 编程入门示例 1——两个向量对应元素相乘

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h> 


// Evaluates to the smaller of a and b.
// The whole expansion is parenthesized so the macro composes correctly inside
// larger expressions (e.g. 2 * MIN(x, y) or MIN(x, y) + 1).
// Note: each argument may be evaluated twice; do not pass expressions with
// side effects such as i++.
#define MIN(a, b) (((a) < (b)) ? (a) : (b))


// Host-side buffers: the two input vectors and the result vector.
float* h_A;
float* h_B;
float* h_C;

// Device-side counterparts of the buffers above.
float* d_A;
float* d_B;
float* d_C;


// Elementwise product of two vectors: C[i] = A[i] * B[i] for every i in [0, N).
//
// A, B : input vectors of at least N floats, read only.
// C    : output vector of at least N floats.
// N    : number of elements to process.
//
// Each thread starts at its flat global index and advances by the total number
// of launched threads (gridDim.x * blockDim.x), so all N elements are covered
// even when the launch has fewer threads than N. Only the x dimension of the
// grid and block is used for indexing.
__global__ void DotMulVet(const float* A,const float* B,float* C,int N)
{
    // Total number of threads in the launch; the step between the elements
    // handled by a single thread.
    const int stride = gridDim.x * blockDim.x;

    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride)
    {
        C[i] = A[i] * B[i];
    }
}


// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, error string) if it does not return cudaSuccess.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t err_ = (call);                                          \
        if (err_ != cudaSuccess) {                                          \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,          \
                    __LINE__, cudaGetErrorString(err_));                    \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

// Demo driver: fills two host vectors with random values in [0, 1], multiplies
// them elementwise on the GPU with DotMulVet, times the kernel with CUDA
// events, and compares the GPU result against a CPU reference.
// Returns 0 after printing the check result; exits with EXIT_FAILURE if a host
// allocation or any CUDA call fails.
int main()
{
    const int N = 1024;
    // size_t avoids overflowing the byte count if N is ever increased.
    const size_t mem_size = sizeof(float) * N;
    int i;
    cudaEvent_t start, finish;
    float costTime = 0.0f;

    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&finish));

    printf("Start to malloc host memory...\n");
    h_A = (float*)malloc(mem_size);
    h_B = (float*)malloc(mem_size);
    h_C = (float*)malloc(mem_size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        free(h_A);
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }

    for (i = 0; i < N; i++)
    {
        // Convert to float BEFORE dividing: rand()/RAND_MAX is an integer
        // division that yields 0 for nearly every draw, which would make the
        // correctness check below pass trivially.
        h_A[i] = (float)rand() / (float)RAND_MAX;
        h_B[i] = (float)rand() / (float)RAND_MAX;
    }

    printf("Start to malloc device memory...\n");
    CUDA_CHECK(cudaMalloc((void**)&d_A, mem_size));
    CUDA_CHECK(cudaMalloc((void**)&d_B, mem_size));
    CUDA_CHECK(cudaMalloc((void**)&d_C, mem_size));

    printf("Start to copy host memory data to device memory...\n");
    CUDA_CHECK(cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice));

    // Launch at most 16 blocks; the kernel's loop covers any remaining
    // elements when N exceeds the number of launched threads.
    const int threadPerBlock = 256;
    const int blockPerGrid = MIN(16, (N + threadPerBlock - 1) / threadPerBlock);

    printf("Start to compute with GPU\n");
    CUDA_CHECK(cudaEventRecord(start, 0));
    DotMulVet<<<blockPerGrid, threadPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns no status; launch-configuration errors must be
    // fetched explicitly.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventRecord(finish, 0));
    // Blocks until the kernel has finished; execution errors surface here.
    CUDA_CHECK(cudaEventSynchronize(finish));
    CUDA_CHECK(cudaEventElapsedTime(&costTime, start, finish));

    // Copy result from device to host.
    CUDA_CHECK(cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost));

    printf("Check result with CPU...\n");
    int errorCounts = 0;
    for (i = 0; i < N; i++)
    {
        const float ans = h_A[i] * h_B[i];
        if (fabs(ans - h_C[i]) > 1E-6)
        {
            errorCounts++;
        }
    }

    printf("Result: %s, errorCounts: %d\n", (0 == errorCounts) ? "Correct" : "Wrong", errorCounts);
    // cudaEventElapsedTime reports milliseconds.
    printf("Cost Time : %f ms\n", costTime);

    free(h_A);
    free(h_B);
    free(h_C);

    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));

    // Release the timing events created at the top of main.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(finish));

    return 0;
}

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值