#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <cuda_runtime.h>
/*
 * MIN(a, b): evaluates to the smaller of a and b (b when they compare equal).
 *
 * The whole expansion is wrapped in parentheses so that the conditional
 * expression cannot be torn apart by operators in the surrounding expression
 * (e.g. `2 * MIN(x, y)` or `MIN(x, y) + 1`).
 *
 * Note: this is a macro, so each argument may be evaluated twice; do not pass
 * expressions with side effects such as `i++`.
 */
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
/* Host-side buffers: main() allocates these with malloc and releases them with free. */
float *h_A, *h_B, *h_C;

/* Device-side buffers: main() allocates these with cudaMalloc and releases them with cudaFree. */
float *d_A, *d_B, *d_C;
/*
 * Element-wise product of two vectors: C[i] = A[i] * B[i] for 0 <= i < N.
 *
 * Launch layout: 1-D grid of 1-D blocks (only the .x components of blockIdx,
 * blockDim, threadIdx and gridDim are read). No shared memory is used.
 *
 * Each thread starts at its global thread index and advances by the total
 * number of launched threads (a grid-stride loop), so every element is covered
 * for any grid size, including a grid smaller than N / blockDim.x.
 *
 * Parameters:
 *   A, B  input vectors, each holding at least N floats
 *   C     output vector, holding at least N floats
 *   N     number of elements to process; nothing is written when N <= 0
 */
__global__ void DotMulVet(const float* A,const float* B,float* C,int N)
{
    // Index math is done in 64 bits: with 32-bit ints, `index += stride` can
    // overflow past INT_MAX when N is close to INT_MAX, and
    // blockIdx.x * blockDim.x can wrap for very large grids.
    const long long stride = (long long)gridDim.x * blockDim.x;
    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < N;
         idx += stride)
    {
        C[idx] = A[idx] * B[idx];
    }
}
/* Abort with a diagnostic naming the failed step when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char* what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/* Return a pseudo-random float in the closed range [0, 1]. */
static float randomUnitFloat(void)
{
    // Convert before dividing: rand() / RAND_MAX in integer arithmetic is 0
    // for every value except RAND_MAX itself.
    return (float)rand() / (float)RAND_MAX;
}

/*
 * Driver: fills two host vectors with random values, multiplies them
 * element-wise on the GPU with DotMulVet, times the kernel with CUDA events,
 * and verifies the result against a CPU computation.
 *
 * Returns 0 on normal completion (whether or not the verification passed, as
 * reported on stdout); exits with a failure status if a host allocation or any
 * CUDA runtime call fails.
 */
int main()
{
    const int N = 1024;                                 // elements per vector
    const size_t mem_size = sizeof(float) * (size_t)N;  // bytes per vector
    cudaEvent_t start, finish;
    float costTime = 0.0f;
    int i;

    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&finish), "cudaEventCreate(finish)");

    printf("Start to malloc host memory...\n");
    h_A = (float*)malloc(mem_size);
    h_B = (float*)malloc(mem_size);
    h_C = (float*)malloc(mem_size);
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host memory\n");
        free(h_A);  // free(NULL) is a no-op, so partial success is handled
        free(h_B);
        free(h_C);
        return EXIT_FAILURE;
    }
    for (i = 0; i < N; i++)
    {
        h_A[i] = randomUnitFloat();
        h_B[i] = randomUnitFloat();
    }

    printf("Start to malloc device memory...\n");
    checkCuda(cudaMalloc((void**)&d_A, mem_size), "cudaMalloc(d_A)");
    checkCuda(cudaMalloc((void**)&d_B, mem_size), "cudaMalloc(d_B)");
    checkCuda(cudaMalloc((void**)&d_C, mem_size), "cudaMalloc(d_C)");

    printf("Start to copy host memory data to device memory...\n");
    checkCuda(cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice), "copy h_A -> d_A");
    checkCuda(cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice), "copy h_B -> d_B");

    // The grid is capped at 16 blocks; the kernel's loop steps by the total
    // thread count, so elements beyond the first pass are still processed.
    const int threadPerBlock = 256;
    const int blockPerGrid = MIN(16, (N + threadPerBlock - 1) / threadPerBlock);

    printf("Start to compute with GPU\n");
    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    DotMulVet<<< blockPerGrid , threadPerBlock>>>(d_A, d_B, d_C, N);
    // A kernel launch returns nothing; configuration errors are reported here.
    checkCuda(cudaGetLastError(), "DotMulVet launch");
    checkCuda(cudaEventRecord(finish, 0), "cudaEventRecord(finish)");
    // Waiting on the event also surfaces errors raised while the kernel ran.
    checkCuda(cudaEventSynchronize(finish), "cudaEventSynchronize(finish)");
    // cudaEventElapsedTime reports the interval in milliseconds.
    checkCuda(cudaEventElapsedTime(&costTime, start, finish), "cudaEventElapsedTime");

    // Copy the result from device to host.
    checkCuda(cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost), "copy d_C -> h_C");

    printf("Check result with CPU...\n");
    int errorCounts = 0;
    for (i = 0; i < N; i++)
    {
        const float ans = h_A[i] * h_B[i];
        if (fabsf(ans - h_C[i]) > 1e-6f)
        {
            errorCounts++;
        }
    }
    printf("Result: %s, errorCounts: %d\n",(0 == errorCounts) ? "Correct":"Wrong", errorCounts);
    printf("Cost Time : %f\n",costTime);

    free(h_A);
    free(h_B);
    free(h_C);
    checkCuda(cudaFree(d_A), "cudaFree(d_A)");
    checkCuda(cudaFree(d_B), "cudaFree(d_B)");
    checkCuda(cudaFree(d_C), "cudaFree(d_C)");
    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(finish), "cudaEventDestroy(finish)");
    return 0;
}
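// CUDA programming introductory example 1: element-wise multiplication of two vectors.
// (Web-page footer captured with the code: "latest recommended article published 2023-01-02 20:11:10".)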