// CUDA example: compute the element-wise sum of two arrays on the GPU.
#include "test.cuh"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
// Print a diagnostic for a failed CUDA runtime call and pass the status
// through unchanged so the caller can still react to it.
//   error_code : status returned by a CUDA runtime API call
//   filename   : call-site source file (pass __FILE__)
//   lineNumber : call-site line (pass __LINE__)
// Returns: error_code, whether or not it indicates success.
cudaError_t ErrorCheck(cudaError_t error_code, const char* filename, int lineNumber)
{
    if (error_code != cudaSuccess)
    {
        // was "line%d" — missing '=' made the line number run into the label
        printf("CUDA error:\r\ncode=%d, name=%s, description=%s\r\nfile=%s, line=%d\r\n",
            error_code, cudaGetErrorName(error_code), cudaGetErrorString(error_code), filename, lineNumber);
    }
    return error_code;
}
// Enumerate CUDA devices and select device 0 for all subsequent CUDA work.
// Exits the process (exit(-1)) when no usable GPU is present or device 0
// cannot be selected; on success prints the device count and selection.
void setGpu()
{
    int iDeviceCount = 0;
    cudaError_t error = ErrorCheck(cudaGetDeviceCount(&iDeviceCount), __FILE__, __LINE__);
    if (error != cudaSuccess || iDeviceCount == 0)
    {
        printf("No CUDA compatible GPU found! \n");  // fixed typo: "compatable"
        exit(-1);
    }
    printf("The count of GPUs is %d. \n", iDeviceCount);

    int iDev = 0;  // always use the first device
    error = ErrorCheck(cudaSetDevice(iDev), __FILE__, __LINE__);
    if (error != cudaSuccess)
    {
        printf("fail to set GPU 0 for computing.\n");  // removed stray space before '.'
        exit(-1);
    }
    printf("set GPU 0 for computing.\n");
}
// Fill the first elemCount entries of addr with pseudo-random floats in
// [0.0, 25.5]: the low byte of rand() scaled down by 10. Caller controls
// reproducibility via srand(); elemCount <= 0 leaves the buffer untouched.
void initialData(float* addr, int elemCount)
{
    float* const end = addr + elemCount;
    for (float* p = addr; p < end; ++p)
    {
        *p = static_cast<float>(rand() & 0xFF) / 10.f;
    }
}
// Element-wise vector add: C[i] = A[i] + B[i] for i in [0, N).
// Expects a 1-D grid of 1-D blocks; any launch covering >= N threads is
// safe because out-of-range threads are masked by the bounds guard.
//   A, B : device input arrays of at least N floats
//   C    : device output array of at least N floats
//   N    : number of elements
__global__ void addFromGPU(float *A, float *B, float *C, const int N)
{
    const int bid = blockIdx.x;
    const int tid = threadIdx.x;
    const int id = bid * blockDim.x + tid;
    // Guard the grid tail: without this, a grid launched with more threads
    // than N (e.g. N not a multiple of blockDim.x) reads/writes out of bounds.
    if (id < N)
    {
        C[id] = A[id] + B[id];
    }
}
// Driver: allocate two 512-element float arrays, fill them with seeded
// random data on the host, add them on the GPU and print every element of
// the result. Returns 0 on success; exits the process on allocation failure.
int test() {
    setGpu();

    const int iElemCount = 512;                               // elements per array
    const size_t stBytesCount = iElemCount * sizeof(float);   // bytes per array

    // --- host buffers ---
    float *fpHost_A = (float*)malloc(stBytesCount);
    float *fpHost_B = (float*)malloc(stBytesCount);
    float *fpHost_C = (float*)malloc(stBytesCount);
    if (fpHost_A == NULL || fpHost_B == NULL || fpHost_C == NULL)
    {
        printf("Fail to allocate host memory! \n");
        free(fpHost_A);   // free(NULL) is a no-op, so partial failure is safe
        free(fpHost_B);
        free(fpHost_C);
        exit(-1);
    }
    memset(fpHost_A, 0, stBytesCount);
    memset(fpHost_B, 0, stBytesCount);
    memset(fpHost_C, 0, stBytesCount);

    // --- device buffers ---
    // Check the cudaMalloc return codes: testing the pointers for NULL after
    // an unchecked cudaMalloc (as before) is not a valid failure test, since
    // cudaMalloc may leave the pointer unmodified on error.
    float *fpDevice_A = NULL, *fpDevice_B = NULL, *fpDevice_C = NULL;
    cudaError_t err = ErrorCheck(cudaMalloc((void**)&fpDevice_A, stBytesCount), __FILE__, __LINE__);
    if (err == cudaSuccess) err = ErrorCheck(cudaMalloc((void**)&fpDevice_B, stBytesCount), __FILE__, __LINE__);
    if (err == cudaSuccess) err = ErrorCheck(cudaMalloc((void**)&fpDevice_C, stBytesCount), __FILE__, __LINE__);
    if (err != cudaSuccess)
    {
        printf("fail to allocate memory\n");
        free(fpHost_A);
        free(fpHost_B);
        free(fpHost_C);
        cudaFree(fpDevice_A);   // cudaFree(NULL) is a no-op
        cudaFree(fpDevice_B);
        cudaFree(fpDevice_C);
        exit(-1);
    }
    ErrorCheck(cudaMemset(fpDevice_A, 0, stBytesCount), __FILE__, __LINE__);
    ErrorCheck(cudaMemset(fpDevice_B, 0, stBytesCount), __FILE__, __LINE__);
    ErrorCheck(cudaMemset(fpDevice_C, 0, stBytesCount), __FILE__, __LINE__);

    // --- host-side input data (fixed seed for reproducible runs) ---
    srand(666);
    initialData(fpHost_A, iElemCount);
    initialData(fpHost_B, iElemCount);

    ErrorCheck(cudaMemcpy(fpDevice_A, fpHost_A, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
    ErrorCheck(cudaMemcpy(fpDevice_B, fpHost_B, stBytesCount, cudaMemcpyHostToDevice), __FILE__, __LINE__);
    // No H2D copy of C: the kernel fully overwrites it.

    // --- launch: ceil-div grid so any iElemCount is covered ---
    dim3 block(32);
    dim3 grid((iElemCount + block.x - 1) / block.x);
    addFromGPU<<<grid, block>>>(fpDevice_A, fpDevice_B, fpDevice_C, iElemCount);
    ErrorCheck(cudaGetLastError(), __FILE__, __LINE__);   // surface launch-config errors

    // Blocking D2H copy also synchronizes with the kernel before we read C.
    ErrorCheck(cudaMemcpy(fpHost_C, fpDevice_C, stBytesCount, cudaMemcpyDeviceToHost), __FILE__, __LINE__);

    for (int i = 0; i < iElemCount; i++)   // was a hard-coded 512
    {
        printf("idx=%2d\tmatrix_A:%.2f\tmatrix_B:%.2f\tresult=%.2f\n", i + 1, fpHost_A[i], fpHost_B[i], fpHost_C[i]);
    }

    free(fpHost_A);
    free(fpHost_B);
    free(fpHost_C);
    cudaFree(fpDevice_A);
    cudaFree(fpDevice_B);
    cudaFree(fpDevice_C);
    ErrorCheck(cudaDeviceReset(), __FILE__, __LINE__);
    return 0;
}