CPU实现
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include<time.h>
/* Element-wise vector addition on the host: C[i] = A[i] + B[i] for i in [0, N).
 * Also echoes each (sum, addend, addend) triple to stdout as it goes. */
void sumArraysOnHost(float *A, float *B, float *C, const int N){
    int i = 0;
    while (i < N) {
        float sum = A[i] + B[i];
        C[i] = sum;
        printf("%f \t %f \t %f \n", C[i], A[i], B[i]);
        ++i;
    }
}
/* Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
 *
 * Bug fix: the original called srand(time(...)) on EVERY invocation.
 * Two back-to-back calls within the same second therefore reused the
 * same seed and produced identical arrays (h_A == h_B in main).
 * Seed the PRNG exactly once per process instead. */
void initialData(float *ip, int size){
    static int seeded = 0;   /* first-call flag: seed only once */
    if (!seeded) {
        srand((unsigned int) time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < size; i++) {
        /* low byte of rand() -> 0..255, scaled down to 0.0..25.5 */
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
/* Driver: allocate three host vectors, fill A and B with random data,
 * compute C = A + B on the CPU, then release the buffers.
 * Returns 0 on success, 1 if any host allocation fails.
 *
 * Bug fix: the original dereferenced the malloc results without
 * checking for NULL. */
int main(int argc, char **argv){
    int nElem = 1024;                       /* number of float elements */
    size_t nBytes = nElem * sizeof(float);

    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *h_C = (float *)malloc(nBytes);
    if (h_A == NULL || h_B == NULL || h_C == NULL) {
        fprintf(stderr, "host memory allocation failed\n");
        free(h_A);   /* free(NULL) is a no-op, so this is safe */
        free(h_B);
        free(h_C);
        return 1;
    }

    initialData(h_A, nElem);
    initialData(h_B, nElem);
    sumArraysOnHost(h_A, h_B, h_C, nElem);

    free(h_A);
    free(h_B);
    free(h_C);
    return 0;
}
编译
nvcc -Xcompiler -std=c99 sumArraysOnHost.c -o sum
-Xcompiler
用于指定命令行选项是指向C编译器还是预处理器
std=c99
C程序按照C99标准编写的
执行
./sum
GPU实现
#include <cuda_runtime.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/* Check the cudaError_t returned by a CUDA runtime call; on failure,
 * print the failing file/line plus the CUDA error string and exit.
 *
 * Fix: wrapped in do { } while (0) so the macro behaves as a single
 * statement — the original bare { } block breaks when used as
 * `if (cond) CHECK(call); else ...` (dangling semicolon). */
#define CHECK(call)                                                         \
do {                                                                        \
    const cudaError_t error = (call);                                       \
    if (error != cudaSuccess) {                                             \
        printf("Error: %s:%d, ", __FILE__, __LINE__);                       \
        printf("code:%d, reason:%s\n", error, cudaGetErrorString(error));   \
        exit(1);                                                            \
    }                                                                       \
} while (0)
/* Compare the host-computed and GPU-computed result arrays element-wise.
 * Prints a diagnostic for the first element whose absolute difference
 * exceeds epsilon, otherwise prints "Arrays match.".
 *
 * Bug fix: the original used integer abs(), which truncates the double
 * difference toward zero — any mismatch smaller than 1.0 was silently
 * accepted. Use floating-point fabs() instead. */
void checkResult(float *hostRef, float *gpuRef, const int N){
    double epsilon = 1.0E-8;
    bool match = 1;
    for (int i = 0; i < N; i++) {
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon) {
            match = 0;
            printf("Arrays do not match:\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
}
/* Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
 *
 * Bug fix: the original reseeded the PRNG with time() on every call,
 * so main's two calls (same second) produced h_A identical to h_B and
 * the "result check" degenerated to comparing 2*A with 2*A.
 * Seed exactly once per process instead. */
void initialData(float *ip, int size){
    static int seeded = 0;   /* seed the PRNG only on the first call */
    if (!seeded) {
        srand((unsigned) time(NULL));
        seeded = 1;
    }
    for (int i = 0; i < size; i++) {
        /* rand() & 0xFF -> 0..255, scaled to 0.0..25.5 */
        ip[i] = (float)(rand() & 0xFF) / 10.0f;
    }
}
/* Reference CPU implementation of the vector add:
 * C[i] = A[i] + B[i] for every i in [0, N). */
void sumArraysOnHost(float *A, float *B, float *C, const int N){
    const float *pa = A;
    const float *pb = B;
    float *pc = C;
    const float *end = C + N;
    while (pc < end) {
        *pc++ = *pa++ + *pb++;
    }
}
/* Vector-add kernel: one thread per element, C[i] = A[i] + B[i].
 * Expects a 1-D grid whose total thread count equals the array length
 * exactly (there is no N parameter to bounds-check against, so the
 * launch configuration in main must cover the data with no tail).
 *
 * Bug fix: the original computed i from threadIdx.x alone, so with any
 * grid of more than one block every block rewrote elements
 * 0..blockDim.x-1 instead of its own slice. Use the global index. */
__global__ void sumArraysOnGPU(float *A, float *B, float *C){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    C[i] = A[i] + B[i];
}
/* Host driver: allocate host and device buffers, run the vector add on
 * the GPU with one thread per element, validate against a CPU reference,
 * then release all resources.
 *
 * Fix: the CHECK macro defined above was never used — every CUDA runtime
 * call is now error-checked, and kernel launch-configuration errors are
 * surfaced with cudaGetLastError() (launches themselves return nothing). */
int main(int argc, char **argv){
    printf("%s Starting... \n", argv[0]);

    /* set up device */
    int dev = 0;
    CHECK(cudaSetDevice(dev));

    /* set up data size of vectors */
    int nElem = 32;
    printf("Vector size %d\n", nElem);

    /* malloc host memory */
    size_t nBytes = nElem * sizeof(float);
    float *h_A = (float *)malloc(nBytes);
    float *h_B = (float *)malloc(nBytes);
    float *hostRef = (float *)malloc(nBytes);
    float *gpuRef = (float *)malloc(nBytes);

    /* initialize data at host side */
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    /* malloc device global memory */
    float *d_A, *d_B, *d_C;
    CHECK(cudaMalloc((float **)&d_A, nBytes));
    CHECK(cudaMalloc((float **)&d_B, nBytes));
    CHECK(cudaMalloc((float **)&d_C, nBytes));

    /* transfer input vectors from host to device */
    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    /* invoke kernel at host side: one block of nElem threads */
    dim3 block(nElem);
    dim3 grid(nElem / block.x);
    sumArraysOnGPU<<<grid, block>>>(d_A, d_B, d_C);
    CHECK(cudaGetLastError());   /* catch bad launch configurations */
    printf("Execution configuration <<<%d, %d>>>\n", grid.x, block.x);

    /* copy kernel result back to host side
     * (blocking cudaMemcpy also synchronizes with the kernel) */
    CHECK(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    /* add vector at host side for result checks */
    sumArraysOnHost(h_A, h_B, hostRef, nElem);

    /* check device results */
    checkResult(hostRef, gpuRef, nElem);

    /* free device global memory */
    CHECK(cudaFree(d_A));
    CHECK(cudaFree(d_B));
    CHECK(cudaFree(d_C));

    /* free host memory */
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);
    return 0;
}
编译
nvcc sumArraysOnGPU.cu -o addvector
执行
./addvector
代码详解
内存管理
cudaMalloc(void** devPtr, size_t size)
- 用于执行GPU内存分配
- 该函数负责向设备分配一定字节的线性内存,并以devPtr的形式返回指向所分配内存的指针
cudaMemcpy(void* dst, const void* src, size_t count, cudaMemcpyKind kind)
- 函数负责主机和设备之间的数据传输
- 函数从src指向的源存储区复制一定数量的字节到dst指向的目标存储区,复制方向由kind指定
- 这个函数以同步方式执行,函数返回以及传输操作完成之前主机应用程序是阻塞的.
- 除内核启动外,每个CUDA调用都会返回一个错误枚举类型
cudaError_t
- kind种类:
  - cudaMemcpyHostToHost
  - cudaMemcpyHostToDevice
  - cudaMemcpyDeviceToHost
  - cudaMemcpyDeviceToDevice