向量相加其三(CUDA+C语言实现)

向量相加其三(CUDA+C语言实现)

测试机配置

硬件信息
CPU
厂家:Intel
型号:Intel®Pentium® CPU 5405U
核数:4
频率:2.3GHz
指令集:不支持AVX2/AVX512

GPU1
厂家:Intel
型号:Intel®UHD Graphics 610
显存容量:120MB
显存频率:300MHz
显存带宽:17.1GB/s

GPU2
厂家:NVIDIA
型号:NVIDIA GeForce MX250
显存容量:2GB
显存频率:1519MHz
显存带宽:48.1GB/s

CUDA C源代码

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <omp.h>
#include <immintrin.h>

// Element-wise vector addition: d_c[i] = d_a[i] + d_b[i] for i in [0, n).
// Expects a 1D grid/1D blocks with gridDim.x * blockDim.x >= n; the guard
// below handles the ragged tail when n is not a multiple of blockDim.x.
__global__ 
void vecadd_cuda(int *d_c, int *d_a, int *d_b, int n)
{
	const int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i >= n) return;   // tail threads past the end of the vectors do nothing
	d_c[i] = d_a[i] + d_b[i];
}

// Abort with a readable message on any failed CUDA runtime call.
#define CUDA_CHECK(call)                                                     \
	do {                                                                     \
		cudaError_t err_ = (call);                                           \
		if (err_ != cudaSuccess) {                                           \
			fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,    \
			        cudaGetErrorString(err_));                               \
			exit(EXIT_FAILURE);                                              \
		}                                                                    \
	} while (0)

// Benchmark int-vector addition four ways (serial C, OpenMP, OpenMP+AVX2,
// CUDA) over `repeat` iterations each. Usage: ./vecadd <n>
// Times are wall-clock seconds per iteration (kernel execution only for the
// CUDA case; host<->device transfer time is intentionally excluded).
int main(int argc, char **argv) {
	if (argc < 2) {                       // avoid segfault on missing argument
		fprintf(stderr, "usage: %s <vector length>\n", argv[0]);
		return EXIT_FAILURE;
	}
	int n = atoi(argv[1]);
	if (n <= 0) {
		fprintf(stderr, "vector length must be positive\n");
		return EXIT_FAILURE;
	}
	int repeat = 10;
	//create vectors
	int *a = (int *)malloc(sizeof(int) * n);
	int *b = (int *)malloc(sizeof(int) * n);
	int *c = (int *)malloc(sizeof(int) * n);
	if (!a || !b || !c) {
		fprintf(stderr, "host allocation failed\n");
		return EXIT_FAILURE;
	}
	for(int i = 0; i < n; i++) {
		a[i] = 1;
		b[i] = 2;
	}

	struct timeval t1, t2;
	int *d_a, *d_b, *d_c;
	CUDA_CHECK(cudaMalloc((void **)&d_a, n * sizeof(int)));
	CUDA_CHECK(cudaMalloc((void **)&d_b, n * sizeof(int)));
	CUDA_CHECK(cudaMalloc((void **)&d_c, n * sizeof(int)));
	CUDA_CHECK(cudaMemcpy(d_a, a, n * sizeof(int), cudaMemcpyHostToDevice));
	// BUG FIX: the original copied b into d_a, leaving d_b uninitialized,
	// so the kernel summed garbage. Copy b into d_b.
	CUDA_CHECK(cudaMemcpy(d_b, b, n * sizeof(int), cudaMemcpyHostToDevice));
	//serial C
	gettimeofday(&t1, NULL);
	for (int i = 0; i < repeat; i++) {
		for (int j = 0; j < n; j++)	c[j] = a[j] + b[j];
	}
	gettimeofday(&t2, NULL);
	double time_serial = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
	time_serial /= repeat;
	time_serial /= 1000.0;
	printf("C serial takes %f sec\n", time_serial);
	//OpenMP parallel version
	gettimeofday(&t1, NULL);
	for (int i = 0; i < repeat; i++) {
		#pragma omp parallel for
		for (int j = 0; j < n; j++)	c[j] = a[j] + b[j];
	}
	gettimeofday(&t2, NULL);
	time_serial = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
	time_serial /= repeat;
	time_serial /= 1000.0;
	printf("C OpenMP takes %f sec\n", time_serial);
	//OpenMP+AVX2 parallel version (handles only the first n/8*8 elements,
	//matching the original; tail elements retain the OpenMP-loop results)
	gettimeofday(&t1, NULL);
	for (int i = 0; i < repeat; i++) {
		int loop = n/8;
		#pragma omp parallel for
		for (int j = 0; j < loop; j++)
		{
			__m256i aavx2 = _mm256_loadu_si256((__m256i*)(&a[j * 8]));
			__m256i bavx2 = _mm256_loadu_si256((__m256i*)(&b[j * 8]));
			__m256i cavx2 = _mm256_add_epi32(aavx2, bavx2);
			_mm256_storeu_si256((__m256i*)(&c[j * 8]), cavx2);
		}
	}
	gettimeofday(&t2, NULL);
	time_serial = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
	time_serial /= repeat;
	time_serial /= 1000.0;
	printf("C OpenMP+avx2 takes %f sec\n", time_serial);
	//CUDA parallel version
	gettimeofday(&t1, NULL);
	for (int i = 0; i < repeat; i++)
	{
		int nthreads = 128;
		// integer ceil-div instead of the double round-trip via ceil()
		int nblocks = (n + nthreads - 1) / nthreads;
		vecadd_cuda<<<nblocks, nthreads>>>(d_c, d_a, d_b, n);
		CUDA_CHECK(cudaGetLastError());   // catch bad launch configurations
	}
	// kernels are asynchronous: wait for completion before reading the clock
	CUDA_CHECK(cudaDeviceSynchronize());
	gettimeofday(&t2, NULL);
	time_serial = (t2.tv_sec - t1.tv_sec) * 1000.0 + (t2.tv_usec - t1.tv_usec) / 1000.0;
	time_serial /= repeat;
	time_serial /= 1000.0;
	printf("CUDA takes %f sec\n", time_serial);
	// copy the GPU result back and verify against the known answer (1 + 2);
	// the original never checked, which is how the d_b bug went unnoticed
	CUDA_CHECK(cudaMemcpy(c, d_c, n * sizeof(int), cudaMemcpyDeviceToHost));
	for (int j = 0; j < n; j++) {
		if (c[j] != 3) {
			fprintf(stderr, "CUDA result mismatch at %d: got %d, want 3\n", j, c[j]);
			return EXIT_FAILURE;
		}
	}
	CUDA_CHECK(cudaFree(d_a));
	CUDA_CHECK(cudaFree(d_b));
	CUDA_CHECK(cudaFree(d_c));
	free(a);
	free(b);
	free(c);
	return 0;
}

运行结果及其分析

kendrick@kendrick-LAPTOP:~/Desktop/vecadd-c$ ./vecadd 10
C serial takes 0.0000000000 sec
CUDA takes 0.0000048000 sec
kendrick@kendrick-LAPTOP:~/Desktop/vecadd-c$ ./vecadd 100
C serial takes 0.0000001000 sec
CUDA takes 0.0000047000 sec
kendrick@kendrick-LAPTOP:~/Desktop/vecadd-c$ ./vecadd 1000
C serial takes 0.0000003000 sec
CUDA takes 0.0000047000 sec
kendrick@kendrick-LAPTOP:~/Desktop/vecadd-c$ ./vecadd 10000
C serial takes 0.0000047000 sec
CUDA takes 0.0000046000 sec
kendrick@kendrick-LAPTOP:~/Desktop/vecadd-c$ ./vecadd 100000
C serial takes 0.0000516000 sec
CUDA takes 0.0000321000 sec
kendrick@kendrick-LAPTOP:~/Desktop/vecadd-c$ ./vecadd 1000000
C serial takes 0.0011572000 sec
CUDA takes 0.0002842000 sec
kendrick@kendrick-LAPTOP:~/Desktop/vecadd-c$ ./vecadd 10000000
C serial takes 0.0123769000 sec
CUDA takes 0.0027254000 sec
kendrick@kendrick-LAPTOP:~/Desktop/vecadd-c$ ./vecadd 100000000
C serial takes 0.1254904000 sec
CUDA takes 0.0283268000 sec

由此可见CUDA将数据放在GPU上计算将大大加速向量计算,只是将数据从CPU传入GPU这一过程将耗费很多的时间,上面的运行结果中并未体现出来。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值