cuda 学习笔记(二)cuda与cpu时间对比

 

1、矩阵加法:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define Row 1024
#define Col 1024

long long g_cpu_calc_count;
//定义的kernel函数
// Element-wise matrix addition kernel: C = A + B.
// Expects a 2D launch covering at least Col x Row threads. A, B and C are
// device-resident row-pointer tables: each entry must hold the DEVICE
// address of a row of Col ints.
__global__ void addKernel(int **C, int **A, int **B)
{
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	// Guard the grid tail: the launch may overshoot the matrix extent.
	if (col >= Col || row >= Row)
		return;
	C[row][col] = A[row][col] + B[row][col];
}

// Single-threaded reference implementation of C = A + B for a square
// width x width matrix accessed through row-pointer tables. Also counts
// one unit of work per element addition in the global g_cpu_calc_count
// so the caller can report the total number of operations.
void matrix_add_cpu(int** A_ptr, int** B_ptr, int** C_ptr, int width)
{
	g_cpu_calc_count = 0;
	for (int row = 0; row < width; ++row)
	{
		// Hoist the row pointers out of the inner loop.
		int* a = A_ptr[row];
		int* b = B_ptr[row];
		int* c = C_ptr[row];
		for (int col = 0; col < width; ++col)
		{
			c[col] = a[col] + b[col];
			++g_cpu_calc_count;
		}
	}
}

// Host driver: compares CPU vs GPU time for a Row x Col matrix addition.
//
// Bug fixes vs. the original version:
//  - The device row-pointer tables (d_A_ptr/d_B_ptr/d_C_ptr) were uploaded
//    while still holding HOST addresses, so the kernel dereferenced host
//    memory on the device (illegal address / garbage). They are now filled
//    with addresses inside d_A/d_B/d_C before the upload.
//  - The H2D copies of the matrix data moved only sizeof(int)*Row bytes
//    instead of the full sizeof(int)*Row*Col.
//  - The result was copied D2H from the pointer table d_C_ptr with a
//    Row*Col*sizeof(int) size (far out of bounds); it is now copied from
//    the data buffer d_C into C.
//  - clock() returns ticks, not milliseconds; differences are now converted
//    via CLOCKS_PER_SEC so the printed times are correct on every platform,
//    not only where CLOCKS_PER_SEC == 1000 (MSVC).
//  - A duplicated input-initialization loop was removed.
int main()
{
	int *A, **A_ptr, *B, **B_ptr, *C, **C_ptr, **d_A_ptr, **d_B_ptr, **d_C_ptr, *d_A, *d_B, *d_C;
	const int total_size = Row * Col * sizeof(int);
	// Host allocations: flat data buffers plus row-pointer tables.
	A = (int*)malloc(total_size);
	B = (int*)malloc(total_size);
	C = (int*)malloc(total_size);
	A_ptr = (int**)malloc(Row * sizeof(int*));
	B_ptr = (int**)malloc(Row * sizeof(int*));
	C_ptr = (int**)malloc(Row * sizeof(int*));
	// Initialize host data and host row pointers for the CPU run.
	for (size_t i = 0; i < Row * Col; i++)
	{
		A[i] = 80;
		B[i] = 20;
	}
	for (size_t i = 0; i < Row; i++)
	{
		A_ptr[i] = A + Col * i;
		B_ptr[i] = B + Col * i;
		C_ptr[i] = C + Col * i;
	}
	const clock_t cpu_begin_time_2 = clock(); // start CPU timer
	matrix_add_cpu(A_ptr, B_ptr, C_ptr, Col); // CPU computation
	float ms = 1000.0f * float(clock() - cpu_begin_time_2) / CLOCKS_PER_SEC;
	std::cout << "矩阵加法运算CPU单核运算总次数:" << g_cpu_calc_count << std::endl;
	printf("CPU cost_time: %.2f ms \n", ms);

	// GPU computation: fresh input values so the result is independent of
	// the CPU run above.
	for (int i = 0; i < Row * Col; i++)
	{
		A[i] = 90;
		B[i] = 10;
	}

	const clock_t gpu_begin_time_2 = clock(); // GPU timer (includes alloc + copies)
	// Device allocations: pointer tables and flat data buffers.
	cudaMalloc((void**)&d_A_ptr, sizeof(int*) * Row);
	cudaMalloc((void**)&d_B_ptr, sizeof(int*) * Row);
	cudaMalloc((void**)&d_C_ptr, sizeof(int*) * Row);
	cudaMalloc((void**)&d_A, total_size);
	cudaMalloc((void**)&d_B, total_size);
	cudaMalloc((void**)&d_C, total_size);
	// Copy the full input matrices host -> device.
	cudaMemcpy(d_A, A, total_size, cudaMemcpyHostToDevice);
	cudaMemcpy(d_B, B, total_size, cudaMemcpyHostToDevice);
	// Rebuild the row-pointer tables so they point INTO DEVICE memory, then
	// upload them; this is what makes the kernel's int** arguments valid on
	// the device.
	for (size_t i = 0; i < Row; i++)
	{
		A_ptr[i] = d_A + Col * i;
		B_ptr[i] = d_B + Col * i;
		C_ptr[i] = d_C + Col * i;
	}
	cudaMemcpy(d_A_ptr, A_ptr, sizeof(int*) * Row, cudaMemcpyHostToDevice);
	cudaMemcpy(d_B_ptr, B_ptr, sizeof(int*) * Row, cudaMemcpyHostToDevice);
	cudaMemcpy(d_C_ptr, C_ptr, sizeof(int*) * Row, cudaMemcpyHostToDevice);
	dim3 threadPerBlock_2(16, 16); // threads per block
	dim3 blockNumber_2((Col + threadPerBlock_2.x - 1) / threadPerBlock_2.x, (Row + threadPerBlock_2.y - 1) / threadPerBlock_2.y); // ceil-div grid
	printf("Block(%d, %d) Grid(%d, %d).\n", threadPerBlock_2.x, threadPerBlock_2.y, blockNumber_2.x, blockNumber_2.y);
	addKernel << <blockNumber_2, threadPerBlock_2 >> > (d_C_ptr, d_A_ptr, d_B_ptr);
	// Copy the result matrix device -> host. This blocking copy also
	// synchronizes with the kernel launched above, so the timer below covers
	// the full GPU pipeline.
	cudaMemcpy(C, d_C, total_size, cudaMemcpyDeviceToHost);
	ms = 1000.0f * float(clock() - gpu_begin_time_2) / CLOCKS_PER_SEC;
	std::cout << "矩阵加法运算所有线程数:" << threadPerBlock_2.x * threadPerBlock_2.y * blockNumber_2.x * blockNumber_2.y << std::endl;
	std::cout << "矩阵加法运算GPU单线程运算次数:1" << std::endl;
	std::cout << "矩阵加法运算GPU拷贝到GPU数据字节数:" << sizeof(int*) * Row * 3 + sizeof(int) * Row * Col * 2 << std::endl;
	std::cout << "矩阵加法运算GPU拷贝到CPU数据字节数:" << sizeof(int) * Row * Col << std::endl;
	printf("GPU cost_time: %.2f ms \n", ms);
	// Release host and device memory.
	free(A);
	free(B);
	free(C);
	free(A_ptr);
	free(B_ptr);
	free(C_ptr);
	cudaFree(d_A);
	cudaFree(d_B);
	cudaFree(d_C);
	cudaFree(d_A_ptr);
	cudaFree(d_B_ptr);
	cudaFree(d_C_ptr);
	system("pause");
	return 0;
}

运行结果:

矩阵加法运算CPU单核运算总次数:1048576
CPU cost_time: 2.00 ms
Block(16, 16) Grid(64, 64).
矩阵加法运算所有线程数:1048576
矩阵加法运算GPU单线程运算次数:1
矩阵加法运算GPU拷贝到GPU数据字节数:8413184
矩阵加法运算GPU拷贝到CPU数据字节数:4194304
GPU cost_time: 439.00 ms
请按任意键继续. . .

2、矩阵乘法

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <math.h>

#define Row  1024
#define Col 1024

long long g_cpu_calc_count;

// Naive matrix multiply kernel: P = M * N for square width x width
// matrices stored row-major in flat device buffers.
// Expects a 2D launch; each thread computes one output element.
__global__ void matrix_mul_gpu(int *M, int* N, int* P, int width)
{
	const int i = threadIdx.x + blockDim.x * blockIdx.x; // output column
	const int j = threadIdx.y + blockDim.y * blockIdx.y; // output row

	// Guard the grid tail: if width is not a multiple of the block
	// dimensions, the launch overshoots the matrix and unguarded threads
	// would read and write out of bounds. (The original kernel had no
	// guard; it only worked because 1024 divides evenly by 16.)
	if (i >= width || j >= width)
		return;

	int sum = 0;
	for (int k = 0; k < width; k++)
	{
		sum += M[j * width + k] * N[k * width + i];
	}
	P[j * width + i] = sum;
}

// Single-threaded reference matrix multiply: P = M * N for square
// width x width row-major matrices stored in flat buffers. Tallies one
// count per multiply-accumulate into the global g_cpu_calc_count so the
// caller can report the total number of operations.
void matrix_mul_cpu(int* M, int* N, int* P, int width)
{
	g_cpu_calc_count = 0;
	for (int row = 0; row < width; ++row)
	{
		// Hoist the base of the current M row out of the inner loops.
		const int* m_row = M + row * width;
		for (int col = 0; col < width; ++col)
		{
			int acc = 0;
			for (int k = 0; k < width; ++k)
			{
				acc += m_row[k] * N[k * width + col];
				++g_cpu_calc_count;
			}
			P[row * width + col] = acc;
		}
	}
}

// Host driver: compares CPU vs GPU time for a Row x Col matrix multiply.
//
// Fix vs. the original: clock() returns ticks, not milliseconds; the raw
// tick difference only happens to equal ms where CLOCKS_PER_SEC == 1000
// (MSVC). Elapsed times are now converted via CLOCKS_PER_SEC so the
// printed values are correct on every platform.
int main()
{
	// Host buffers (flat row-major matrices).
	int *A = (int *)malloc(sizeof(int) * Row * Col);
	int *B = (int *)malloc(sizeof(int) * Row * Col);
	int *C = (int *)malloc(sizeof(int) * Row * Col);
	// Device buffers.
	int *d_dataA, *d_dataB, *d_dataC;
	cudaMalloc((void**)&d_dataA, sizeof(int) * Row * Col);
	cudaMalloc((void**)&d_dataB, sizeof(int) * Row * Col);
	cudaMalloc((void**)&d_dataC, sizeof(int) * Row * Col);
	// Input values for the CPU run.
	for (int i = 0; i < Row * Col; i++) {
		A[i] = 90;
		B[i] = 10;
	}

	// CPU computation.
	const clock_t cpu_begin_time = clock();
	matrix_mul_cpu(A, B, C, Col);
	float ms = 1000.0f * float(clock() - cpu_begin_time) / CLOCKS_PER_SEC;
	std::cout << "矩阵乘法运算CPU单核总运算次数:" << g_cpu_calc_count << std::endl;
	printf("CPU cost_time: %.2f ms \n", ms);

	// GPU computation. The timed section deliberately includes the H2D and
	// D2H copies so the comparison reflects the real cost of offloading.
	for (int i = 0; i < Row * Col; i++) {
		A[i] = 90;
		B[i] = 10;
	}
	const clock_t gpu_begin_time = clock();
	// Copy inputs host -> device.
	cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
	cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
	dim3 threadPerBlock(16, 16);
	dim3 blockNumber((Col + threadPerBlock.x - 1) / threadPerBlock.x, (Row + threadPerBlock.y - 1) / threadPerBlock.y); // ceil-div grid
	printf("Block(%d,%d)   Grid(%d,%d).\n", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);
	// Launch the kernel (asynchronous with respect to the host).
	matrix_mul_gpu << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);

	// Copy the result device -> host; this blocking copy also waits for the
	// kernel to finish, so the timer below covers the full GPU pipeline.
	cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);
	ms = 1000.0f * float(clock() - gpu_begin_time) / CLOCKS_PER_SEC;
	std::cout << "矩阵乘法运算所有线程数:" << threadPerBlock.x*threadPerBlock.y * blockNumber.x * blockNumber.y << std::endl;
	std::cout << "矩阵乘法运算GPU单线程运算次数:" << Col << std::endl;
	std::cout << "矩阵乘法运算CPU拷贝到GPU数据字节数:" << sizeof(int) * Row * Col * 2 << std::endl;
	std::cout << "矩阵乘法运算GPU拷贝到CPU数据字节数:" << sizeof(int) * Row * Col << std::endl;
	printf("GPU cost_time: %.2f ms \n", ms);

	// Release host and device memory.
	free(A);
	free(B);
	free(C);
	cudaFree(d_dataA);
	cudaFree(d_dataB);
	cudaFree(d_dataC);
	system("pause");
	return 0;
}

运行结果:

矩阵乘法运算CPU单核总运算次数:1073741824
CPU cost_time: 1743.00 ms
Block(16,16)   Grid(64,64).
矩阵乘法运算所有线程数:1048576
矩阵乘法运算GPU单线程运算次数:1024
矩阵乘法运算CPU拷贝到GPU数据字节数:8388608
矩阵乘法运算GPU拷贝到CPU数据字节数:4194304
GPU cost_time: 10.00 ms
请按任意键继续. . .

结论:CUDA编程调用GPU运算,会增加CPU与GPU传输数据的开销,也就是说使用CUDA编程GPU加速,本身就会出现一部分额外开销;若CPU与GPU交互的数据量一定,则在GPU上执行的计算量越大,则使用GPU加速的效果越明显。因此不可盲目地使用CUDA的GPU加速。

 

参考:【CUDA编程系列】CUDA编程基本入门学习笔记

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

落花逐流水

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值