1、矩阵加法:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#define Row 1024
#define Col 1024
long long g_cpu_calc_count;
// Element-wise matrix addition kernel: C = A + B.
// Expects a 2D launch covering Col x Row threads; A, B, C are device
// arrays of Row row pointers, each pointing into device storage.
__global__ void addKernel(int **C, int **A, int **B)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    // Guard the partial tile at the grid edge.
    if (row < Row && col < Col)
        C[row][col] = A[row][col] + B[row][col];
}
// CPU reference: element-wise C = A + B over a width x width matrix.
// A_ptr/B_ptr/C_ptr are arrays of `width` row pointers into the host
// buffers. Tallies one count per scalar addition into the file-level
// global g_cpu_calc_count so main can report the CPU workload.
// NOTE(review): assumes a square matrix; callers pass Col (== Row here).
void matrix_add_cpu(int** A_ptr, int** B_ptr, int** C_ptr, int width)
{
    g_cpu_calc_count = 0;
    // int indices: `width` is int, so this avoids the signed/unsigned
    // comparison the original size_t loop counters triggered.
    for (int i = 0; i < width; i++)
    {
        for (int j = 0; j < width; j++)
        {
            C_ptr[i][j] = A_ptr[i][j] + B_ptr[i][j];
            g_cpu_calc_count++;
        }
    }
}
int main()
{
    int *A, *B, *C;                       // contiguous Row*Col host buffers
    int **A_ptr, **B_ptr, **C_ptr;        // host-side row-pointer tables
    int *d_A, *d_B, *d_C;                 // contiguous device buffers
    int **d_A_ptr, **d_B_ptr, **d_C_ptr;  // device-side row-pointer tables
    const size_t total_size = Row * Col * sizeof(int);

    // ---- host allocations ----
    A = (int*)malloc(total_size);
    B = (int*)malloc(total_size);
    C = (int*)malloc(total_size);
    A_ptr = (int**)malloc(Row * sizeof(int*));
    B_ptr = (int**)malloc(Row * sizeof(int*));
    C_ptr = (int**)malloc(Row * sizeof(int*));

    // initialize inputs for the CPU pass
    for (int i = 0; i < Row * Col; i++)
    {
        A[i] = 80;
        B[i] = 20;
    }
    // row pointers into the HOST buffers (used only by the CPU path)
    for (int i = 0; i < Row; i++)
    {
        A_ptr[i] = A + Col * i;
        B_ptr[i] = B + Col * i;
        C_ptr[i] = C + Col * i;
    }

    // ---- CPU pass (timed) ----
    const clock_t cpu_begin_time_2 = clock();
    matrix_add_cpu(A_ptr, B_ptr, C_ptr, Col);
    float ms = float(clock() - cpu_begin_time_2);
    std::cout << "矩阵加法运算CPU单核运算总次数:" << g_cpu_calc_count << std::endl;
    printf("CPU cost_time: %.2f ms \n", ms);

    // ---- GPU pass ----
    // Fresh input values so a stale CPU result cannot mask a GPU failure.
    for (int i = 0; i < Row * Col; i++)
    {
        A[i] = 90;
        B[i] = 10;
    }

    const clock_t gpu_begin_time_2 = clock();
    // device allocations: three row-pointer tables + three flat matrices
    cudaMalloc((void**)&d_A_ptr, sizeof(int*) * Row);
    cudaMalloc((void**)&d_B_ptr, sizeof(int*) * Row);
    cudaMalloc((void**)&d_C_ptr, sizeof(int*) * Row);
    cudaMalloc((void**)&d_A, total_size);
    cudaMalloc((void**)&d_B, total_size);
    cudaMalloc((void**)&d_C, total_size);

    // BUGFIX: the row-pointer tables uploaded to the device must contain
    // DEVICE addresses (d_X + Col*i). The original uploaded the host
    // addresses, so the kernel dereferenced host memory (illegal address).
    for (int i = 0; i < Row; i++)
    {
        A_ptr[i] = d_A + Col * i;
        B_ptr[i] = d_B + Col * i;
        C_ptr[i] = d_C + Col * i;
    }
    cudaMemcpy(d_A_ptr, A_ptr, sizeof(int*) * Row, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B_ptr, B_ptr, sizeof(int*) * Row, cudaMemcpyHostToDevice);
    cudaMemcpy(d_C_ptr, C_ptr, sizeof(int*) * Row, cudaMemcpyHostToDevice);
    // BUGFIX: copy the FULL matrices; the original copied only
    // sizeof(int)*Row bytes (one row's worth) of each input.
    cudaMemcpy(d_A, A, total_size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, total_size, cudaMemcpyHostToDevice);

    dim3 threadPerBlock_2(16, 16);  // 16x16 = 256 threads per block
    dim3 blockNumber_2((Col + threadPerBlock_2.x - 1) / threadPerBlock_2.x, (Row + threadPerBlock_2.y - 1) / threadPerBlock_2.y);  // ceil-div grid
    printf("Block(%d, %d) Grid(%d, %d).\n", threadPerBlock_2.x, threadPerBlock_2.y, blockNumber_2.x, blockNumber_2.y);
    addKernel << <blockNumber_2, threadPerBlock_2 >> > (d_C_ptr, d_A_ptr, d_B_ptr);
    // surface launch-configuration errors immediately
    cudaError_t launch_err = cudaGetLastError();
    if (launch_err != cudaSuccess)
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(launch_err));

    // BUGFIX: read the result matrix out of d_C into C. The original read
    // sizeof(int)*Row*Col bytes from d_C_ptr — the Row-entry pointer table —
    // which overflows that allocation and yields garbage. The blocking
    // cudaMemcpy also synchronizes, so the timer below covers the kernel.
    cudaMemcpy(C, d_C, total_size, cudaMemcpyDeviceToHost);
    ms = float(clock() - gpu_begin_time_2);

    std::cout << "矩阵加法运算所有线程数:" << threadPerBlock_2.x * threadPerBlock_2.y * blockNumber_2.x * blockNumber_2.y << std::endl;
    std::cout << "矩阵加法运算GPU单线程运算次数:1" << std::endl;
    std::cout << "矩阵加法运算GPU拷贝到GPU数据字节数:" << sizeof(int*) * Row * 3 + sizeof(int) * Row * Col * 2 << std::endl;
    std::cout << "矩阵加法运算GPU拷贝到CPU数据字节数:" << sizeof(int) * Row * Col << std::endl;
    printf("GPU cost_time: %.2f ms \n", ms);

    // ---- cleanup ----
    free(A);
    free(B);
    free(C);
    free(A_ptr);
    free(B_ptr);
    free(C_ptr);
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_A_ptr);
    cudaFree(d_B_ptr);
    cudaFree(d_C_ptr);
    system("pause");
    return 0;
}
运行结果:
矩阵加法运算CPU单核运算总次数:1048576
CPU cost_time: 2.00 ms
Block(16, 16) Grid(64, 64).
矩阵加法运算所有线程数:1048576
矩阵加法运算GPU单线程运算次数:1
矩阵加法运算GPU拷贝到GPU数据字节数:8413184
矩阵加法运算GPU拷贝到CPU数据字节数:4194304
GPU cost_time: 439.00 ms
请按任意键继续. . .
2、矩阵乘法
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#define Row 1024
#define Col 1024
long long g_cpu_calc_count;
// GPU matrix multiply: P = M * N for square width x width matrices in
// row-major flat storage. One thread computes one output element; the
// launch's x dimension indexes the output column, y the row.
__global__ void matrix_mul_gpu(int *M, int* N, int* P, int width)
{
    int i = threadIdx.x + blockDim.x * blockIdx.x;  // output column
    int j = threadIdx.y + blockDim.y * blockIdx.y;  // output row
    // BUGFIX: guard the grid tail — when width is not a multiple of the
    // block dimensions, surplus threads would read and write out of bounds.
    if (i >= width || j >= width)
        return;
    int sum = 0;
    for (int k = 0; k < width; k++)
    {
        int a = M[j * width + k];
        int b = N[k * width + i];
        sum += a * b;
    }
    P[j * width + i] = sum;
}
// Single-threaded reference multiply: P = M * N, square width x width,
// row-major flat storage. Counts every multiply-accumulate into the
// file-level global g_cpu_calc_count so main can report the CPU workload.
void matrix_mul_cpu(int* M, int* N, int* P, int width)
{
    g_cpu_calc_count = 0;
    for (int row = 0; row < width; row++)
    {
        const int* m_row = M + row * width;  // hoist the row base address
        for (int col = 0; col < width; col++)
        {
            int acc = 0;
            for (int k = 0; k < width; k++)
            {
                acc += m_row[k] * N[k * width + col];
                g_cpu_calc_count++;
            }
            P[row * width + col] = acc;
        }
    }
}
int main()
{
    // host buffers: flat Row*Col row-major matrices
    int *A = (int *)malloc(sizeof(int) * Row * Col);
    int *B = (int *)malloc(sizeof(int) * Row * Col);
    int *C = (int *)malloc(sizeof(int) * Row * Col);
    // device buffers
    int *d_dataA, *d_dataB, *d_dataC;
    cudaMalloc((void**)&d_dataA, sizeof(int) * Row * Col);
    cudaMalloc((void**)&d_dataB, sizeof(int) * Row * Col);
    cudaMalloc((void**)&d_dataC, sizeof(int) * Row * Col);
    // Inputs: every A element 90, every B element 10, so every output
    // element of P = A*B must equal 90 * 10 * Col. (The original re-ran
    // this identical loop a second time before the GPU pass; once suffices.)
    for (int i = 0; i < Row * Col; i++) {
        A[i] = 90;
        B[i] = 10;
    }

    // ---- CPU pass (timed) ----
    const clock_t cpu_begin_time = clock();
    matrix_mul_cpu(A, B, C, Col);
    float ms = float(clock() - cpu_begin_time);
    std::cout << "矩阵乘法运算CPU单核总运算次数:" << g_cpu_calc_count << std::endl;
    printf("CPU cost_time: %.2f ms \n", ms);

    // ---- GPU pass (timed, including transfers) ----
    const clock_t gpu_begin_time = clock();
    cudaMemcpy(d_dataA, A, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
    cudaMemcpy(d_dataB, B, sizeof(int) * Row * Col, cudaMemcpyHostToDevice);
    dim3 threadPerBlock(16, 16);  // 256 threads per block
    dim3 blockNumber((Col + threadPerBlock.x - 1) / threadPerBlock.x, (Row + threadPerBlock.y - 1) / threadPerBlock.y);  // ceil-div grid
    printf("Block(%d,%d) Grid(%d,%d).\n", threadPerBlock.x, threadPerBlock.y, blockNumber.x, blockNumber.y);
    matrix_mul_gpu << <blockNumber, threadPerBlock >> > (d_dataA, d_dataB, d_dataC, Col);
    // surface launch-configuration errors immediately
    cudaError_t launch_err = cudaGetLastError();
    if (launch_err != cudaSuccess)
        fprintf(stderr, "matrix_mul_gpu launch failed: %s\n", cudaGetErrorString(launch_err));
    // Blocking copy: also synchronizes, so the timer below covers the kernel.
    cudaMemcpy(C, d_dataC, sizeof(int) * Row * Col, cudaMemcpyDeviceToHost);
    ms = float(clock() - gpu_begin_time);

    // spot-check the GPU result against the analytically known answer
    const int expected = 90 * 10 * Col;
    if (C[0] != expected || C[Row * Col - 1] != expected)
        fprintf(stderr, "GPU result verification FAILED\n");

    std::cout << "矩阵乘法运算所有线程数:" << threadPerBlock.x*threadPerBlock.y * blockNumber.x * blockNumber.y << std::endl;
    std::cout << "矩阵乘法运算GPU单线程运算次数:" << Col << std::endl;
    std::cout << "矩阵乘法运算CPU拷贝到GPU数据字节数:" << sizeof(int) * Row * Col * 2 << std::endl;
    std::cout << "矩阵乘法运算GPU拷贝到CPU数据字节数:" << sizeof(int) * Row * Col << std::endl;
    printf("GPU cost_time: %.2f ms \n", ms);

    // ---- cleanup ----
    free(A);
    free(B);
    free(C);
    cudaFree(d_dataA);
    cudaFree(d_dataB);
    cudaFree(d_dataC);
    system("pause");
    return 0;
}
运行结果:
矩阵乘法运算CPU单核总运算次数:1073741824
CPU cost_time: 1743.00 ms
Block(16,16) Grid(64,64).
矩阵乘法运算所有线程数:1048576
矩阵乘法运算GPU单线程运算次数:1024
矩阵乘法运算CPU拷贝到GPU数据字节数:8388608
矩阵乘法运算GPU拷贝到CPU数据字节数:4194304
GPU cost_time: 10.00 ms
请按任意键继续. . .
结论:CUDA编程调用GPU运算,会增加CPU与GPU传输数据的开销,也就是说使用CUDA编程GPU加速,本身就会出现一部分额外开销;若CPU与GPU交互的数据量一定,则在GPU上执行的计算量越大,则使用GPU加速的效果越明显。因此不可盲目地使用CUDA的GPU加速。