利用GPU并行计算的总体思路是:在CPU(Host)中创建数据,将数据传到GPU(Device)中进行计算,再将计算结果传回到CPU中。
最简单的例子:将CPU中的两个数字在GPU中进行相加,并在CPU中输出:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
// Device-side helper: returns the sum of its two integer arguments.
// It is declared __device__, so it is callable only from code running on the GPU.
__device__ int add_gpu(int a, int b) {
    const int sum = a + b;
    return sum;
}
// Kernel: computes a + b on the device (via add_gpu) and stores the sum in the
// int that c points to. The kernel dereferences c on the GPU, so c must refer
// to device memory. Every launched thread writes the same single location.
__global__ void add(int a, int b, int *c) {
    const int sum = add_gpu(a, b);
    c[0] = sum;
}
#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and makes the enclosing function return 1. Intended for use inside main().
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            return 1;                                                      \
        }                                                                  \
    } while (0)
#endif

// Adds two host integers on the GPU and prints the result on the host.
// Returns 0 on success, 1 if any CUDA call or the kernel launch fails.
int main()
{
    // Host-side operands and the variable that receives the result.
    int a = 3, b = 5;
    int c = 0;
    // Device pointer for the single int result (declared on its own line:
    // "int c,int *ptr;" is not valid C++).
    int *ptr = NULL;

    // Allocate room for one int in device memory.
    CUDA_CHECK(cudaMalloc((void **)&ptr, sizeof(int)));

    // Run the addition on the GPU with a single block of a single thread.
    add<<<1, 1>>>(a, b, ptr);
    // A kernel launch returns nothing; launch errors are read back explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy the result from device memory (ptr) into the host variable c.
    // This copy blocks until the kernel has finished.
    CUDA_CHECK(cudaMemcpy(&c, ptr, sizeof(int), cudaMemcpyDeviceToHost));
    printf("%d + %d = %d\n", a, b, c);

    // Release the device allocation.
    CUDA_CHECK(cudaFree(ptr));
    return 0;
}
//3 + 5 = 8
上面的例子只有一个数相加,因此不需要多线程,下面我们加大数据量,引入多线程并行计算。
用10个线程对两个长度为10的数组相加:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
// Number of elements in each array.
const int N = 10;
// Size in bytes of one N-element int array.
const int Mem = static_cast<int>(sizeof(int) * N);
// Kernel: element-wise sum c[i] = a[i] + b[i].
// Each thread handles the element whose index equals its threadIdx.x, so the
// kernel indexes by thread only (block index is not consulted). Threads whose
// index is N or larger do nothing.
__global__ void add(int *a, int *b, int *c) {
    const int i = threadIdx.x;
    if (i >= N) {
        return;
    }
    c[i] = a[i] + b[i];
}
#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and makes the enclosing function return 1. Intended for use inside main().
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            return 1;                                                      \
        }                                                                  \
    } while (0)
#endif

// Adds two N-element host arrays on the GPU using one block of N threads and
// prints every sum. Returns 0 on success, 1 if any CUDA call or the launch fails.
int main()
{
    // Host inputs and the host buffer that receives the sums.
    int a[N] = { 1,2,3,4,5,6,7,8,9,10 };
    int b[N] = { 1,3,5,7,9,11,13,15,17,19 };
    int c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    // Allocate device memory for the two inputs and the output.
    CUDA_CHECK(cudaMalloc((void **)&dev_a, Mem));
    CUDA_CHECK(cudaMalloc((void **)&dev_b, Mem));
    CUDA_CHECK(cudaMalloc((void **)&dev_c, Mem));

    // Copy the inputs to the device.
    CUDA_CHECK(cudaMemcpy(dev_a, a, Mem, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, Mem, cudaMemcpyHostToDevice));

    // One block with one thread per element. The thread count is tied to N so
    // it cannot drift from the array length.
    add<<<1, N>>>(dev_a, dev_b, dev_c);
    // A kernel launch returns nothing; launch errors are read back explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy the sums back to the host; this copy blocks until the kernel is done.
    CUDA_CHECK(cudaMemcpy(c, dev_c, Mem, cudaMemcpyDeviceToHost));

    // Print the results.
    for (int i = 0; i < N; i++)
        printf("%d + %d = %d\n", a[i], b[i], c[i]);

    // Release the device allocations.
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));
    return 0;
}
/*
1 + 1 = 2
2 + 3 = 5
3 + 5 = 8
4 + 7 = 11
5 + 9 = 14
6 + 11 = 17
7 + 13 = 20
8 + 15 = 23
9 + 17 = 26
10 + 19 = 29
*/
当然也可以用10个线程块,每个线程块分配一个线程的方式来实现,输出结果是一样的,代码如下:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
// Number of elements in each array. Ten, matching the ten initialisers given
// for a and b in main and the "10 blocks of one thread" this example describes
// (the previous value of 1024 left most of the printed results uncomputed).
const int N = 10;
// Size in bytes of one N-element int array.
const int Mem = N * sizeof(int);
// Kernel: element-wise sum c[i] = a[i] + b[i].
// Each block handles the element whose index equals its blockIdx.x; the thread
// index is not consulted. Blocks whose index is N or larger do nothing.
__global__ void add(int *a, int *b, int *c) {
    const int i = blockIdx.x;
    if (i >= N) {
        return;
    }
    c[i] = a[i] + b[i];
}
#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and makes the enclosing function return 1. Intended for use inside main().
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            return 1;                                                      \
        }                                                                  \
    } while (0)
#endif

// Adds two N-element host arrays on the GPU using N blocks of one thread each
// and prints every sum. Returns 0 on success, 1 if any CUDA call or the launch
// fails.
int main()
{
    // Host inputs (elements past the listed initialisers are zero) and the
    // host buffer that receives the sums.
    int a[N] = { 1,2,3,4,5,6,7,8,9,10 };
    int b[N] = { 1,3,5,7,9,11,13,15,17,19 };
    int c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    // Allocate device memory for the two inputs and the output.
    CUDA_CHECK(cudaMalloc((void **)&dev_a, Mem));
    CUDA_CHECK(cudaMalloc((void **)&dev_b, Mem));
    CUDA_CHECK(cudaMalloc((void **)&dev_c, Mem));

    // Copy the inputs to the device.
    CUDA_CHECK(cudaMemcpy(dev_a, a, Mem, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, Mem, cudaMemcpyHostToDevice));

    // One single-thread block per element. The kernel writes only
    // c[blockIdx.x], so the grid must contain N blocks; a smaller fixed grid
    // would leave the remaining elements of dev_c uninitialised while they are
    // still copied back and printed below.
    add<<<N, 1>>>(dev_a, dev_b, dev_c);
    // A kernel launch returns nothing; launch errors are read back explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy the sums back to the host; this copy blocks until the kernel is done.
    CUDA_CHECK(cudaMemcpy(c, dev_c, Mem, cudaMemcpyDeviceToHost));

    // Print the results.
    for (int i = 0; i < N; i++)
        printf("%d + %d = %d\n", a[i], b[i], c[i]);

    // Release the device allocations.
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));
    return 0;
}
实际情况中,数据量很大,我们不可能用一个线程只处理一个数据,而是用一个线程处理多个数据。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <iostream>
// Number of elements in each array.
#define N 1024
// Size in bytes of one N-element int array.
const int Mem = static_cast<int>(sizeof(int) * N);
// Kernel: element-wise sum c[i] = a[i] + b[i] over N elements, with each block
// processing several elements. A block starts at the index equal to its
// blockIdx.x and advances by gridDim.x (the number of blocks) until it passes
// N, so the blocks together cover every index regardless of how many blocks
// were launched. The thread index is not consulted.
__global__ void add(int *a, int *b, int *c) {
    for (int i = blockIdx.x; i < N; i += gridDim.x) {
        c[i] = a[i] + b[i];
    }
}
#ifndef CUDA_CHECK
// Evaluates a CUDA runtime call; on failure prints the error string to stderr
// and makes the enclosing function return 1. Intended for use inside main().
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(err_));                   \
            return 1;                                                      \
        }                                                                  \
    } while (0)
#endif

// Adds two N-element host arrays on the GPU using 10 single-thread blocks,
// each of which processes several elements, and prints every sum.
// Returns 0 on success, 1 if any CUDA call or the launch fails.
int main()
{
    // Host inputs, the host result buffer, and the device pointers.
    int a[N], b[N], c[N];
    int *dev_a = NULL, *dev_b = NULL, *dev_c = NULL;

    // Fill the inputs: a = 1, 2, 3, ... and b = 0, 2, 4, ...
    for (int i = 0; i < N; i++) {
        a[i] = i + 1;
        b[i] = 2 * i;
    }

    // Allocate device memory for the two inputs and the output.
    CUDA_CHECK(cudaMalloc((void **)&dev_a, Mem));
    CUDA_CHECK(cudaMalloc((void **)&dev_b, Mem));
    CUDA_CHECK(cudaMalloc((void **)&dev_c, Mem));

    // Copy the inputs to the device.
    CUDA_CHECK(cudaMemcpy(dev_a, a, Mem, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b, Mem, cudaMemcpyHostToDevice));

    // Ten single-thread blocks; the kernel's loop lets them cover all N elements.
    add<<<10, 1>>>(dev_a, dev_b, dev_c);
    // A kernel launch returns nothing; launch errors are read back explicitly.
    CUDA_CHECK(cudaGetLastError());

    // Copy the sums back to the host; this copy blocks until the kernel is done.
    CUDA_CHECK(cudaMemcpy(c, dev_c, Mem, cudaMemcpyDeviceToHost));

    // Print the results.
    for (int i = 0; i < N; i++)
        printf("%d + %d = %d\n", a[i], b[i], c[i]);

    // Release the device allocations.
    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_c));
    return 0;
}