方法一:矢量每一维度的相加都开一个单独线程
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define N 256 // vector length; adjust as needed
#define BLOCKDIM 128 // threads per block; may be changed to any value within the hardware limit
// Element-wise vector addition: stores a[k] + b[k] into c[k].
// Each thread handles at most one element, selected by the thread's flat
// global index along x. Threads whose index is N or larger do nothing, so
// the launch may contain more threads than there are elements.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    // Flat global index of this thread = the vector element it owns.
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // One element per thread, so a single bounds guard suffices (no loop).
    if (idx >= N) {
        return;
    }

    c[idx] = a[idx] + b[idx];
}
// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// `what` names the failed operation in the message.
// Returns true if `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Adds two N-element integer vectors on the GPU, launching one thread per
// element, and prints every element-wise sum.
// Returns 0 on success and 1 if any CUDA call fails; host and device buffers
// are released on both paths.
int main()
{
    const size_t bytes = N * sizeof(int);

    int* a = new int[N];
    int* b = new int[N];
    int* c = new int[N];
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;

    // a = 1, 2, ..., N; b alternates 1, 0, 1, 0, ... (lowest bit of i).
    for (int i = 1; i <= N; ++i) {
        a[i-1] = i;
        b[i-1] = i & 1;
    }

    // && short-circuits, so the first failing call stops the setup sequence.
    bool ok = cudaOk(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)")
        && cudaOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)")
        && cudaOk(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)")
        && cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)")
        && cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

    if (ok) {
        // One thread per element: ceil(N / BLOCKDIM) blocks of BLOCKDIM threads.
        addKernel<<<(N+BLOCKDIM-1)/BLOCKDIM, BLOCKDIM>>>(dev_c, dev_a, dev_b);

        // A launch returns no status; launch failures are reported through
        // cudaGetLastError(). The blocking device-to-host copy that follows
        // also reports errors raised while the kernel was executing.
        ok = cudaOk(cudaGetLastError(), "addKernel launch")
            && cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }

    // Print only when c really holds the kernel's output.
    if (ok) {
        for (int i = 0; i < N; ++i) {
            printf("第%d维:%d+%d=%d\n", i+1, a[i], b[i], c[i]);
        }
    }

    delete[] a;
    delete[] b;
    delete[] c;
    // cudaFree performs no operation on a null pointer, so this is safe even
    // when an allocation above failed.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return ok ? 0 : 1;
}
方法二:使用固定数目的线程,每个线程被重复利用,依次计算矢量若干个维度的相加
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define N 256 // vector length; set as needed
#define BLOCK 128// number of thread blocks in the grid; choose freely within hardware limits
#define BLOCKDIM 128 // threads per block; choose freely within hardware limits
// Element-wise vector addition with a fixed-size launch: stores a[k] + b[k]
// into c[k] for every k in [0, N).
// Each thread starts at its flat global index along x and then repeatedly
// jumps ahead by the total number of threads in the launch (a grid-stride
// loop), so a fixed number of threads covers all N elements regardless of
// how N compares to the launch size.
__global__ void addKernel(int *c, const int *a, const int *b)
{
    // A grid consists of blocks and a block consists of threads:
    //   blockDim.x = threads per block along x,
    //   gridDim.x  = blocks per grid along x.
    // Their product is the total thread count of the launch, i.e. the distance
    // between two consecutive elements handled by the same thread.
    const unsigned int stride = blockDim.x * gridDim.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride) {
        c[idx] = a[idx] + b[idx];
    }
}
// Prints a diagnostic to stderr when a CUDA runtime call did not succeed.
// `what` names the failed operation in the message.
// Returns true if `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        return false;
    }
    return true;
}

// Adds two N-element integer vectors on the GPU using a fixed launch of
// BLOCK blocks x BLOCKDIM threads, and prints every element-wise sum.
// Returns 0 on success and 1 if any CUDA call fails; host and device buffers
// are released on both paths.
int main()
{
    const size_t bytes = N * sizeof(int);

    int* a = new int[N];
    int* b = new int[N];
    int* c = new int[N];
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;

    // a = 1, 2, ..., N; b alternates 1, 0, 1, 0, ... (lowest bit of i).
    for (int i = 1; i <= N; ++i) {
        a[i-1] = i;
        b[i-1] = i & 1;
    }

    // && short-circuits, so the first failing call stops the setup sequence.
    bool ok = cudaOk(cudaMalloc((void**)&dev_c, bytes), "cudaMalloc(dev_c)")
        && cudaOk(cudaMalloc((void**)&dev_a, bytes), "cudaMalloc(dev_a)")
        && cudaOk(cudaMalloc((void**)&dev_b, bytes), "cudaMalloc(dev_b)")
        && cudaOk(cudaMemcpy(dev_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_a)")
        && cudaOk(cudaMemcpy(dev_b, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(dev_b)");

    if (ok) {
        // Fixed launch size, independent of N; the kernel is handed all three
        // N-element device buffers.
        addKernel<<<BLOCK, BLOCKDIM>>>(dev_c, dev_a, dev_b);

        // A launch returns no status; launch failures are reported through
        // cudaGetLastError(). The blocking device-to-host copy that follows
        // also reports errors raised while the kernel was executing.
        ok = cudaOk(cudaGetLastError(), "addKernel launch")
            && cudaOk(cudaMemcpy(c, dev_c, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c)");
    }

    // Print only when c really holds the kernel's output.
    if (ok) {
        for (int i = 0; i < N; ++i) {
            printf("第%d维:%d+%d=%d\n", i+1, a[i], b[i], c[i]);
        }
    }

    delete[] a;
    delete[] b;
    delete[] c;
    // cudaFree performs no operation on a null pointer, so this is safe even
    // when an allocation above failed.
    cudaFree(dev_c);
    cudaFree(dev_a);
    cudaFree(dev_b);
    return ok ? 0 : 1;
}