1. Summing Vectors of Arbitrary Length on the GPU
Grid-stride loop: used when the data set contains more elements than there are threads in the grid.
1.1 1D grid and thread blocks-----input is a 1D array
Assuming both the grid and the thread blocks are one-dimensional, in a grid-stride loop each thread computes its unique index within the grid as threadIdx.x + blockIdx.x * blockDim.x, performs the operation on the array element at that index, then adds the total number of threads in the grid to its index and repeats, until the index runs past the end of the array.
1.1.1 Kernel code
__global__ void add(int *a, int *b, int *c)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;  // this thread's unique index in the grid
    int stride = blockDim.x * gridDim.x;                // total number of threads in the grid
    for (int i = index; i < N; i += stride)
    {
        c[i] = a[i] + b[i];
    }
}
1.1.2 Host code: memory allocation and initialization
#define N (2048*2048)
#define blocksize 256
#define gridsize 512
int main() {
    int *a, *b, *c;
    int *d_a, *d_b, *d_c;
    int size = N * sizeof(int);
    // Allocate device (GPU) memory
    cudaMalloc((void**)&d_a, size);
    cudaMalloc((void**)&d_b, size);
    cudaMalloc((void**)&d_c, size);
    // Allocate host memory for the input data
    a = (int*)malloc(size);
    b = (int*)malloc(size);
    c = (int*)malloc(size);
    // Initialize
    for (int i = 0; i < N; i++) {
        a[i] = 1; b[i] = 2;
    }
    // Copy the input data from host memory to device memory
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, size, cudaMemcpyHostToDevice);
    // Launch the kernel
    add<<<gridsize, blocksize>>>(d_a, d_b, d_c);
    // Copy the result back from device memory to host memory
    cudaMemcpy(c, d_c, size, cudaMemcpyDeviceToHost);
    // Free memory
    free(a);
    free(b);
    free(c);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
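If desired, a quick check can be inserted right after the device-to-host copy (before the buffers are freed) to confirm that the grid-stride loop really covered every element. This is an added sketch, not part of the original listing; cudaGetLastError only reports the most recent error.

// Optional verification: with a[i] = 1 and b[i] = 2, every c[i] should be 3
bool ok = true;
for (int i = 0; i < N; i++) {
    if (c[i] != 3) { ok = false; break; }
}
printf(ok ? "result correct\n" : "result WRONG\n");
printf("last CUDA error: %s\n", cudaGetErrorString(cudaGetLastError()));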
1.2 2D grid and thread blocks-----input is a 1D array
1.2.1 Kernel
const int N = 1024;
const int blocksize = 16;
__global__ void add_matrix(float* a, float* b, float* c, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int index = i + j * blockDim.x * gridDim.x;                     // linear thread index within the 2D grid
    int stride = blockDim.x * blockDim.y * gridDim.x * gridDim.y;   // total number of threads in the grid
    for (int idx = index; idx < N * N; idx += stride)
        c[idx] = a[idx] + b[idx];
}
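A minimal host-side launch sketch for this 2D configuration; the 32 x 32 grid is an assumed choice that is deliberately smaller than the N*N elements, so the grid-stride loop makes each thread process several elements (d_a, d_b, d_c are assumed device buffers):

dim3 dimBlock(blocksize, blocksize);                 // 16 x 16 threads per block
dim3 dimGrid(32, 32);                                // 262,144 threads for 1,048,576 elements
add_matrix<<<dimGrid, dimBlock>>>(d_a, d_b, d_c, N); // each thread handles 4 elements via the stride loop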
2. Parallel Matrix Addition
2.1 Input as a 1D vector (flattened matrix)
2.1.1 Kernel
const int N = 1024;
const int blocksize = 16;
__global__ void add_matrix(float* a, float* b, float* c, int N)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    int index = i + j * N;
    if (i < N && j < N)
        c[index] = a[index] + b[index];
}
2.1.2 Host code
int main()
{
    // Allocate host memory
    float *a = new float[N*N];
    float *b = new float[N*N];
    float *c = new float[N*N];
    // Initialize
    for (int i = 0; i < N * N; i++)
    {
        a[i] = 1.0f;
        b[i] = 3.5f;
    }
    float *ad, *bd, *cd;
    const int size = N*N*sizeof(float);
    // Allocate device memory
    cudaMalloc((void**)&ad, size);
    cudaMalloc((void**)&bd, size);
    cudaMalloc((void**)&cd, size);
    cudaMemcpy(ad, a, size, cudaMemcpyHostToDevice);
    cudaMemcpy(bd, b, size, cudaMemcpyHostToDevice);
    // Launch the kernel
    dim3 dimBlock(blocksize, blocksize);
    dim3 dimGrid(N/dimBlock.x, N/dimBlock.y);
    add_matrix<<<dimGrid, dimBlock>>>(ad, bd, cd, N);
    cudaMemcpy(c, cd, size, cudaMemcpyDeviceToHost);
    // Free memory
    cudaFree(ad);
    cudaFree(bd);
    cudaFree(cd);
    delete []a;
    delete []b;
    delete []c;
    return 0;
}
2.2 2D matrix (element count does not exceed the total thread count)-----input is a 2D array
2.2.1 Modified kernel
__global__ void MatAdd(float A[N][N], float B[N][N], float C[N][N])
{
    int i = blockDim.x * blockIdx.x + threadIdx.x; // column
    int j = blockDim.y * blockIdx.y + threadIdx.y; // row
    if (i < N && j < N)
        C[i][j] = A[i][j] + B[i][j];
}
2.2.2 Host code
The key change: each grid dimension is computed by adding dimBlock-1 before dividing by dimBlock, so the grid rounds up and covers every element even when N is not a multiple of the block size.
int main() {
    dim3 dimBlock(16, 16);
    dim3 dimGrid((N + dimBlock.x - 1) / dimBlock.x,
                 (N + dimBlock.y - 1) / dimBlock.y);
    MatAdd<<<dimGrid, dimBlock>>>(A, B, C);
}
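The snippet above leaves out how A, B, and C reach the device. A minimal sketch under the assumption that they are statically sized N x N arrays: the pointer-to-array type float (*)[N] matches what the float A[N][N] parameter decays to. The names d_A, d_B, d_C, h_A, h_B, h_C and the fill values are placeholders, not part of the original, and dimGrid/dimBlock are taken from the snippet above.

float (*d_A)[N], (*d_B)[N], (*d_C)[N];
size_t bytes = N * N * sizeof(float);
cudaMalloc((void**)&d_A, bytes);
cudaMalloc((void**)&d_B, bytes);
cudaMalloc((void**)&d_C, bytes);
static float h_A[N][N], h_B[N][N], h_C[N][N];   // static so the large arrays are not on the stack
for (int r = 0; r < N; r++)
    for (int col = 0; col < N; col++) { h_A[r][col] = 1.0f; h_B[r][col] = 2.0f; }
cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice);
MatAdd<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost);
cudaFree(d_A); cudaFree(d_B); cudaFree(d_C);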
3. Reduction
3.1 Reduction in global memory
3.1.1 Reduction by a single thread
Kernel
__global__ void sum(int *num, int* result)
{
    int sum = 0;
    // a single thread walks the entire array (note: this accumulates num[i] * num[i], i.e. a sum of squares)
    for (int i = 0; i < DATA_SIZE; i++)
    {
        sum += num[i] * num[i];
    }
    *result = sum;
}
Host code
#define DATA_SIZE (4096*256)
int main()
{
    int *data;
    data = new int[DATA_SIZE];
    //data = (int*)malloc(DATA_SIZE*sizeof(int));
    // Fill the input with sample values (the original listing left data uninitialized)
    for (int i = 0; i < DATA_SIZE; i++) data[i] = 1;
    int *gpudata, *result;
    cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
    cudaMalloc((void**)&result, sizeof(int));
    cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
    // Launch the kernel and copy the result back to the host
    sum<<<1, 1>>>(gpudata, result);
    int h_sum;
    cudaMemcpy(&h_sum, result, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(gpudata);
    cudaFree(result);
    delete[] data;
    printf("sum: %d ", h_sum);
    return 0;
}
3.2 Grouped reduction
Split the 4096*256 numbers into 256 groups of 4096 numbers each, then use 256 threads, one thread per group.
Kernel
#define DATA_SIZE (4096*256)
#define THREAD_NUM 256
__global__ void sum(int *num, int* result)
{
    const int tid = threadIdx.x;
    // 256 threads, each handling a contiguous group of DATA_SIZE / THREAD_NUM elements
    const int size = DATA_SIZE / THREAD_NUM;
    int sum = 0;
    for (int i = tid * size; i < (tid + 1) * size; i++)
    {
        sum += num[i];
    }
    result[tid] = sum;
}
The host code becomes:
int *gpudata, *result;
cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE);
cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM);
cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
sum<<<1, THREAD_NUM>>>(gpudata, result);
int h_sum[THREAD_NUM];
cudaMemcpy(h_sum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
int final_sum = 0;
for (int i = 0; i < THREAD_NUM; i++)
{
    final_sum += h_sum[i];
}
printf("sum: %d ", final_sum);
cudaFree(gpudata);
cudaFree(result);
3.3 Coalesced reduction
Use 256 threads over the whole data set: thread 0 reads the first number, thread 1 reads the second, and so on, with each thread then striding forward by the thread count. Consecutive threads read consecutive addresses, so the global-memory accesses are coalesced.
#define DATA_SIZE (4096*256)
#define THREAD_NUM 256
__global__ void sum(int *num, int* result)
{
    const int tid = threadIdx.x;
    // 256 threads; consecutive threads read consecutive elements, then stride by THREAD_NUM
    int sum = 0;
    for (int i = tid; i < DATA_SIZE; i += THREAD_NUM)
    {
        sum += num[i];
    }
    result[tid] = sum;
}
3.4 Reduction in shared memory
3.4.1 Final per-block sum by a single thread
Threads within a block can share memory and synchronize with each other. We can exploit this by letting all threads in a block add up their partial results. The kernel becomes:
__global__ void sum(int *num, int* result)
{
    __shared__ int shared[256];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int cacheIndex = threadIdx.x;
    shared[cacheIndex] = 0;
    // each thread accumulates a grid-stride partial sum into shared memory
    for (int i = tid; i < DATA_SIZE; i += BLOCK_NUM * THREAD_NUM)
        shared[cacheIndex] += num[i];
    __syncthreads();
    // thread 0 of each block sums the block's partial results
    if (cacheIndex == 0)
    {
        for (int i = 1; i < THREAD_NUM; i++)
        {
            shared[0] += shared[i];
        }
        result[blockIdx.x] = shared[0];
    }
}
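A minimal host-side sketch of launching this kernel, assuming BLOCK_NUM blocks of THREAD_NUM threads and a result buffer holding one partial sum per block; the value of BLOCK_NUM, the names d_data and d_partial, and the final CPU loop are assumptions rather than part of the original.

#define BLOCK_NUM 32   // assumed block count
int *d_data, *d_partial;
cudaMalloc((void**)&d_data, sizeof(int) * DATA_SIZE);
cudaMalloc((void**)&d_partial, sizeof(int) * BLOCK_NUM);
cudaMemcpy(d_data, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice);
sum<<<BLOCK_NUM, THREAD_NUM>>>(d_data, d_partial);   // one partial sum per block
int h_partial[BLOCK_NUM];
cudaMemcpy(h_partial, d_partial, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
int final_sum = 0;
for (int i = 0; i < BLOCK_NUM; i++) final_sum += h_partial[i];   // finish the reduction on the CPU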
3.4.2 Parallel reduction: interleaved addressing (even-indexed accesses)
Having only thread 0 of each block perform the final summation is not the most efficient approach; adding the 256 per-thread values can itself be parallelized.
__global__ void reduce(int* g_idata, int* g_odata)
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];          // each thread loads one element into shared memory
    __syncthreads();
    // tree reduction: at step s, every 2s-th slot accumulates the element s positions away
    for (unsigned int s = 1; s < blockDim.x; s *= 2)
    {
        int index = 2 * s * tid;
        if (index < blockDim.x)
            sdata[index] += sdata[index + s];
        __syncthreads();
    }
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];   // one partial sum per block
}
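Because sdata is declared extern __shared__, the launch must pass the shared-memory size as the third parameter of the execution configuration. A minimal launch sketch, where threadsPerBlock, numBlocks, d_in, and d_out are assumed names:

int threadsPerBlock = 256;
int numBlocks = DATA_SIZE / threadsPerBlock;   // assumes DATA_SIZE is a multiple of 256
reduce<<<numBlocks, threadsPerBlock, threadsPerBlock * sizeof(int)>>>(d_in, d_out);
// d_out now holds numBlocks partial sums, one per block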
3.4.3 Parallel reduction: sequential addressing (contiguous reads)
__global__ void reduce(int* g_idata, int* g_odata)
{
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    sdata[tid] = g_idata[i];
    __syncthreads();
    // tree reduction: the first s threads each add the element s positions away
    for (unsigned int s = blockDim.x / 2; s > 0; s /= 2)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    if (tid == 0)
        g_odata[blockIdx.x] = sdata[0];
}
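Each launch of reduce shrinks the problem from one element per thread to one partial sum per block, so on a large input the kernel is typically called repeatedly until few enough values remain to finish on the CPU. A hedged host-side sketch; the buffer names d_in and d_out, the buffer swap, and the CPU tail are assumptions (with DATA_SIZE = 4096*256, n stays a multiple of 256 on every pass, which this kernel requires since it has no bounds check):

int n = DATA_SIZE;
const int threads = 256;
while (n >= threads)                               // keep reducing on the GPU while blocks can be filled completely
{
    int blocks = n / threads;
    reduce<<<blocks, threads, threads * sizeof(int)>>>(d_in, d_out);
    int *tmp = d_in; d_in = d_out; d_out = tmp;    // the output of this pass is the input of the next
    n = blocks;
}
int h_partial[256];                                // at most threads-1 values remain
cudaMemcpy(h_partial, d_in, n * sizeof(int), cudaMemcpyDeviceToHost);
int total = 0;
for (int i = 0; i < n; i++) total += h_partial[i]; // finish the last few sums on the CPU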
4. Parallel Dot Product
4.1 Kernel
#define N 4096
#define THREAD_NUM 256
#define BLOCK_NUM 16
__global__ void dot(int *a, int *b, int *c)
{
    __shared__ int shared[256];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int cacheIndex = threadIdx.x;
    int temp = 0;
    // grid-stride loop: each thread accumulates its share of the products
    while (tid < N)
    {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }
    shared[cacheIndex] = temp;
    __syncthreads();
    // tree reduction within the block (assumes blockDim.x is a power of two)
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
            shared[cacheIndex] += shared[cacheIndex + i];
        __syncthreads();
        i = i / 2;
    }
    if (cacheIndex == 0) c[blockIdx.x] = shared[0];   // one partial dot product per block
}
4.2 Host code
int main() {
    int *a, *b, *partial_c;
    int *dev_a, *dev_b, *dev_partial_c;
    a = (int *)malloc(N * sizeof(int));
    b = (int *)malloc(N * sizeof(int));
    partial_c = (int *)malloc(BLOCK_NUM * sizeof(int));
    cudaMalloc((void**)&dev_a, sizeof(int) * N);
    cudaMalloc((void**)&dev_b, sizeof(int) * N);
    cudaMalloc((void**)&dev_partial_c, sizeof(int) * BLOCK_NUM);
    for (int i = 0; i < N; i++) {
        a[i] = i;
        b[i] = i * 2;
    }
    cudaMemcpy(dev_a, a, sizeof(int) * N, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, sizeof(int) * N, cudaMemcpyHostToDevice);
    dot<<<BLOCK_NUM, THREAD_NUM>>>(dev_a, dev_b, dev_partial_c);
    cudaMemcpy(partial_c, dev_partial_c, sizeof(int) * BLOCK_NUM, cudaMemcpyDeviceToHost);
    int final_sum = 0;
    for (int i = 0; i < BLOCK_NUM; i++) final_sum += partial_c[i];
    printf("dot product: %d\n", final_sum);
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_partial_c);
    free(a);
    free(b);
    free(partial_c);
    return 0;
}
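Since a[i] = i and b[i] = 2*i, the GPU result is easy to cross-check against a CPU loop computed with the same int arithmetic. The check below is an added sketch, not part of the original, and would go right before the free/cudaFree calls. (Note that with N = 4096 the exact dot product, 2 times the sum of i^2, exceeds the range of a 32-bit int; in practice both accumulators wrap the same way, but a production version would accumulate in long long.)

int cpu_sum = 0;
for (int i = 0; i < N; i++) cpu_sum += a[i] * b[i];   // CPU reference with identical int arithmetic
printf("GPU: %d, CPU reference: %d\n", final_sum, cpu_sum);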
5. Parallel Matrix Transpose
5.1 Kernel
__global__ void transpose(float *odata, float *idata, int width, int height) {
    __shared__ float block[BLOCK_DIM][BLOCK_DIM];   // statically allocated shared-memory tile
    // Read a tile of the matrix into shared memory with coalesced accesses
    int i = blockIdx.x * BLOCK_DIM + threadIdx.x;
    int j = blockIdx.y * BLOCK_DIM + threadIdx.y;
    if ((i < width) && (j < height)) {              // guard against out-of-bounds accesses
        block[threadIdx.y][threadIdx.x] = idata[i + j * width];  // i increases with threadIdx.x,
                                                                 // so the reads are coalesced
    }
    __syncthreads();
    // Write the transposed tile back to global memory with coalesced accesses
    i = blockIdx.y * BLOCK_DIM + threadIdx.x;
    j = blockIdx.x * BLOCK_DIM + threadIdx.y;
    if ((i < height) && (j < width)) {
        odata[i + j * height] = block[threadIdx.x][threadIdx.y];
        // i increases with threadIdx.x, so the writes are coalesced
    }
}
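A minimal host-side launch sketch for this kernel; BLOCK_DIM = 16 and the names d_in, d_out, width, and height are assumptions, not part of the original. Padding the shared-memory tile by one column is a common follow-up optimization to avoid bank conflicts on the block[threadIdx.x][threadIdx.y] reads; it is shown only as a commented-out tweak.

#define BLOCK_DIM 16   // assumed tile size
dim3 dimBlock(BLOCK_DIM, BLOCK_DIM);
dim3 dimGrid((width + BLOCK_DIM - 1) / BLOCK_DIM, (height + BLOCK_DIM - 1) / BLOCK_DIM);
transpose<<<dimGrid, dimBlock>>>(d_out, d_in, width, height);
// Optional tweak inside the kernel to avoid shared-memory bank conflicts:
// __shared__ float block[BLOCK_DIM][BLOCK_DIM + 1];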