仅作个人记录
参考:CUDA编程(八)树状加法_MingChao_Sun-CSDN博客
顺便说一句,这位博主的cuda系列写的很清晰,关于环境配置,也建议参考这位博主,简单直接就行。
过程中碰到问题,还是建议查看官方API文档
CUDA Runtime API :: CUDA Toolkit Documentation
上一篇介绍了Shared Memory和Thread同步,最后利用这些知识完成了block内部线程结果的加和,减轻了CPU的负担,但是block的加和工作是使用thread0单线程完成的,这点有待改进。
这个单线程的加法部分如何解决?cuda程序只有并行才能发挥其优势,这个加法能不能并行呢?答案是可行的,可以利用树状加法的方式将加法并行。
具体实现可以看原博客:CUDA编程(八)树状加法_MingChao_Sun-CSDN博客
相比上一版的程序,只用改动核函数里面的加和部分就OK了,下面是改好的核函数:
// __global__ kernel (runs on the GPU): computes per-block sums of cubes.
// NOTE(review): the name says "Squares" but every element is cubed
// (num[i]^3); the name is kept because callers reference it.
// Launch expectations: BLOCK_NUM blocks x THREAD_NUM threads, with
// THREAD_NUM * sizeof(int) bytes of dynamic shared memory (3rd <<<>>> arg).
// time[] must have 2*BLOCK_NUM slots: [0, BLOCK_NUM) holds per-block start
// stamps, [BLOCK_NUM, 2*BLOCK_NUM) holds per-block end stamps.
__global__ void sum_Squares(int *num, int *result, clock_t *time){
extern __shared__ int shared[];// dynamic shared memory: one partial sum per thread
const int thread_id = threadIdx.x;// thread index within the block (0-based)
const int block_id = blockIdx.x;// block index within the grid (0-based)
shared[thread_id] = 0;// shared memory is uninitialized; clear before accumulating
// Record the block's start time exactly once (thread 0 only).
if (thread_id == 0) time[block_id] = clock();
// Strided accumulation: each thread folds its slice of num[] into its own shared slot.
for (int i =block_id*THREAD_NUM + thread_id; i < DATA_SIZE; i += BLOCK_NUM*THREAD_NUM) shared[thread_id] += num[i] * num[i] * num[i];
__syncthreads();// every partial sum must be visible before the reduction starts
// Tree reduction over shared[]: each pass, threads whose low bits (per mask)
// are zero add in the neighbor `offset` slots away; active set halves each pass.
int offset = 1, mask = 1;
while (offset < THREAD_NUM)
{
if ((thread_id & mask) == 0) shared[thread_id] += shared[thread_id + offset];
offset += offset;// offset doubles: 1, 2, 4, ...
mask = offset + mask;// mask accumulates low bits: 1, 3, 7, ...
__syncthreads();// barrier is outside the if, so ALL threads reach it each pass
}
// Thread 0 publishes the block total and the end timestamp.
if (thread_id == 0){
result[block_id] = shared[0];
time[block_id + BLOCK_NUM] = clock();
}
}
总体代码:
#include <iostream>
#include <stdlib.h>
#include <time.h> //用于计时
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
using namespace std;
//32 个 block,每个 block 有 256个 thread,共有 32*256= 8192个thread
#define BLOCK_NUM 32
#define THREAD_NUM 256
#define DATA_SIZE 1048576
int data[DATA_SIZE];
//产生随机数
// Fill data[0..size) with pseudo-random digits in the range [0, 9].
void generateNum(int *data, int size){
    int idx = 0;
    while (idx < size) {
        data[idx] = rand() % 10;
        ++idx;
    }
}
void printDeviceProp(const cudaDeviceProp &prop){
cout << "Device Name: " << prop.name << endl;
cout << "totalGlobalMem: " << prop.totalGlobalMem << endl;
cout << "sharedMemPerBlock: " << prop.sharedMemPerBlock << endl;
cout << "regsPerBlock: " << prop.regsPerBlock << endl;
cout << "warpSize: " << prop.warpSize << endl;
cout << "memPitch: " << prop.memPitch << endl;
cout << "maxThreadsPerBlock:" << prop.maxThreadsPerBlock << endl;
cout << "maxThreadsDim[0 - 2]: " << prop.maxThreadsDim[0] << " " << prop.maxThreadsDim[1] << " " << prop.maxThreadsDim[2] << endl;
cout << "maxGridSize[0 - 2]: " << prop.maxGridSize[0] << " " << prop.maxGridSize[1] << " " << prop.maxGridSize[2] << endl;
cout << "totalConstMem:" << prop.totalConstMem << endl;
cout << "major.minor:" << prop.major << " " << prop.minor << endl;
cout << "clockRate:" << prop.clockRate << endl;
cout << "textureAlignment:" << prop.textureAlignment << endl;
cout << "deviceOverlap:" << prop.deviceOverlap << endl;
cout << "multiProcessorCount:" << prop.multiProcessorCount << endl;
}
// CUDA initialization: find the first GPU whose properties can be queried,
// print its properties, and make it the current device.
// Returns false when no CUDA-capable GPU exists or none can be activated.
bool InitCuda(){
    int count = 0;
    cudaGetDeviceCount(&count);// number of CUDA-capable GPUs (IDs start at 0)
    if (count == 0) return false;// no CUDA-capable GPU present
    int device = -1;// -1 = no usable device found yet
    for (int i = 0; i < count; ++i){
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess){
            printDeviceProp(prop);
            device = i;// first queryable device is good enough
            break;
        }
    }
    // BUG FIX: the original left `device == count` when the loop found nothing,
    // then called cudaSetDevice with that out-of-range ID and returned true.
    if (device < 0) return false;
    // Also verify activation succeeds instead of ignoring the status.
    return cudaSetDevice(device) == cudaSuccess;
}
// __global__ kernel (runs on the GPU): computes per-block sums of cubes.
// NOTE(review): the name says "Squares" but every element is cubed
// (num[i]^3); the name is kept because callers reference it.
// Launch expectations: BLOCK_NUM blocks x THREAD_NUM threads, with
// THREAD_NUM * sizeof(int) bytes of dynamic shared memory (3rd <<<>>> arg).
// time[] must have 2*BLOCK_NUM slots: [0, BLOCK_NUM) holds per-block start
// stamps, [BLOCK_NUM, 2*BLOCK_NUM) holds per-block end stamps.
__global__ void sum_Squares(int *num, int *result, clock_t *time){
extern __shared__ int shared[];// dynamic shared memory: one partial sum per thread
const int thread_id = threadIdx.x;// thread index within the block (0-based)
const int block_id = blockIdx.x;// block index within the grid (0-based)
shared[thread_id] = 0;// shared memory is uninitialized; clear before accumulating
// Record the block's start time exactly once (thread 0 only).
if (thread_id == 0) time[block_id] = clock();
// Strided accumulation: each thread folds its slice of num[] into its own shared slot.
for (int i =block_id*THREAD_NUM + thread_id; i < DATA_SIZE; i += BLOCK_NUM*THREAD_NUM) shared[thread_id] += num[i] * num[i] * num[i];
__syncthreads();// every partial sum must be visible before the reduction starts
// Tree reduction over shared[]: each pass, threads whose low bits (per mask)
// are zero add in the neighbor `offset` slots away; active set halves each pass.
int offset = 1, mask = 1;
while (offset < THREAD_NUM)
{
if ((thread_id & mask) == 0) shared[thread_id] += shared[thread_id + offset];
offset += offset;// offset doubles: 1, 2, 4, ...
mask = offset + mask;// mask accumulates low bits: 1, 3, 7, ...
__syncthreads();// barrier is outside the if, so ALL threads reach it each pass
}
// Thread 0 publishes the block total and the end timestamp.
if (thread_id == 0){
result[block_id] = shared[0];
time[block_id + BLOCK_NUM] = clock();
}
}
// Entry point: sums the cubes of DATA_SIZE random digits on the GPU
// (per-block partial sums reduced with the tree-add kernel), then verifies
// the result with a serial CPU pass.
int main(){
if (!InitCuda()) return 0;
generateNum(data, DATA_SIZE);// fill host input with random digits
int *gpudata, *result;
clock_t *time;
// Device buffers: input array, one partial sum per block, and
// 2*BLOCK_NUM clock stamps (a start and an end stamp per block).
cudaMalloc((void **)&gpudata, sizeof(int)*DATA_SIZE);// first arg is a pointer to the pointer
cudaMalloc((void **)&result, sizeof(int)*BLOCK_NUM);
cudaMalloc((void **)&time, sizeof(clock_t)*BLOCK_NUM*2);
// Copy the input from host to device.
cudaMemcpy(gpudata, data, sizeof(int)*DATA_SIZE, cudaMemcpyHostToDevice);
// Launch syntax: kernel<<<blocks, threads per block, shared memory bytes>>>(args...).
// Total threads = BLOCK_NUM * THREAD_NUM = 8192 (the old comment claimed 512).
sum_Squares <<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(int)>>>(gpudata, result, time);
// Kernel launches return no status; surface launch-configuration errors here.
cudaError_t launch_err = cudaGetLastError();
if (launch_err != cudaSuccess){
cout << "kernel launch failed: " << cudaGetErrorString(launch_err) << endl;
return 1;
}
// Copy results back to the host (blocking memcpy also waits for the kernel).
int sum[BLOCK_NUM];
clock_t time_cost[BLOCK_NUM*2];
// BUG FIX: the original passed &sum / &time_cost (type int(*)[N]) instead of
// the decayed array pointers; it worked only because the addresses coincide.
cudaMemcpy(sum, result, sizeof(int)*BLOCK_NUM, cudaMemcpyDeviceToHost);
cudaMemcpy(time_cost, time, sizeof(clock_t)*BLOCK_NUM * 2, cudaMemcpyDeviceToHost);
// Release device memory.
cudaFree(gpudata);
cudaFree(result);
cudaFree(time);
// Final accumulation of the BLOCK_NUM per-block sums on the CPU.
int all_sum = 0;
for (int i = 0; i < BLOCK_NUM; ++i) all_sum += sum[i];
// Timing strategy: total time = latest block end stamp - earliest block start stamp.
clock_t min_start = time_cost[0], max_end = time_cost[BLOCK_NUM];
for (int i = 1; i < BLOCK_NUM; i++) {
if (min_start > time_cost[i])
min_start = time_cost[i];
if (max_end < time_cost[i + BLOCK_NUM])
max_end = time_cost[i + BLOCK_NUM];
}
cout << "GPU_sum: " << all_sum << " time cost: " << max_end - min_start << endl;
// CPU reference computation to validate the GPU result.
all_sum = 0;
for (int i = 0; i < DATA_SIZE; ++i) all_sum += data[i] * data[i] * data[i];
cout << "CPU_sum: " << all_sum << endl;
return 0;
}
运行结果:
可以看到确实比只使用每个block的thread0进行加和要快