转自:https://blog.csdn.net/sunmc1204953974/article/details/51025801
多线程加速
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#include <time.h>
#define DATA_SIZE 1048576// = 1024*1024(1M)
int data[DATA_SIZE];
int clockRate;
#define THREAD_NUM 256
// Print the main properties of a CUDA device and cache its clock rate
// (in kHz) into the global `clockRate` for later cycle->seconds conversion.
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    // These fields are size_t; printing them with %d is undefined behavior
    // on 64-bit platforms, so use %zu.
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize); // threads per warp
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate); // clock frequency in kHz
    clockRate = prop.clockRate; // side effect: remembered for timing in main()
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount); // number of SMs on the device
}
// Fill `number[0..size)` with pseudo-random digits in [0, 9].
void GenerateNumbers(int *number, int size) {
    int idx = 0;
    while (idx < size) {
        number[idx] = rand() % 10;
        ++idx;
    }
}
// Select the first CUDA device with compute capability >= 1.0 and make it
// current. Prints the properties of each device successfully queried.
// Returns false when no usable device is found.
bool init_cuda() {
    int count = 0;
    // Check the API result: on failure `count` would be garbage.
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop; // CUDA device properties
        // Query once and verify success before reading the struct
        // (the original queried twice and used the first, unchecked result).
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            printDeviceProp(prop);
            // prop.major / prop.minor hold the compute capability,
            // e.g. 6.5 -> major 6, minor 5; accept anything >= 1.x.
            if (prop.major >= 1)
                break;
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device support CUDA 1.x\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}
/*
在 CUDA 中,在函数前面加上__global__ 表示这个函式是要在显示芯片上执行的,
所以我们只要在正常函数之前加上一个__global__就行了
【注意】显示芯片上执行的程序,不能有返回值!
*/
/*
 * Kernel: each thread cubes and accumulates a contiguous chunk of
 * DATA_SIZE / THREAD_NUM elements of `num`, writing its partial sum to
 * result[tid]. Thread 0 additionally records the elapsed device clock
 * cycles into *time. Expected launch: one block of THREAD_NUM threads.
 *
 * NOTE: adjacent threads read addresses a full chunk apart, so global
 * memory accesses are NOT coalesced — this is the slow "before" version
 * the article later improves with an interleaved access pattern.
 * A __global__ function must return void (no return value).
 */
__global__ static void sumofSquares(int *num, int *result, clock_t *time) {
    const int tid = threadIdx.x;               // this thread's index (from 0)
    const int chunk = DATA_SIZE / THREAD_NUM;  // elements handled per thread
    clock_t start;
    if (tid == 0)
        start = clock();
    int partial = 0;
    const int begin = tid * chunk;
    const int end = begin + chunk;
    for (int i = begin; i < end; ++i) {
        const int v = num[i];
        partial += v * v * v;
    }
    result[tid] = partial;
    if (tid == 0)
        *time = clock() - start; // elapsed device cycles; divide by
                                 // clockRate (kHz) * 1000 for seconds
}
// Host driver: generates DATA_SIZE random digits, sums their cubes on the
// GPU (one block, THREAD_NUM threads, per-thread contiguous chunks),
// reduces the partial sums on the host, and cross-checks against a CPU sum.
int main() {
    if (!init_cuda()) {
        fprintf(stderr, "Cuda init failed!!");
        return 0;
    }
    printf("CUDA initialized.\n");
    // Generate random input on the host.
    GenerateNumbers(data, DATA_SIZE);
    int *gpudata, *result;
    clock_t *time;
    // Device buffers: input data, one partial sum per thread, and one
    // clock_t for the kernel's cycle count. Check every allocation —
    // a failed cudaMalloc leaves the pointer unusable.
    if (cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE) != cudaSuccess ||
        cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM) != cudaSuccess ||
        cudaMalloc((void**)&time, sizeof(clock_t)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }
    // Copy host `data` into device `gpudata` (dst, src order).
    if (cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice) != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy H2D failed\n");
        return 1;
    }
    // Launch syntax: kernel<<<blocks, threads, shared-mem bytes>>>(args...).
    sumofSquares<<<1, THREAD_NUM, 0>>>(gpudata, result, time);
    // Kernel launches don't return errors directly; query them explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "kernel launch failed\n");
        return 1;
    }
    int sum[THREAD_NUM];
    clock_t time_used;
    // Blocking memcpys also synchronize with the kernel. `sum` already
    // decays to a pointer; no & needed on an array.
    cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    // Free device memory.
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    // Host-side reduction of the per-thread partial sums.
    int total = 0;
    for (int i = 0; i < THREAD_NUM; i++) {
        total += sum[i];
    }
    // clockRate is in kHz, so cycles / (clockRate * 1000) = seconds.
    // clock_t is typically `long`; cast before printing (was %d — UB).
    printf("GpuSum = %d 时钟周期个数 = %ld time = %lf \n", total,
           (long)time_used, (double)time_used / (clockRate * 1000.0));
    // Effective memory bandwidth: 1M ints * 4 B = 4 MB transferred;
    // bandwidth = 4 MB / time. E.g. in release mode,
    // time = 0.015351 s -> 4 MB / 0.015351 = 261 MB/s.
    int cpusum = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        cpusum += data[i] * data[i] * data[i];
    }
    printf("CpuSum = %d\n", cpusum);
    getchar();
    return 0;
}
Debug模式下结果:
Release模式下结果:
增加线程 -> 1024
Release模式下:
带宽没怎么提高,反而下降了
除了Thread还讲过两个概念,就是Grid和Block,当然另外还有共享内存,这些东西可不会没有他们存在的意义,我们进一步并行加速就要通过他们。另外之前也提到了很多优化步骤,每个步骤中都有大量的优化手段,所以我们仅仅用了线程并行这一个手段,显然不可能一蹴而就。
连续存取加速
转自:https://blog.csdn.net/sunmc1204953974/article/details/51064302
/*
 * Kernel (interleaved version): thread `tid` processes elements
 * tid, tid+THREAD_NUM, tid+2*THREAD_NUM, ... so that in each loop
 * iteration adjacent threads read adjacent addresses — coalesced
 * global-memory access. Writes the per-thread partial sum of cubes to
 * result[tid]; thread 0 records elapsed device cycles into *time.
 * Expected launch: one block of THREAD_NUM threads.
 */
__global__ static void sumofSquares(int *num, int *result, clock_t *time) {
    // This thread's index within the block (from 0).
    const int tid = threadIdx.x;
    // (removed unused local: per-thread chunk size is irrelevant here —
    // the stride pattern below covers the whole array.)
    int sum = 0;
    clock_t start;
    if (tid == 0)
        start = clock();
    for (int i = tid; i < DATA_SIZE; i = i + THREAD_NUM) {
        sum += num[i] * num[i] * num[i];
    }
    result[tid] = sum;
    if (tid == 0)
        *time = clock() - start; // elapsed device clock cycles;
                                 // divide by clockRate (kHz) * 1000 for seconds
}
完整程序:
#include <stdio.h>
#include <cuda_runtime.h>
#include <math.h>
#include <time.h>
#define DATA_SIZE 1048576// = 1024*1024(1M)
int data[DATA_SIZE];
int clockRate;
#define THREAD_NUM 1024
// Print the main properties of a CUDA device and cache its clock rate
// (in kHz) into the global `clockRate` for later cycle->seconds conversion.
void printDeviceProp(const cudaDeviceProp &prop)
{
    printf("Device Name : %s.\n", prop.name);
    // These fields are size_t; printing them with %d is undefined behavior
    // on 64-bit platforms, so use %zu.
    printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
    printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
    printf("regsPerBlock : %d.\n", prop.regsPerBlock);
    printf("warpSize : %d.\n", prop.warpSize); // threads per warp
    printf("memPitch : %zu.\n", prop.memPitch);
    printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
    printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
    printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
    printf("totalConstMem : %zu.\n", prop.totalConstMem);
    printf("major.minor : %d.%d.\n", prop.major, prop.minor);
    printf("clockRate : %d.\n", prop.clockRate); // clock frequency in kHz
    clockRate = prop.clockRate; // side effect: remembered for timing in main()
    printf("textureAlignment : %zu.\n", prop.textureAlignment);
    printf("deviceOverlap : %d.\n", prop.deviceOverlap);
    printf("multiProcessorCount : %d.\n", prop.multiProcessorCount); // number of SMs on the device
}
// Fill `number[0..size)` with pseudo-random digits in [0, 9].
void GenerateNumbers(int *number, int size) {
    for (int *p = number; p != number + size; ++p) {
        *p = rand() % 10;
    }
}
// Select the first CUDA device with compute capability >= 1.0 and make it
// current. Prints the properties of each device successfully queried.
// Returns false when no usable device is found.
bool init_cuda() {
    int count = 0;
    // Check the API result: on failure `count` would be garbage.
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }
    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop; // CUDA device properties
        // Query once and verify success before reading the struct
        // (the original queried twice and used the first, unchecked result).
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            printDeviceProp(prop);
            // prop.major / prop.minor hold the compute capability,
            // e.g. 6.5 -> major 6, minor 5; accept anything >= 1.x.
            if (prop.major >= 1)
                break;
        }
    }
    if (i == count) {
        fprintf(stderr, "There is no device support CUDA 1.x\n");
        return false;
    }
    cudaSetDevice(i);
    return true;
}
/*
在 CUDA 中,在函数前面加上__global__ 表示这个函式是要在显示芯片上执行的,
所以我们只要在正常函数之前加上一个__global__就行了
【注意】显示芯片上执行的程序,不能有返回值!
*/
/*
 * Kernel (interleaved version): thread `tid` processes elements
 * tid, tid+THREAD_NUM, tid+2*THREAD_NUM, ... so that in each loop
 * iteration adjacent threads read adjacent addresses — coalesced
 * global-memory access (the speedup this article measures). Writes the
 * per-thread partial sum of cubes to result[tid]; thread 0 records the
 * elapsed device cycles into *time. Expected launch: one block of
 * THREAD_NUM threads. A __global__ function must return void.
 */
__global__ static void sumofSquares(int *num, int *result, clock_t *time) {
    // This thread's index within the block (from 0).
    const int tid = threadIdx.x;
    // (removed unused local `size`: the stride pattern below covers the
    // whole array, so the per-thread chunk size is never needed.)
    int sum = 0;
    clock_t start;
    if (tid == 0)
        start = clock();
    for (int i = tid; i < DATA_SIZE; i = i + THREAD_NUM) {
        sum += num[i] * num[i] * num[i];
    }
    result[tid] = sum;
    if (tid == 0)
        *time = clock() - start; // elapsed device clock cycles;
                                 // divide by clockRate (kHz) * 1000 for seconds
}
// Host driver for the coalesced (interleaved) kernel: generates DATA_SIZE
// random digits, sums their cubes on the GPU (one block, THREAD_NUM
// threads), reduces the partial sums on the host, and times a CPU
// reference sum for comparison.
int main() {
    if (!init_cuda()) {
        fprintf(stderr, "Cuda init failed!!");
        return 0;
    }
    printf("CUDA initialized.\n");
    // Generate random input on the host.
    GenerateNumbers(data, DATA_SIZE);
    int *gpudata, *result;
    clock_t *time;
    // Device buffers: input data, one partial sum per thread, and one
    // clock_t for the kernel's cycle count. Check every allocation —
    // a failed cudaMalloc leaves the pointer unusable.
    if (cudaMalloc((void**)&gpudata, sizeof(int) * DATA_SIZE) != cudaSuccess ||
        cudaMalloc((void**)&result, sizeof(int) * THREAD_NUM) != cudaSuccess ||
        cudaMalloc((void**)&time, sizeof(clock_t)) != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }
    // Copy host `data` into device `gpudata` (dst, src order).
    if (cudaMemcpy(gpudata, data, sizeof(int) * DATA_SIZE, cudaMemcpyHostToDevice) != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy H2D failed\n");
        return 1;
    }
    // Launch syntax: kernel<<<blocks, threads, shared-mem bytes>>>(args...).
    sumofSquares<<<1, THREAD_NUM, 0>>>(gpudata, result, time);
    // Kernel launches don't return errors directly; query them explicitly.
    if (cudaGetLastError() != cudaSuccess) {
        fprintf(stderr, "kernel launch failed\n");
        return 1;
    }
    int sum[THREAD_NUM];
    clock_t time_used;
    // Blocking memcpys also synchronize with the kernel. `sum` already
    // decays to a pointer; no & needed on an array.
    cudaMemcpy(sum, result, sizeof(int) * THREAD_NUM, cudaMemcpyDeviceToHost);
    cudaMemcpy(&time_used, time, sizeof(clock_t), cudaMemcpyDeviceToHost);
    // Free device memory.
    cudaFree(gpudata);
    cudaFree(result);
    cudaFree(time);
    // Host-side reduction of the per-thread partial sums.
    // (removed unused local `size` — it was never read.)
    int total = 0;
    for (int i = 0; i < THREAD_NUM; i++) {
        total += sum[i];
    }
    // clockRate is in kHz, so cycles / (clockRate * 1000) = seconds.
    double gpuTime = (double)time_used / (clockRate * 1000.0);
    // clock_t is typically `long`; cast before printing (was %d — UB).
    printf("GpuSum = %d 线程数 = %d 时钟周期个数 = %ld time = %.10lf 带宽 = %lfMB/s\n",
           total, THREAD_NUM, (long)time_used, gpuTime, 4.0 / gpuTime);
    // Effective memory bandwidth: 1M ints * 4 B = 4 MB transferred;
    // bandwidth = 4 MB / time. E.g. in release mode,
    // time = 0.015351 s -> 4 MB / 0.015351 = 261 MB/s.
    int cpusum = 0;
    clock_t startcpu = clock();
    for (int i = 0; i < DATA_SIZE; i++) {
        cpusum += data[i] * data[i] * data[i];
    }
    clock_t cpu_time = clock() - startcpu;
    printf("CpuSum = %d cpu时钟周期个数 = %ld\n", cpusum, (long)cpu_time);
    getchar();
    return 0;
}
Release模式下,带宽明显提高!!