#include<iostream>
#include<time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
#define threadSize 100
#define arrSize 100
// Forward declarations for the host drivers and the device kernel.
void addWithCPU(float (*array)[arrSize]);
void addWithGPU(float (*array)[arrSize]);
__global__ void addKernel(float (*p)[arrSize]);
// There are threadSize groups, each holding arrSize numbers; every group is
// processed with a double loop over its own elements.
// On the GPU, threadSize threads are launched and each thread handles one group.
// Total time complexity: O(threadSize * arrSize^2).
int main()
{
    // Host-side input for the CPU reference computation.
    float array1[threadSize][arrSize];
    // Host-side input that will be processed on the GPU.
    float array2[threadSize][arrSize];
    // Initialize both arrays with identical data so the results are comparable.
    for (int i = 0; i < threadSize; i++)
    {
        for (int j = 0; j < arrSize; j++)
        {
            array1[i][j] = array2[i][j] = 1;
        }
    }
    // clock() returns clock_t, not time_t; storing the tick count in time_t
    // could truncate or misinterpret it on some platforms.
    clock_t startCPU, endCPU, startGPU, endGPU;
    startCPU = clock();
    addWithCPU(array1);
    endCPU = clock();
    startGPU = clock();
    addWithGPU(array2);
    endGPU = clock();
    // Durations are in clock ticks (divide by CLOCKS_PER_SEC for seconds).
    cout << "Time used for CPU is: " << endCPU - startCPU << endl;
    cout << "Time used for GPU is: " << endGPU - startGPU << endl;
    // Return 0 on success; the original returned 1, which conventionally
    // signals failure to the calling environment.
    return 0;
}
// CPU reference: for every group, repeatedly fold the row's values into each
// element. The accumulation is in-place, so updates made earlier in a row feed
// into later iterations — the iteration order is significant and is preserved.
void addWithCPU(float (*array)[arrSize])
{
    for (int group = 0; group < threadSize; group++)
    {
        float *row = array[group];
        for (int dst = 0; dst < arrSize; dst++)
        {
            for (int src = 0; src < arrSize; src++)
            {
                row[dst] += row[src];
            }
        }
    }
}
// GPU kernel: one thread per group. Thread i performs the same in-place
// double loop over its row as the CPU version, so results match exactly.
// Expected launch: a 1-D configuration supplying at least threadSize threads
// in total; excess threads exit at the bounds guard.
__global__ void addKernel(float (*p)[arrSize])
{
    // Flat global index so the kernel is correct for multi-block launches too,
    // not only for a single block (blockIdx.x == 0 reproduces the old index).
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= threadSize)
        return; // guard: the grid rarely divides the data exactly
    for (int j = 0; j < arrSize; j++)
    {
        for (int k = 0; k < arrSize; k++)
        {
            p[i][j] += p[i][k];
        }
    }
}
// GPU computation: copy the data to the device, run one thread per group,
// and copy the result back. The caller's array is updated in place.
// On any CUDA error the array is left unchanged (or partially updated after a
// failed device-to-host copy) and a message is printed to stderr.
void addWithGPU(float (*array)[arrSize])
{
    float (*p)[arrSize] = nullptr;
    const size_t bytes = (size_t)threadSize * arrSize * sizeof(float);
    cudaError_t err;

    // Check every runtime call: CUDA errors are sticky, and an unchecked
    // failure here surfaces later as a confusing unrelated error.
    err = cudaSetDevice(0);
    if (err != cudaSuccess)
    {
        cerr << "cudaSetDevice failed: " << cudaGetErrorString(err) << endl;
        return;
    }
    err = cudaMalloc((void**)&p, bytes);
    if (err != cudaSuccess)
    {
        cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
        return;
    }
    err = cudaMemcpy(p, array, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
    {
        // Bug fix: launch threadSize threads (one per group), not arrSize.
        // The original <<<1, arrSize>>> only behaved correctly when the two
        // constants happened to be equal; otherwise it silently processed the
        // wrong number of groups. Note: a single block caps threadSize at the
        // device's max threads per block (typically 1024).
        addKernel<<<1, threadSize>>>(p);
        err = cudaGetLastError();          // catches bad launch configuration
    }
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();     // catches asynchronous kernel faults
    if (err == cudaSuccess)
        err = cudaMemcpy(array, p, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        cerr << "addWithGPU failed: " << cudaGetErrorString(err) << endl;

    cudaFree(p); // the original never freed the device allocation (leak)
}
-
threadSize = 1 , arrSize = 1
-
threadSize = 1 , arrSize = 1000
-
threadSize = 10 , arrSize = 1000
-
threadSize = 100 , arrSize = 100
-
threadSize = 100 , arrSize = 500
-
threadSize = 100 , arrSize = 1000
-
threadSize = 1000 , arrSize = 100
-
threadSize = 1000 , arrSize = 500
-
threadSize = 1000 , arrSize = 1000
-
threadSize = 10000 , arrSize = 100
通过上述时间比对,可以看出GPU的基本处理时间大约在2.8秒左右,这包括设备启动时间,设备关闭时间。随着数据量的增加,CPU和GPU的处理时间差逐渐缩小,可以看出GPU的多线程在处理海量数据时存在优势。