#include<iostream>
#include<time.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace std;
#define threadSize 100
#define arrSize 100
// Forward declarations for the host drivers and the device kernel.
void addWithCPU(float (*array)[arrSize]);
void addWithGPU(float (*array)[arrSize]);
__global__ void addKernel(float (*p)[arrSize]);
// There are threadSize groups, each holding arrSize numbers; every group is
// processed with a double loop over its own elements.
// On the GPU, threadSize threads are launched and each thread handles one group.
// Total time complexity: O(threadSize * arrSize^2).
int main()
{
    // Host-side input for the CPU reference computation.
    float array1[threadSize][arrSize];
    // Host-side input that will be processed on the GPU.
    float array2[threadSize][arrSize];
    // Initialize both arrays with identical data so the results are comparable.
    for (int i = 0; i < threadSize; i++)
    {
        for (int j = 0; j < arrSize; j++)
        {
            array1[i][j] = array2[i][j] = 1;
        }
    }
    // clock() returns clock_t, not time_t; storing the tick count in time_t
    // could truncate or misinterpret it on some platforms.
    clock_t startCPU, endCPU, startGPU, endGPU;
    startCPU = clock();
    addWithCPU(array1);
    endCPU = clock();
    startGPU = clock();
    addWithGPU(array2);
    endGPU = clock();
    // Durations are in clock ticks (divide by CLOCKS_PER_SEC for seconds).
    cout << "Time used for CPU is: " << endCPU - startCPU << endl;
    cout << "Time used for GPU is: " << endGPU - startGPU << endl;
    // Return 0 on success; the original returned 1, which conventionally
    // signals failure to the calling environment.
    return 0;
}
// CPU reference: for every group, repeatedly fold the row's values into each
// element. The accumulation is in-place, so updates made earlier in a row feed
// into later iterations — the iteration order is significant and is preserved.
void addWithCPU(float (*array)[arrSize])
{
    for (int group = 0; group < threadSize; group++)
    {
        float *row = array[group];
        for (int dst = 0; dst < arrSize; dst++)
        {
            for (int src = 0; src < arrSize; src++)
            {
                row[dst] += row[src];
            }
        }
    }
}
// GPU kernel: one thread per group. Thread i performs the same in-place
// double loop over its row as the CPU version, so results match exactly.
// Expected launch: a 1-D configuration supplying at least threadSize threads
// in total; excess threads exit at the bounds guard.
__global__ void addKernel(float (*p)[arrSize])
{
    // Flat global index so the kernel is correct for multi-block launches too,
    // not only for a single block (blockIdx.x == 0 reproduces the old index).
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= threadSize)
        return; // guard: the grid rarely divides the data exactly
    for (int j = 0; j < arrSize; j++)
    {
        for (int k = 0; k < arrSize; k++)
        {
            p[i][j] += p[i][k];
        }
    }
}
// GPU computation: copy the data to the device, run one thread per group,
// and copy the result back. The caller's array is updated in place.
// On any CUDA error the array is left unchanged (or partially updated after a
// failed device-to-host copy) and a message is printed to stderr.
void addWithGPU(float (*array)[arrSize])
{
    float (*p)[arrSize] = nullptr;
    const size_t bytes = (size_t)threadSize * arrSize * sizeof(float);
    cudaError_t err;

    // Check every runtime call: CUDA errors are sticky, and an unchecked
    // failure here surfaces later as a confusing unrelated error.
    err = cudaSetDevice(0);
    if (err != cudaSuccess)
    {
        cerr << "cudaSetDevice failed: " << cudaGetErrorString(err) << endl;
        return;
    }
    err = cudaMalloc((void**)&p, bytes);
    if (err != cudaSuccess)
    {
        cerr << "cudaMalloc failed: " << cudaGetErrorString(err) << endl;
        return;
    }
    err = cudaMemcpy(p, array, bytes, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
    {
        // Bug fix: launch threadSize threads (one per group), not arrSize.
        // The original <<<1, arrSize>>> only behaved correctly when the two
        // constants happened to be equal; otherwise it silently processed the
        // wrong number of groups. Note: a single block caps threadSize at the
        // device's max threads per block (typically 1024).
        addKernel<<<1, threadSize>>>(p);
        err = cudaGetLastError();          // catches bad launch configuration
    }
    if (err == cudaSuccess)
        err = cudaDeviceSynchronize();     // catches asynchronous kernel faults
    if (err == cudaSuccess)
        err = cudaMemcpy(array, p, bytes, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess)
        cerr << "addWithGPU failed: " << cudaGetErrorString(err) << endl;

    cudaFree(p); // the original never freed the device allocation (leak)
}
-
threadSize = 1 , arrSize = 1
-
threadSize = 1 , arrSize = 1000
-
threadSize = 10 , arrSize = 1000
-
threadSize = 100 , arrSize = 100
-
threadSize = 100 , arrSize = 500
-
threadSize = 100 , arrSize = 1000
-
threadSize = 1000 , arrSize = 100
-
threadSize = 1000 , arrSize = 500
-
threadSize = 1000 , arrSize = 1000
-
threadSize = 10000 , arrSize = 100
通过上述时间比对,可以看出GPU的基本处理时间大约在2.8秒左右,这包括设备启动时间,设备关闭时间。随着数据量的增加,CPU和GPU的处理时间差逐渐缩小,可以看出GPU的多线程在处理海量数据时存在优势。