一个不错的关于CPU和GPU（CUDA）的性能比较讨论话题

最新推荐文章于 2023-01-06 16:01:11 发布

OpenHero

最新推荐文章于 2023-01-06 16:01:11 发布

阅读量6k

点赞数

分类专栏： CUDA 语言软件工程 C/C++ 文章标签： cuda timer input 测试 function 优化

本文链接：https://blog.csdn.net/OpenHero/article/details/3170147

版权

CUDA 同时被 3 个专栏收录

83 篇文章 4 订阅

订阅专栏

语言

38 篇文章 0 订阅

订阅专栏

C/C++

38 篇文章 0 订阅

订阅专栏

http://topic.csdn.net/u/20081027/23/67ff3857-3c71-4d5c-acf6-095f3497c7a9.html
这里是今天的一个论坛的一个帖子，大家可以讨论一下：）
1.哪些程序适合用CPU来做，哪些适合用GPU来做
2.如果用GPU来做，需要注意哪些东西
3.如果需要优化，需要哪些思路：）

在lz的代码的基础上做了一些变化，大家可以自己测试一下，就知道哪些工作适合用CPU做，哪些适合用GPU来做。
这里面的LOOP_ADD_TIME 从1->10->100->1000->10000....
大家可以做一个测试，看看最后的效果是怎么样的，可以画一个曲线图出来：）

过一段时间，还可以把这个代码再修改一下，添加更多的内容进去，再看看两者的效果怎么样：）

C/C++ code

   
   
    
    
    
    
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#include <cutil.h>
    
    


    
    // Simple utility function to check for CUDA runtime errors.
// Declaration only: no definition appears in this listing.
void checkCUDAError(const char* msg);

// Number of additions applied to every array element by both the GPU kernel
// and the CPU loops (the values 1, 2, ..., LOOP_ADD_TIME are added in turn).
// Guarded so the value can be swept from the build command line, e.g.
// -DLOOP_ADD_TIME=1000, without editing the source.
#ifndef LOOP_ADD_TIME
#define LOOP_ADD_TIME 100
#endif
    
    


    
    // Part 2 of 2: implement the kernel.
//
// Each thread owns the single element d_a[blockIdx.x * blockDim.x + threadIdx.x]
// and adds 1, 2, ..., LOOP_ADD_TIME to it, one addition per loop iteration.
//
// There is no bounds check: the launch must not create more threads than
// d_a has elements.
__global__ void reverseArrayBlock(int* d_a)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;

    int step = 1;
    while (step <= LOOP_ADD_TIME)
    {
        d_a[idx] += step;
        ++step;
    }
}


    
    // Runs the add-loop benchmark on the GPU and checks it against the CPU.
//
// Steps:
//   1. Fill a host array with 0, 1, 2, ...
//   2. Time, with clock(), the host->device copy, the reverseArrayBlock
//      launch and the device->host copy, and print the elapsed milliseconds.
//   3. Recompute the same result on the host and compare element by element.
//
// Returns 0 on success, 1 if host allocation, the kernel launch or the
// verification fails.
int gpu_test()
{
    // 512 * 21056 = 10,780,672 ints (about 41 MB per buffer).
    const int dimA = 512 * 21056;

    // define grid and block size
    const int numThreadsPerBlock = 512;

    // Part 1 of 2: compute number of blocks needed based on array size and
    // desired block size. Rounded up so every element gets a thread even when
    // dimA is not a multiple of the block size.
    const int numBlocks = (dimA + numThreadsPerBlock - 1) / numThreadsPerBlock;
    printf("blocks: %d\n", numBlocks);

    // reverseArrayBlock has no bounds check, so every buffer is sized for the
    // whole grid (numBlocks * numThreadsPerBlock elements), not just dimA.
    const size_t paddedCount = (size_t) numBlocks * numThreadsPerBlock;
    const size_t memSize = paddedCount * sizeof(int);

    // allocate host memory: h_a goes through the GPU, h_a2 is the CPU reference
    int *h_a = (int *) malloc(memSize);
    int *h_a2 = (int *) malloc(memSize);
    if (h_a == NULL || h_a2 == NULL)
    {
        fprintf(stderr, "gpu_test: failed to allocate %lu bytes of host memory\n",
                (unsigned long) memSize);
        free(h_a2);
        free(h_a);
        return 1;
    }

    // allocate device memory
    int *d_a = NULL;
    CUDA_SAFE_CALL(cudaMalloc((void **) &d_a, memSize));

    // Initialize both input arrays on the host with identical contents.
    for (size_t i = 0; i < paddedCount; ++i)
    {
        h_a[i] = (int) i;
        h_a2[i] = (int) i;
    }

    int status = 0;

    // ---- timed region: copy in, compute, copy out ----
    const clock_t start = clock();

    // Copy host array to device array
    CUDA_SAFE_CALL(cudaMemcpy(d_a, h_a, memSize, cudaMemcpyHostToDevice));

    // launch kernel
    dim3 dimGrid(numBlocks);
    dim3 dimBlock(numThreadsPerBlock);
    reverseArrayBlock<<<dimGrid, dimBlock>>>(d_a);

    // A kernel launch does not return an error code itself; a bad launch
    // configuration is only reported through cudaGetLastError().
    const cudaError_t launchStatus = cudaGetLastError();
    if (launchStatus != cudaSuccess)
    {
        fprintf(stderr, "gpu_test: kernel launch failed: %s\n",
                cudaGetErrorString(launchStatus));
        status = 1;
    }
    else
    {
        // device to host copy
        CUDA_SAFE_CALL(cudaMemcpy(h_a, d_a, memSize, cudaMemcpyDeviceToHost));

        const clock_t finish = clock();
        const double duration = (double) (finish - start) * 1000 / CLOCKS_PER_SEC;
        printf("gpu time is %f ms\n", duration);

        // CPU reference: the same additions the kernel performs.
        for (int j = 0; j < dimA; ++j)
        {
            for (int k = 1; k <= LOOP_ADD_TIME; k++)
            {
                h_a2[j] += k;
            }
        }

        // Count mismatches and report once instead of printing one line per
        // differing element (there can be millions).
        int mismatches = 0;
        for (int j = 0; j < dimA; ++j)
        {
            if (h_a[j] != h_a2[j])
            {
                ++mismatches;
            }
        }
        if (mismatches != 0)
        {
            printf("error! %d of %d elements differ\n", mismatches, dimA);
            status = 1;
        }
    }

    // free host memory
    free(h_a2);
    free(h_a);

    // free device memory
    CUDA_SAFE_CALL(cudaFree(d_a));

    return status;
}


    
    // CPU-only version of the benchmark.
//
// Fills a host array with 0, 1, 2, ..., then times (with clock()) a loop that
// adds 1, 2, ..., LOOP_ADD_TIME to every element, and prints the elapsed
// milliseconds.
//
// Returns 0 on success, 1 if the host buffer cannot be allocated.
int cpu_test()
{
    // 512 * 21056 = 10,780,672 ints (about 41 MB).
    const int dimA = 512 * 21056;

    // allocate host memory; the size is derived from dimA so the two cannot
    // drift apart.
    const size_t memSize = (size_t) dimA * sizeof(int);
    int *h_a = (int *) malloc(memSize);
    if (h_a == NULL)
    {
        fprintf(stderr, "cpu_test: failed to allocate %lu bytes of host memory\n",
                (unsigned long) memSize);
        return 1;
    }

    // Initialize input array on host
    for (int i = 0; i < dimA; ++i)
    {
        h_a[i] = i;
    }
    printf("\n");

    // ---- timed region ----
    const clock_t start = clock();
    for (int j = 0; j < dimA; ++j)
    {
        for (int k = 1; k <= LOOP_ADD_TIME; k++)
        {
            h_a[j] += k;
        }
    }
    const clock_t finish = clock();

    const double duration = (double) (finish - start) * 1000 / CLOCKS_PER_SEC;
    printf("cpu time is %f ms\n", duration);

    // Read every result once and park the sum in a volatile. Without a reader,
    // an optimizing compiler may discard the stores to a buffer that is freed
    // unread, which would make the timing above meaningless. Unsigned
    // arithmetic is used so the sum may wrap without undefined behaviour.
    unsigned int checksum = 0;
    for (int j = 0; j < dimA; ++j)
    {
        checksum += (unsigned int) h_a[j];
    }
    volatile unsigned int sink = checksum;
    (void) sink;

    // free host memory
    free(h_a);

    return 0;
}

    
    // Program main
//
// Runs the GPU benchmark followed by the CPU benchmark. Start-up and shutdown
// go through the CUT_DEVICE_INIT / CUT_EXIT macros; their definitions are not
// part of this listing (presumably from cutil.h - confirm against the SDK
// version in use).
int main(int argc, char** argv)
{
    CUT_DEVICE_INIT(argc, argv);

    // The int results of both benchmarks are intentionally discarded here.
    (void) gpu_test();
    (void) cpu_test();

    CUT_EXIT(argc, argv);

    // Same value main() yields implicitly when control falls off the end.
    return 0;
}

OpenHero

关注

0
点赞
踩
3

收藏

觉得还不错? 一键收藏
5
评论
一个不错的关于CPU和GPU（CUDA）的性能比较讨论话题

http://topic.csdn.net/u/20081027/23/67ff3857-3c71-4d5c-acf6-095f3497c7a9.html这里是今天的一个论坛的一个帖子，大家可以讨论一下：）1.那些程序适合用cpu来做，那些适合用gpu来做2.如果用gpu来做，需要注意那些东西3.如果需要优化，需要那些思路：）在lz的代码的基础上做了一些变化，大家可以自己测试一下，就知道那些工作适
复制链接

扫一扫

专栏目录