cuda多gpu编程11 多gpu进行运算

最新推荐文章于 2024-06-28 22:56:55 发布

雨浅听风吟

最新推荐文章于 2024-06-28 22:56:55 发布

阅读量1.2k

点赞数

分类专栏：计算机硬件

本文链接：https://blog.csdn.net/weixin_39289876/article/details/119839958

版权

计算机硬件专栏收录该内容

16 篇文章 4 订阅

订阅专栏

本文档展示了在使用四块GPU处理数据时，相比于单块GPU仅实现了一倍的速度提升。代码中包含了CUDA加速的解密操作，但并未达到预期的加速效果。程序包括CPU加密、GPU解密及结果验证过程，并详细记录了各步骤的时间消耗。

摘要由CSDN通过智能技术生成

在这里插入图片描述
使用四块 GPU 相比只用一块 GPU，整体也只快了大约一倍。
处理效果如下：所有拷贝和核函数都在默认流上执行，没有使用非默认流让数据拷贝与计算相互重叠。

在这里插入图片描述

修改之前的单 GPU 版本：

#include <cstdint>
#include <iostream>
#include "helpers.cuh"
#include "encryption.cuh"

/**
 * Fills `data` on the host so that slot i holds permute64(i, num_iters).
 *
 * @param data        host buffer with room for at least `num_entries` values
 * @param num_entries number of slots to fill
 * @param num_iters   iteration count forwarded to permute64
 * @param parallel    when true, the OpenMP `if` clause lets the loop run in
 *                    parallel; when false the loop runs serially
 */
void encrypt_cpu(uint64_t * data, uint64_t num_entries, 
                 uint64_t num_iters, bool parallel=true) {

    // The loop keeps the `<` test so it stays in OpenMP canonical form.
    #pragma omp parallel for if (parallel)
    for (uint64_t idx = 0; idx < num_entries; ++idx) {
        // The plaintext of slot idx is idx itself.
        const uint64_t encrypted = permute64(idx, num_iters);
        data[idx] = encrypted;
    }
}

/**
 * Decrypts `data` in place on the GPU.
 *
 * Every thread starts at its flat global x-index and advances by the total
 * number of threads launched along x, so any 1D launch configuration visits
 * each of the `num_entries` elements exactly once.
 *
 * @param data        device-accessible pointer to at least `num_entries`
 *                    values; each element is overwritten with
 *                    unpermute64(element, num_iters)
 * @param num_entries number of elements to process
 * @param num_iters   iteration count forwarded to unpermute64
 */
__global__ 
void decrypt_gpu(uint64_t * data, uint64_t num_entries, 
                 uint64_t num_iters) {

    // Widen to 64 bits BEFORE multiplying: blockIdx, blockDim and gridDim
    // are 32-bit, so the products would wrap for very large launches if the
    // conversion only happened on assignment.
    const uint64_t thrdID = static_cast<uint64_t>(blockIdx.x)*blockDim.x
                          + threadIdx.x;
    const uint64_t stride = static_cast<uint64_t>(blockDim.x)*gridDim.x;

    for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
        data[entry] = unpermute64(data[entry], num_iters);
}

/**
 * Verifies on the host that every slot of `data` holds its own index.
 *
 * @param data        host buffer with at least `num_entries` values
 * @param num_entries number of slots to inspect
 * @param parallel    when true, the OpenMP `if` clause lets the loop run in
 *                    parallel; when false the loop runs serially
 * @return true when data[i] == i for all i in [0, num_entries); an empty
 *         range yields true
 */
bool check_result_cpu(uint64_t * data, uint64_t num_entries,
                      bool parallel=true) {

    // Number of slots whose value equals their index.
    uint64_t matches = 0;

    #pragma omp parallel for reduction(+: matches) if (parallel)
    for (uint64_t idx = 0; idx < num_entries; ++idx) {
        if (data[idx] == idx)
            ++matches;
    }

    const bool all_match = (matches == num_entries);
    return all_match;
}

/**
 * Single-GPU driver: encrypts 2^26 values on the CPU, decrypts them on one
 * GPU, copies them back and verifies the result on the CPU.
 *
 * Each phase is bracketed by Timer::start/stop and followed by
 * check_last_error(); neither is defined in this listing (presumably they
 * come from helpers.cuh - confirm there how they time and report).
 */
int main (int argc, char * argv[]) {

    Timer phase_timer;
    Timer gpu_timer;

    const uint64_t num_entries = 1UL << 26;
    const uint64_t num_iters = 1UL << 10;
    const bool openmp = true;

    // Size in bytes of the whole data set; used for allocations and copies.
    const uint64_t num_bytes = sizeof(uint64_t)*num_entries;

    // --- allocate host (via cudaMallocHost) and device buffers -------------
    phase_timer.start();
    uint64_t * data_cpu;
    uint64_t * data_gpu;
    cudaMallocHost(&data_cpu, num_bytes);
    cudaMalloc    (&data_gpu, num_bytes);
    phase_timer.stop("allocate memory");
    check_last_error();

    // --- produce the encrypted input on the host ---------------------------
    phase_timer.start();
    encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
    phase_timer.stop("encrypt data on CPU");

    // --- host -> device; gpu_timer spans copy-in, kernel and copy-out ------
    gpu_timer.start();
    phase_timer.start();
    cudaMemcpy(data_gpu, data_cpu, num_bytes, cudaMemcpyHostToDevice);
    phase_timer.stop("copy data from CPU to GPU");
    check_last_error();

    // --- decrypt in place on the device (default stream) -------------------
    phase_timer.start();
    decrypt_gpu<<<80*32, 64>>>(data_gpu, num_entries, num_iters);
    phase_timer.stop("decrypt data on GPU");
    check_last_error();

    // --- device -> host ----------------------------------------------------
    phase_timer.start();
    cudaMemcpy(data_cpu, data_gpu, num_bytes, cudaMemcpyDeviceToHost);
    phase_timer.stop("copy data from GPU to CPU");
    gpu_timer.stop("total time on GPU");
    check_last_error();

    // --- verify that decryption restored the index sequence ----------------
    phase_timer.start();
    const bool success = check_result_cpu(data_cpu, num_entries, openmp);
    const char * verdict = success ? "passed" : "failed";
    std::cout << "STATUS: test " << verdict << std::endl;
    phase_timer.stop("checking result on CPU");

    // --- release buffers ---------------------------------------------------
    phase_timer.start();
    cudaFreeHost(data_cpu);
    cudaFree    (data_gpu);
    phase_timer.stop("free memory");
    check_last_error();
}

修改之后的多 GPU 版本（结果文件 mgpu_solution.cu）：

#include <cstdint>
#include <iostream>
#include "helpers.cuh"
#include "encryption.cuh"

/**
 * Fills `data` on the host so that slot i holds permute64(i, num_iters).
 *
 * @param data        host buffer with room for at least `num_entries` values
 * @param num_entries number of slots to fill
 * @param num_iters   iteration count forwarded to permute64
 * @param parallel    when true, the OpenMP `if` clause lets the loop run in
 *                    parallel; when false the loop runs serially
 */
void encrypt_cpu(uint64_t * data, uint64_t num_entries, 
                 uint64_t num_iters, bool parallel=true) {

    // The loop keeps the `<` test so it stays in OpenMP canonical form.
    #pragma omp parallel for if (parallel)
    for (uint64_t idx = 0; idx < num_entries; ++idx) {
        // The plaintext of slot idx is idx itself.
        const uint64_t encrypted = permute64(idx, num_iters);
        data[idx] = encrypted;
    }
}

/**
 * Decrypts `data` in place on the GPU.
 *
 * Every thread starts at its flat global x-index and advances by the total
 * number of threads launched along x, so any 1D launch configuration visits
 * each of the `num_entries` elements exactly once.
 *
 * @param data        device-accessible pointer to at least `num_entries`
 *                    values; each element is overwritten with
 *                    unpermute64(element, num_iters)
 * @param num_entries number of elements to process
 * @param num_iters   iteration count forwarded to unpermute64
 */
__global__ 
void decrypt_gpu(uint64_t * data, uint64_t num_entries, 
                 uint64_t num_iters) {

    // Widen to 64 bits BEFORE multiplying: blockIdx, blockDim and gridDim
    // are 32-bit, so the products would wrap for very large launches if the
    // conversion only happened on assignment.
    const uint64_t thrdID = static_cast<uint64_t>(blockIdx.x)*blockDim.x
                          + threadIdx.x;
    const uint64_t stride = static_cast<uint64_t>(blockDim.x)*gridDim.x;

    for (uint64_t entry = thrdID; entry < num_entries; entry += stride)
        data[entry] = unpermute64(data[entry], num_iters);
}

/**
 * Verifies on the host that every slot of `data` holds its own index.
 *
 * @param data        host buffer with at least `num_entries` values
 * @param num_entries number of slots to inspect
 * @param parallel    when true, the OpenMP `if` clause lets the loop run in
 *                    parallel; when false the loop runs serially
 * @return true when data[i] == i for all i in [0, num_entries); an empty
 *         range yields true
 */
bool check_result_cpu(uint64_t * data, uint64_t num_entries,
                      bool parallel=true) {

    // Number of slots whose value equals their index.
    uint64_t matches = 0;

    #pragma omp parallel for reduction(+: matches) if (parallel)
    for (uint64_t idx = 0; idx < num_entries; ++idx) {
        if (data[idx] == idx)
            ++matches;
    }

    const bool all_match = (matches == num_entries);
    return all_match;
}

/**
 * Multi-GPU driver: encrypts 2^26 values on the CPU, splits them into one
 * contiguous chunk per GPU, decrypts each chunk on its GPU, copies the
 * chunks back and verifies the result on the CPU.
 *
 * Up to `max_gpus` devices are used, capped by the number of devices the
 * runtime reports, so the program also runs on machines with fewer GPUs.
 *
 * Each phase is bracketed by Timer::start/stop and followed by
 * check_last_error(); neither is defined in this listing (presumably they
 * come from helpers.cuh - confirm there how they time and report).
 *
 * @return 1 when no usable CUDA device is found, otherwise 0.
 */
int main (int argc, char * argv[]) {

    Timer timer;
    Timer overall;

    const uint64_t num_entries = 1UL << 26;
    const uint64_t num_iters = 1UL << 10;
    const bool openmp = true;

    // Upper bound on the number of GPUs to use.
    const uint64_t max_gpus = 4;

    // Never address more devices than are actually present: cudaSetDevice
    // on a missing device fails and later calls would silently target
    // whichever device was current before.
    int device_count = 0;
    if (cudaGetDeviceCount(&device_count) != cudaSuccess || device_count < 1) {
        std::cerr << "ERROR: no usable CUDA device found" << std::endl;
        return 1;
    }
    const uint64_t num_gpus = min(max_gpus,
                                  static_cast<uint64_t>(device_count));

    // Get chunk size using round up division.
    const uint64_t chunk_size = sdiv(num_entries, num_gpus);

    // Compute every GPU's [offset, offset+width) slice once. The offset is
    // clamped to num_entries so that width can never underflow when the
    // rounded-up chunks overshoot the end of the data; such GPUs get
    // width == 0 and are skipped below.
    uint64_t offset[max_gpus] = {};
    uint64_t width [max_gpus] = {};
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        offset[gpu] = min(chunk_size*gpu, num_entries);
        width [gpu] = min(offset[gpu]+chunk_size, num_entries) - offset[gpu];
    }

    timer.start();
    // One device pointer per GPU; zero-initialised so unused slots are null.
    uint64_t * data_cpu = nullptr;
    uint64_t * data_gpu[max_gpus] = {};
    cudaMallocHost(&data_cpu, sizeof(uint64_t)*num_entries);

    // For each GPU with work: make it current and allocate its chunk.
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        if (width[gpu] == 0) continue;

        cudaSetDevice(static_cast<int>(gpu));
        cudaMalloc(&data_gpu[gpu], sizeof(uint64_t)*width[gpu]);
    }
    timer.stop("allocate memory");
    check_last_error();

    timer.start();
    encrypt_cpu(data_cpu, num_entries, num_iters, openmp);
    timer.stop("encrypt data on CPU");

    overall.start();
    timer.start();

    // For each GPU with work: copy its slice of the host data to it.
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        if (width[gpu] == 0) continue;

        cudaSetDevice(static_cast<int>(gpu));
        cudaMemcpy(data_gpu[gpu], data_cpu+offset[gpu],
                   sizeof(uint64_t)*width[gpu], cudaMemcpyHostToDevice);
    }
    timer.stop("copy data from CPU to GPU");
    check_last_error();

    timer.start();

    // For each GPU with work: decrypt its chunk in place. The launches are
    // issued back to back on each device's default stream.
    // NOTE(review): whether this timer captures kernel execution or only
    // launch overhead depends on how Timer is implemented - confirm.
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        if (width[gpu] == 0) continue;

        cudaSetDevice(static_cast<int>(gpu));
        decrypt_gpu<<<80*32, 64>>>(data_gpu[gpu], width[gpu], num_iters);
    }
    timer.stop("decrypt data on the GPU");
    check_last_error();

    timer.start();

    // For each GPU with work: copy its chunk back into the host buffer.
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        if (width[gpu] == 0) continue;

        cudaSetDevice(static_cast<int>(gpu));
        cudaMemcpy(data_cpu+offset[gpu], data_gpu[gpu],
                   sizeof(uint64_t)*width[gpu], cudaMemcpyDeviceToHost);
    }
    timer.stop("copy data from GPU to CPU");
    overall.stop("total time on GPU");
    check_last_error();

    timer.start();
    const bool success = check_result_cpu(data_cpu, num_entries, openmp);
    std::cout << "STATUS: test " 
              << ( success ? "passed" : "failed")
              << std::endl;
    timer.stop("checking result on CPU");

    timer.start();
    cudaFreeHost(data_cpu);
    // Free each chunk with the GPU that owns it made current.
    for (uint64_t gpu = 0; gpu < num_gpus; gpu++) {
        if (width[gpu] == 0) continue;

        cudaSetDevice(static_cast<int>(gpu));
        cudaFree(data_gpu[gpu]);
    }
    timer.stop("free memory");
    check_last_error();
}

雨浅听风吟

关注

0
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
cuda多gpu编程11 多gpu进行运算

四块gpu相比一块也只快了一倍处理效果如下，全部使用了默认流，没有进行cuda加速没改之前#include <cstdint>#include <iostream>#include "helpers.cuh"#include "encryption.cuh"void encrypt_cpu(uint64_t * data, uint64_t num_entries, uint64_t num_iters, bool parall.
复制链接

扫一扫

专栏目录