Cuda实战-09 多线程多卡运行

最新推荐文章于 2023-11-01 21:49:19 发布

梦星辰.

最新推荐文章于 2023-11-01 21:49:19 发布

阅读量651

点赞数 2

分类专栏：Cuda编程技术 | 文章标签：c++ 开发语言

本文链接：https://blog.csdn.net/weixin_45792450/article/details/129690328

版权

Cuda编程技术专栏收录该内容

24 篇文章 4 订阅

订阅专栏

该代码示例展示了如何在Linux系统中，通过多线程技术将任务分配到多个CPU线程，每个线程再在对应的GPU上启用一个核函数进行计算。程序首先获取CPU和GPU的数量，然后创建线程，每个线程负责一部分数据的处理，使用cudaSetDevice根据线程ID选择GPU，并执行加法操作的核函数。最后，主线程等待所有线程完成并检查结果的正确性。

摘要由CSDN通过智能技术生成

说明

本样例为每个CPU核心启动一个线程，每个线程各执行一次核函数；各线程按“线程编号 % GPU数量”的方式轮流分配到n个GPU上运行，整体采用多线程（pthread）技术实现。

代码


#include <cuda_runtime.h>
#include <pthread.h> //多线程
#include <unistd.h>
#include <stdio.h>  // stdio functions are used since C++ streams aren't necessarily thread safe
#include <pthread.h>
#define address unsigned long long int  // integer type used to carry pointer values as integers and cast them back
int* finish_code;   // per-thread completion flags, indexed by CPU thread id; 0 means that thread has finished
// Returns the number of processors configured on this host (Linux / POSIX).
//
// The value is queried with sysconf(_SC_NPROCESSORS_CONF) from <unistd.h>.
// sysconf returns a long and reports failure as -1; because callers use the
// result as a thread count and as a divisor, anything below 1 is reported
// as 1 so the program always has at least one worker.
int Get_CPU_Number_Linux() {
    const long configured = sysconf(_SC_NPROCESSORS_CONF);
    return configured < 1 ? 1 : (int)configured;
}

// Returns the number of CUDA devices visible to this process (any OS).
//
// Uses cudaGetDeviceCount from <cuda_runtime.h>. If the runtime call fails
// (for example no driver is installed) the error is printed to stderr and 0
// is returned, so callers can treat "no usable GPU" uniformly.
int Get_GPU_Number(){
    int num = 0;
    const cudaError_t status = cudaGetDeviceCount(&num);
    if (status != cudaSuccess) {
        fprintf(stderr, "cudaGetDeviceCount failed: %s\n",
                cudaGetErrorString(status));
        return 0;
    }
    return num;
}


// Kernel: adds the constant b to one element of g_a per GPU thread.
//
// Each thread derives a flat index from its 1D block and thread coordinates
// and updates exactly that element. There is no bounds check, so g_a must
// hold at least gridDim.x * blockDim.x ints.
__global__ void kernelAddConstant(int *g_a, int b) {
    const int element = threadIdx.x + blockDim.x * blockIdx.x;
    g_a[element] = g_a[element] + b;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool execCudaOk(cudaError_t status, const char* what, int cpu_thread_id) {
    if (status == cudaSuccess) return true;
    fprintf(stderr, "CPU thread %d: %s failed: %s\n", cpu_thread_id, what,
            cudaGetErrorString(status));
    return false;
}

// Thread entry point: processes this CPU thread's slice of the host array on
// one GPU.
//
// args points to an array of five `address` values, each holding a pointer
// stored as an integer:
//   addr[0] -> int  cpu_thread_id   (index of this CPU thread)
//   addr[1] -> int  n               (total number of array elements)
//   addr[2] -> int  nbytes          (size of the whole array; not needed here,
//                                    byte counts are derived from the slice)
//   addr[3] -> int  b               (constant added to every element)
//   addr[4] -> int* data            (start of the host array)
//
// The slice is [id * n / T, (id + 1) * n / T) with T = number of CPU threads,
// so the slices of all threads tile the array exactly even when n is not a
// multiple of T. Always returns NULL; finish_code[cpu_thread_id] is set to 0
// before returning, on success and on failure alike. CUDA failures are
// reported on stderr and leave the slice unmodified or partially processed.
void* exec(void* args){
    // kernelAddConstant is defined earlier in this file, so it is visible
    // here without a block-scope redeclaration.
    address* addr = (address*)args;
    const int cpu_thread_id = *(int*)(addr[0]);
    const int num_cpu_threads = Get_CPU_Number_Linux();
    const int n = *(int*)(addr[1]);
    const int b = *(int*)(addr[3]);
    int* data = (int*)(addr[4]);

    const int num_gpus = Get_GPU_Number();
    if (num_cpu_threads < 1 || num_gpus < 1) {
        // Both values are used as divisors below.
        fprintf(stderr, "CPU thread %d: no usable CPU/GPU count (%d CPUs, %d GPUs)\n",
                cpu_thread_id, num_cpu_threads, num_gpus);
        finish_code[cpu_thread_id] = 0;
        return NULL;
    }

    // Select the device for this CPU thread; "% num_gpus" allows more CPU
    // threads than GPU devices.
    int gpu_id = -1;
    bool ok = execCudaOk(cudaSetDevice(cpu_thread_id % num_gpus),
                         "cudaSetDevice", cpu_thread_id);
    ok = ok && execCudaOk(cudaGetDevice(&gpu_id), "cudaGetDevice", cpu_thread_id);
    printf("CPU thread %d (of %d) uses CUDA device %d\n", cpu_thread_id,num_cpu_threads, gpu_id);

    // Slice boundaries, computed in 64-bit so id * n cannot overflow.
    const long long first = (long long)cpu_thread_id * n / num_cpu_threads;
    const long long last = (long long)(cpu_thread_id + 1) * n / num_cpu_threads;
    const size_t count = (size_t)(last - first);
    int* sub_a = data + first;  // this CPU thread's portion of data

    if (ok && count > 0) {
        const unsigned int threads_per_block = 128;
        // Round up so every element of the slice gets a GPU thread.
        const size_t blocks = (count + threads_per_block - 1) / threads_per_block;
        // The kernel has no bounds check, so the device buffer is padded to a
        // whole number of blocks; the padding is zeroed and never copied back.
        const size_t padded_bytes = blocks * threads_per_block * sizeof(int);
        const size_t slice_bytes = count * sizeof(int);
        dim3 gpu_threads(threads_per_block);
        dim3 gpu_blocks((unsigned int)blocks);

        int* d_a = NULL;  // device memory associated with this CPU thread
        ok = execCudaOk(cudaMalloc((void**)&d_a, padded_bytes),
                        "cudaMalloc", cpu_thread_id);
        ok = ok && execCudaOk(cudaMemset(d_a, 0, padded_bytes),
                              "cudaMemset", cpu_thread_id);
        ok = ok && execCudaOk(cudaMemcpy(d_a, sub_a, slice_bytes, cudaMemcpyHostToDevice),
                              "cudaMemcpy (host to device)", cpu_thread_id);
        if (ok) {
            kernelAddConstant<<<gpu_blocks, gpu_threads>>>(d_a, b);
            // Launch-configuration errors are only visible through cudaGetLastError.
            ok = execCudaOk(cudaGetLastError(), "kernelAddConstant launch", cpu_thread_id);
        }
        // The blocking copy also surfaces errors raised while the kernel ran.
        ok = ok && execCudaOk(cudaMemcpy(sub_a, d_a, slice_bytes, cudaMemcpyDeviceToHost),
                              "cudaMemcpy (device to host)", cpu_thread_id);
        if (d_a != NULL) cudaFree(d_a);
    }

    finish_code[cpu_thread_id] = 0;
    printf("thread %d finish!\n", cpu_thread_id);
    return NULL;
}

// Verifies that data[i] == i + b for every i in [0, n).
// Returns 1 when all n elements match and 0 at the first mismatch;
// an empty range (n <= 0) counts as correct.
int correctResult(int *data, const int n, const int b) {
    int pos = 0;
    while (pos < n) {
        const int expected = pos + b;
        if (data[pos] != expected) {
            return 0;
        }
        ++pos;
    }
    return 1;
}

// Program entry point.
//
// Queries the CPU and GPU counts, fills a host array with data[i] = i, starts
// one pthread per CPU (each running exec on its slice of the array), waits for
// all of them, and finally checks that every element equals i + b.
// Returns 0 when the check succeeds and 1 on any setup error or failed check.
int main() {
    const int num_cpus = Get_CPU_Number_Linux();
    const int num_gpus = Get_GPU_Number();
    // Nothing to do without a GPU.
    if (num_gpus < 1) {
        printf("no CUDA capable devices were detected!\n");
        return 1;
    }
    // The CPU count is used as the thread count, so it must be positive.
    if (num_cpus < 1) {
        printf("couldn't determine the number of host CPUs\n");
        return 1;
    }
    printf("number of host CPUs:\t%d\n", num_cpus);
    printf("number of device GPUs:\t%d\n", num_gpus);
    // List the name of every GPU device.
    for (int i = 0; i < num_gpus; i++) {
        cudaDeviceProp dprop;
        const cudaError_t status = cudaGetDeviceProperties(&dprop, i);
        if (status != cudaSuccess) {
            fprintf(stderr, "cudaGetDeviceProperties(%d) failed: %s\n", i,
                    cudaGetErrorString(status));
            continue;
        }
        printf("   %d: %s\n", i, dprop.name);
    }
    printf("---------------------------\n");

    // Host data: n ints initialised to their own index.
    int n = num_gpus * 8192;
    int nbytes = n * sizeof(int);
    int b = 3;   // value added to every array element
    int *data = (int *)malloc(nbytes);
    if (0 == data) {
        printf("couldn't allocate CPU memory\n");
        return 1;
    }
    for (int i = 0; i < n; i++) data[i] = i;

    printf("---------------------------\n");
    const int thread_num = num_cpus;
    const int args_per_thread = 5;  // id, n, nbytes, b, data (see exec)
    int* idx = (int*) malloc(thread_num*sizeof(int));
    finish_code = (int*) malloc(thread_num*sizeof(int));
    pthread_t* threads = (pthread_t*) malloc(thread_num*sizeof(pthread_t));
    address* thread_args =
        (address*) malloc((size_t)thread_num*args_per_thread*sizeof(address));
    if (0 == idx || 0 == finish_code || 0 == threads || 0 == thread_args) {
        printf("couldn't allocate CPU memory\n");
        free(thread_args);
        free(threads);
        free(finish_code);
        finish_code = 0;
        free(idx);
        free(data);
        return 1;
    }

    // Start every worker first and join afterwards, so the threads actually
    // run concurrently instead of one after another.
    int started = 0;
    for (int i=0;i<thread_num;i++){
        idx[i] = i;
        finish_code[i] = 1;
        address* addr = thread_args + (size_t)i*args_per_thread;
        addr[0] = (address)(&(idx[i]));
        addr[1] = (address)(&(n));
        addr[2] = (address)(&(nbytes));
        addr[3] = (address)(&(b));
        addr[4] = (address)(data);
        const int rc = pthread_create(&threads[i],NULL,exec,addr);
        if (rc != 0) {
            printf("couldn't create thread %d (error %d)\n", i, rc);
            break;
        }
        started++;
    }
    // pthread_join blocks until each worker has returned, so no polling of
    // finish_code is required.
    for (int i=0;i<started;i++){
        pthread_join(threads[i], NULL);
    }

    // The result can only be correct when every slice had a worker.
    bool bResult = (started == thread_num) && correctResult(data, n, b);

    // Release host resources; all workers have finished at this point.
    free(thread_args);
    free(threads);
    free(finish_code);
    finish_code = 0;
    free(idx);
    free(data);

    if (bResult) printf("SUCCESS!\n");
    if (!bResult) printf("FAIL!\n");
    return bResult ? 0 : 1;
}