[High-Performance Computing] CUDA Programming: Performance Measurement and the Concept of Streams (Tutorial and Code, Part 3)

4 Using Advanced CUDA Concepts

4.1 Performance Measurement and Error Handling

  • Empirical results suggest that using roughly twice as many blocks as the GPU has streaming multiprocessors (SMs) tends to give the best performance; the optimal block and thread counts still depend on the specific algorithm (a device-query sketch follows the timing example below).
  • The number of threads per block is often set to the maximum threads per block reported in the device properties, but these numbers are only meant as a rough guideline.
  • Kernel execution commonly runs into one of three bottlenecks: it is compute bound, memory bound, or latency bound.
  • Profile on the actual GPU with a profiler. Whichever resource saturates first is the one to reduce (computation or memory accesses) while making more use of the other; the goal is balance, not blindly adding computation or cutting memory traffic.
  • Strided access is not as efficient as coalesced access, but it is still better than random access; arranging the program to use coalesced access can therefore help performance.
  • A streaming multiprocessor (SM) executes all threads of a warp with the same instruction (SIMT). If the threads reach a control-flow statement and take different branches, then at any moment some threads run while the rest stall (warp divergence). For example, in an if statement that tests the thread ID, odd- and even-numbered threads execute different code (a minimal divergence sketch also follows the timing example below).
  • malloc allocates standard, pageable memory on the CPU. The cudaHostAlloc API function allocates page-locked memory, also called pinned memory. The operating system guarantees it is never paged out to disk and always resides in physical memory, so every device in the system can access the buffer directly through its physical address. This lets the GPU copy data to and from the host via direct memory access (DMA) without CPU involvement. Pinned memory must be used carefully, though: because it cannot be paged out, allocating too much of it can exhaust physical memory and hurt other applications running on the same system.
  • On the GPU, task parallelism is achieved with CUDA streams.
  • Transfers issued into CUDA streams require page-locked memory, which is why the stream example below uses cudaHostAlloc instead of a regular malloc.
  • Work within a single stream executes in order; across streams, no ordering is guaranteed by default.
  • Measuring elapsed time with the CUDA event API:

#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//Defining number of elements in Array
#define N   50000
//Defining Kernel function for vector addition
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
    //Getting Thread index of current kernel
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N)
    {
        d_c[tid] = d_a[tid] + d_b[tid];
        tid += blockDim.x * gridDim.x;
    }
}
int main(void) {
    //Defining host arrays
    int h_a[N], h_b[N], h_c[N];
    //Defining device pointers
    int *d_a, *d_b, *d_c;
    cudaEvent_t e_start, e_stop;  // create CUDA events for timing
    cudaEventCreate(&e_start);
    cudaEventCreate(&e_stop);
    cudaEventRecord(e_start, 0);
    // allocate the memory
    cudaMalloc((void**)&d_a, N * sizeof(int));
    cudaMalloc((void**)&d_b, N * sizeof(int));
    cudaMalloc((void**)&d_c, N * sizeof(int));
    //Initializing Arrays
    for (int i = 0; i < N; i++) {
        h_a[i] = 2 * i*i;
        h_b[i] = i;
    }
    // Copy input arrays from host to device memory
    cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
    //Calling kernels passing device pointers as parameters
    gpuAdd<<<512, 512>>>(d_a, d_b, d_c);
    //Copy result back to host memory from device memory
    cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
    cudaEventRecord(e_stop, 0);
    cudaEventSynchronize(e_stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
    printf("Time to add %d numbers: %3.1f ms\n",N, elapsedTime);
    int Correct = 1;
    printf("Vector addition on GPU \n");
    //Printing result on console
    for (int i = 0; i < N; i++) {
        if ((h_a[i] + h_b[i] != h_c[i]))
        {
            Correct = 0;
        }
    }
    if (Correct == 1)
    {
        printf("GPU has computed Sum Correctly\n");
    }
    else
    {
        printf("There is an Error in GPU Computation\n");
    }
    //Free up memory
    cudaEventDestroy(e_start);
    cudaEventDestroy(e_stop);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}
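  • As a reference for the block and thread guidelines above, the SM count and the per-block thread limit can be read from the device properties. The following minimal sketch uses the standard cudaGetDeviceProperties call to print these values together with the rule-of-thumb launch configuration; the numbers are a starting point, not a guaranteed optimum.
#include <stdio.h>
#include <cuda_runtime.h>
int main(void) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);            // query device 0
    int blocks  = 2 * prop.multiProcessorCount;   // rule of thumb: 2 blocks per SM
    int threads = prop.maxThreadsPerBlock;        // hardware upper limit per block
    printf("SMs: %d, max threads per block: %d\n",
           prop.multiProcessorCount, prop.maxThreadsPerBlock);
    printf("Rule-of-thumb launch: <<<%d, %d>>> (tune for the actual kernel)\n",
           blocks, threads);
    return 0;
}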
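  • To make the warp-divergence point concrete, here is a minimal hypothetical kernel (divergentKernel, not part of the examples above) in which even- and odd-numbered threads take different branches; within one warp the SM serializes the two paths, masking off the inactive half each time, so both branch bodies contribute to the warp's execution time.
#include <stdio.h>
#include <cuda_runtime.h>
// Threads of the same warp take different branches, so the SM runs both paths one after the other.
__global__ void divergentKernel(float *out) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid % 2 == 0)
        out[tid] = sinf((float)tid);   // even-ID threads take this path
    else
        out[tid] = cosf((float)tid);   // odd-ID threads take this path
}
int main(void) {
    float *d_out;
    cudaMalloc((void**)&d_out, 64 * sizeof(float));
    divergentKernel<<<2, 32>>>(d_out);   // two warps of 32 threads each
    cudaDeviceSynchronize();
    cudaFree(d_out);
    return 0;
}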
  • Locating problems from the reported CUDA error, jumping (goto) to an error-handling label when a call fails.
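  • A minimal sketch of this pattern, assuming the same gpuAdd kernel as above: every CUDA runtime call returns a cudaError_t; check it, print cudaGetErrorString, and jump with goto to a cleanup label on failure so that already-allocated resources are still released. The CHECK macro below is only an illustrative helper, not a CUDA API.
#include <stdio.h>
#include <cuda_runtime.h>
//Defining number of elements in Array
#define N   50000
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N)
    {
        d_c[tid] = d_a[tid] + d_b[tid];
        tid += blockDim.x * gridDim.x;
    }
}
// Illustrative helper: check a CUDA call, report the error, and jump to cleanup on failure
#define CHECK(call)                                                         \
    do {                                                                    \
        cudaError_t status = (call);                                        \
        if (status != cudaSuccess) {                                        \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",                    \
                    __FILE__, __LINE__, cudaGetErrorString(status));        \
            goto Error;                                                     \
        }                                                                   \
    } while (0)
int main(void) {
    static int h_a[N], h_b[N], h_c[N];   // static: keeps the large arrays off the stack
    int *d_a = NULL, *d_b = NULL, *d_c = NULL;
    for (int i = 0; i < N; i++) {
        h_a[i] = 2 * i;
        h_b[i] = i;
    }
    CHECK(cudaMalloc((void**)&d_a, N * sizeof(int)));
    CHECK(cudaMalloc((void**)&d_b, N * sizeof(int)));
    CHECK(cudaMalloc((void**)&d_c, N * sizeof(int)));
    CHECK(cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice));
    CHECK(cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice));
    gpuAdd<<<512, 512>>>(d_a, d_b, d_c);
    CHECK(cudaGetLastError());        // catches kernel launch errors
    CHECK(cudaDeviceSynchronize());   // catches errors raised during kernel execution
    CHECK(cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost));
    printf("h_c[100] = %d (expected %d)\n", h_c[100], h_a[100] + h_b[100]);
Error:
    cudaFree(d_a);   // cudaFree(NULL) is a no-op, so cleanup is safe even after an early failure
    cudaFree(d_b);
    cudaFree(d_c);
    return 0;
}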
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//Defining number of elements in Array
#define N   50000
//Defining Kernel function for vector addition
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
    //Getting Thread index of current kernel
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N)
    {
        d_c[tid] = d_a[tid] + d_b[tid];
        tid += blockDim.x * gridDim.x;
    }
}
int main(void) {
    //Defining host arrays
    int *h_a, *h_b, *h_c;
    //Defining device pointers for stream 0
    int *d_a0, *d_b0, *d_c0;
    //Defining device pointers for stream 1
    int *d_a1, *d_b1, *d_c1;
    cudaStream_t stream0, stream1;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);
    cudaEvent_t e_start, e_stop;
    cudaEventCreate(&e_start);
    cudaEventCreate(&e_stop);
    cudaEventRecord(e_start, 0);
    cudaHostAlloc((void**)&h_a,N *2* sizeof(int),cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_b, N *2* sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_c, N *2*sizeof(int), cudaHostAllocDefault);
    // allocate the memory
    cudaMalloc((void**)&d_a0, N * sizeof(int));
    cudaMalloc((void**)&d_b0, N * sizeof(int));
    cudaMalloc((void**)&d_c0, N * sizeof(int));
    cudaMalloc((void**)&d_a1, N * sizeof(int));
    cudaMalloc((void**)&d_b1, N * sizeof(int));
    cudaMalloc((void**)&d_c1, N * sizeof(int));
    //Initializing Arrays
    for (int i = 0; i < N*2; i++) {
        h_a[i] = 2 * i*i;
        h_b[i] = i;
    }
    
    cudaMemcpyAsync(d_a0, h_a , N * sizeof(int), cudaMemcpyHostToDevice, stream0);
    cudaMemcpyAsync(d_a1, h_a+ N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(d_b0, h_b , N * sizeof(int), cudaMemcpyHostToDevice, stream0);
    cudaMemcpyAsync(d_b1, h_b + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
    //Calling kernels passing device pointers as parameters
    gpuAdd << <512, 512, 0, stream0 >> > (d_a0, d_b0, d_c0);
    gpuAdd << <512, 512, 0, stream1 >> > (d_a1, d_b1, d_c1);
    //Copy result back to host memory from device memory
    cudaMemcpyAsync(h_c , d_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
    cudaMemcpyAsync(h_c + N, d_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
    
    cudaDeviceSynchronize();
    cudaStreamSynchronize(stream0);
    cudaStreamSynchronize(stream1);
    cudaEventRecord(e_stop, 0);
    cudaEventSynchronize(e_stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
    printf("Time to add %d numbers: %3.1f ms\n",2* N, elapsedTime);
    int Correct = 1;
    printf("Vector addition on GPU \n");
    //Printing result on console
    for (int i = 0; i < 2*N; i++) {
        if ((h_a[i] + h_b[i] != h_c[i]))
        {
            Correct = 0;
        }
    }
    if (Correct == 1)
    {
        printf("GPU has computed Sum Correctly\n");
    }
    else
    {
        printf("There is an Error in GPU Computation\n");
    }
    //Free up memory
    cudaFree(d_a0);
    cudaFree(d_b0);
    cudaFree(d_c0);
    cudaFree(d_a0);
    cudaFree(d_b0);
    cudaFree(d_c0);
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    return 0;
}

4.2 Performance Improvements

  • Processing with multiple streams so that copies and kernels run in parallel.
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//Defining number of elements in Array
#define N   50000
//Defining Kernel function for vector addition
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
    //Getting Thread index of current kernel
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N)
    {
        d_c[tid] = d_a[tid] + d_b[tid];
        tid += blockDim.x * gridDim.x;
    }
}
int main(void) {
    //Defining host arrays
    int *h_a, *h_b, *h_c;
    //Defining device pointers for stream 0
    int *d_a0, *d_b0, *d_c0;
    //Defining device pointers for stream 1
    int *d_a1, *d_b1, *d_c1;
    cudaStream_t stream0, stream1;
    cudaStreamCreate(&stream0);
    cudaStreamCreate(&stream1);
    cudaEvent_t e_start, e_stop;
    cudaEventCreate(&e_start);
    cudaEventCreate(&e_stop);
    cudaEventRecord(e_start, 0);
    cudaHostAlloc((void**)&h_a, N * 2 * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_b, N * 2 * sizeof(int), cudaHostAllocDefault);
    cudaHostAlloc((void**)&h_c, N * 2 * sizeof(int), cudaHostAllocDefault);
    // allocate the memory
    cudaMalloc((void**)&d_a0, N * sizeof(int));
    cudaMalloc((void**)&d_b0, N * sizeof(int));
    cudaMalloc((void**)&d_c0, N * sizeof(int));
    cudaMalloc((void**)&d_a1, N * sizeof(int));
    cudaMalloc((void**)&d_b1, N * sizeof(int));
    cudaMalloc((void**)&d_c1, N * sizeof(int));
    //Initializing Arrays
    for (int i = 0; i < N*2; i++) {
        h_a[i] = 2 * i*i;
        h_b[i] = i;
    }
    
    cudaMemcpyAsync(d_a0, h_a, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
    cudaMemcpyAsync(d_a1, h_a + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
    cudaMemcpyAsync(d_b0, h_b, N * sizeof(int), cudaMemcpyHostToDevice, stream0);
    cudaMemcpyAsync(d_b1, h_b + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
    //Calling kernels passing device pointers as parameters
    gpuAdd<<<512, 512, 0, stream0>>>(d_a0, d_b0, d_c0);
    gpuAdd<<<512, 512, 0, stream1>>>(d_a1, d_b1, d_c1);
    //Copy result back to host memory from device memory
    cudaMemcpyAsync(h_c, d_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
    cudaMemcpyAsync(h_c + N, d_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream1);
    
    cudaDeviceSynchronize();
    cudaStreamSynchronize(stream0);
    cudaStreamSynchronize(stream1);
    cudaEventRecord(e_stop, 0);
    cudaEventSynchronize(e_stop);
    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
    printf("Time to add %d numbers: %3.1f ms\n",2* N, elapsedTime);
    int Correct = 1;
    printf("Vector addition on GPU \n");
    //Printing result on console
    for (int i = 0; i < 2*N; i++) {
        if ((h_a[i] + h_b[i] != h_c[i]))
        {
            Correct = 0;
        }
    }
    if (Correct == 1)
    {
        printf("GPU has computed Sum Correctly\n");
    }
    else
    {
        printf("There is an Error in GPU Computation\n");
    }
    //Free up memory
    cudaStreamDestroy(stream0);
    cudaStreamDestroy(stream1);
    cudaEventDestroy(e_start);
    cudaEventDestroy(e_stop);
    cudaFree(d_a0);
    cudaFree(d_b0);
    cudaFree(d_c0);
    cudaFree(d_a1);
    cudaFree(d_b1);
    cudaFree(d_c1);
    cudaFreeHost(h_a);
    cudaFreeHost(h_b);
    cudaFreeHost(h_c);
    return 0;
}
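  • For the copies and kernels in the two streams to actually overlap, the host buffers must be pinned (cudaHostAlloc), the transfers must be issued with cudaMemcpyAsync into different non-default streams, and the GPU needs at least one copy engine. Each device-to-host copy also has to be enqueued in the same stream as the kernel that produces its data; a copy placed in another stream is not ordered after that kernel and may read the result buffer too early.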
  • Using many threads to solve a sorting problem (enumeration sort).
  • The tid variable holds the current thread's index within its block.
  • The ttid variable holds the current thread's unique index across all blocks, i.e. its index within the whole grid.
  • Shared memory is used to reduce the time spent on direct global-memory accesses.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define arraySize 5
#define threadPerBlock 5
__global__ void addKernel(int *d_a, int *d_b)
{
    int count = 0;
    int tid = threadIdx.x;
    int ttid = blockIdx.x * threadPerBlock + tid;
    int val = d_a[ttid];
    __shared__ int cache[threadPerBlock];  // shared-memory tile: staged copy of d_a for fast repeated reads
    for (int i = tid; i < arraySize; i += threadPerBlock) {
        cache[tid] = d_a[i];
        __syncthreads();
        for (int j = 0; j < threadPerBlock; ++j)
            if (val > cache[j])
                count++;
        __syncthreads();
    }
    d_b[count] = val;
}
int main()
{
    
    int h_a[arraySize] = { 5, 9, 3, 4, 8 };
    int h_b[arraySize];
    int *d_a, *d_b;
     
    cudaMalloc((void**)&d_b, arraySize * sizeof(int));
    cudaMalloc((void**)&d_a, arraySize * sizeof(int));
    
    // Copy input vector from host memory to GPU buffers.
    cudaMemcpy(d_a, h_a, arraySize * sizeof(int), cudaMemcpyHostToDevice);
    
    // Launch a kernel on the GPU with one thread for each element.
    addKernel<<<arraySize/threadPerBlock, threadPerBlock>>>(d_a, d_b);
    cudaDeviceSynchronize();
    // Copy output vector from GPU buffer to host memory.
    cudaMemcpy(h_b, d_b, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
    printf("The Enumeration sorted Array is: \n");
    for (int i = 0; i < arraySize; i++) {
        printf("%d\n", h_b[i]);
    }
    
    cudaFree(d_a);
    cudaFree(d_b);
    return 0;
}
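  • This kernel implements an enumeration (rank) sort: each thread takes one input value, counts how many elements are smaller than it, and uses that count as the value's destination index in d_b, so the total work is O(n^2). Staging the input through the shared-memory cache lets each tile be read from global memory once per block rather than once per thread. Note that the scheme as written assumes distinct keys: equal values would compute the same rank and overwrite each other in the output.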
  • Histogram computation using atomic operations, with a shared-memory version for additional speed.
#include <stdio.h>
#include <cuda_runtime.h>
#define SIZE 1000
#define NUM_BIN 16
__global__ void histogram_shared_memory(int *d_b, int *d_a)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int offset = blockDim.x * gridDim.x;
    __shared__ int cache[NUM_BIN];   // per-block private histogram (one entry per bin)
    cache[threadIdx.x] = 0;          // blockDim.x == NUM_BIN, so each thread zeroes one bin
    __syncthreads();
    
    while (tid < SIZE)
    {
        atomicAdd(&(cache[d_a[tid]]), 1);
        tid += offset;
    }
    __syncthreads();
    atomicAdd(&(d_b[threadIdx.x]), cache[threadIdx.x]);   // merge this block's private bins into the global histogram
}
__global__ void histogram_without_atomic(int *d_b, int *d_a)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int item = d_a[tid];
    if (tid < SIZE)
    {
        d_b[item]++;   // data race: concurrent, non-atomic increments can be lost
    }
    
}
__global__ void histogram_atomic(int *d_b, int *d_a)
{
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int item = d_a[tid];
    if (tid < SIZE)
    {
        atomicAdd(&(d_b[item]), 1);
    }
}
int main()
{
    
    int h_a[SIZE];
    for (int i = 0; i < SIZE; i++) {
        
        h_a[i] = i % NUM_BIN;
    }
    int h_b[NUM_BIN];
    for (int i = 0; i < NUM_BIN; i++) {
        h_b[i] = 0;
    }
    // declare GPU memory pointers
    int * d_a;
    int * d_b;
    // allocate GPU memory
    cudaMalloc((void **)&d_a, SIZE * sizeof(int));
    cudaMalloc((void **)&d_b, NUM_BIN * sizeof(int));
    // transfer the arrays to the GPU
    cudaMemcpy(d_a, h_a, SIZE * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, h_b, NUM_BIN * sizeof(int), cudaMemcpyHostToDevice);
    
    // launch the kernel
    // histogram_without_atomic << <((SIZE+NUM_BIN-1) / NUM_BIN), NUM_BIN >> >(d_b, d_a);
    //histogram_atomic << <((SIZE+NUM_BIN-1) / NUM_BIN), NUM_BIN >> >(d_b, d_a);
    histogram_shared_memory << <((SIZE+NUM_BIN-1) / NUM_BIN), NUM_BIN >> >(d_b, d_a);
        
    // copy back the sum from GPU
    cudaMemcpy(h_b, d_b, NUM_BIN * sizeof(int), cudaMemcpyDeviceToHost);
    printf("Histogram using 16 bin without shared Memory is: \n");
    for (int i = 0; i < NUM_BIN; i++) {
        printf("bin %d: count %d\n", i, h_b[i]);
    }
    // free GPU memory allocation
    cudaFree(d_a);
    cudaFree(d_b);
    return 0;
}
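  • The three kernels show the usual progression: histogram_without_atomic has a data race on d_b and generally produces wrong counts; histogram_atomic is correct but makes every thread contend on the same 16 global-memory bins; histogram_shared_memory lets each block accumulate a private histogram in on-chip shared memory and merges it into the global result with one atomicAdd per bin per block, which reduces global-memory contention as SIZE grows.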