4 CUDA高级概念的使用
4.1 性能测试与报错处理
- 研究表明:如果块数量是GPU的流多处理器数量的两倍,则会给出最佳性能,不过,块和线程的数量和具体的算法实现有关。
- 块中的线程数量应当被设定等于设备属性中每个块所能支持的最大线程数量,但实际上这些数值只是作为一种基本的准则来说的。
- 常见的内核执行有3个瓶颈:卡在计算瓶颈上,卡在访存上和卡在延迟掩盖上。
- 具体显卡上通过Profiler分析。哪种资源先达到瓶颈,就减少这种资源的使用(计算或者访存),而增加另外一种,并非一味地增加计算,或者减少访存。
- 跨步式访存的效果不如合并访存好,但依然比随机访存要好。所以,如果你尝试在程序中使用合并访存的话,它有时会对提升性能有帮助。
- 流多处理器(SM)在跑同一个warp中的所有thread时必须执行相同的指令(SIMT),如果线程遇到控制流语句进入不同的分支,那么同一时刻会出现部分线程运行,部分线程阻塞的情况(stall execution)比如if语句的判断,奇数和偶数ID的线程将会分别执行不同的代码
- malloc函数在CPU上分配内存,该函数分配的是可换页的标准内存。 cudaHostAlloc的
- API函数,该函数分配的是锁定页面的内存。这种内存也叫Pinned内存。操作系统会保证永远不会将这种内存换页到磁盘上,总是在物理内存中。所以,系统内的所有设备都可以直接用该段内存缓冲区的物理地址来访问。此属性帮助GPU通过直接内存访问(DMA)将数据复制到主机或从主机复制数据,而无需CPU干预。但是锁定页面的内存应当正确地使用,不能使用过多,因为这种内存不能被换页到磁盘上,分配的过多,你的系统可能会物理内存不足,从而其他在这个系统上运行的应用程序可能会受到影响。
- GPU上是通过使用CUDA流来实现任务并行的
- CUDA流(中的传输)需要使用页面锁定内存,所以我们这里使用了cudaHostAlloc函数,而不是常规的malloc进行内存分配
- 每个流中的工作是串行的,而流和流之间则默认不保证顺序。
- 使用cuda API计算耗时
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//Defining number of elements in Array
#define N 50000
//Defining Kernel function for vector addition
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
//Getting Thread index of current kernel
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N)
{
d_c[tid] = d_a[tid] + d_b[tid];
tid += blockDim.x * gridDim.x;
}
}
int main(void) {
//Defining host arrays
int h_a[N], h_b[N], h_c[N];
//Defining device pointers
int *d_a, *d_b, *d_c;
cudaEvent_t e_start, e_stop; // 创建事件
cudaEventCreate(&e_start);
cudaEventCreate(&e_stop);
cudaEventRecord(e_start, 0);
// allocate the memory
cudaMalloc((void**)&d_a, N * sizeof(int));
cudaMalloc((void**)&d_b, N * sizeof(int));
cudaMalloc((void**)&d_c, N * sizeof(int));
//Initializing Arrays
for (int i = 0; i < N; i++) {
h_a[i] = 2 * i*i;
h_b[i] = i;
}
// Copy input arrays from host to device memory
cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
//Calling kernels passing device pointers as parameters
gpuAdd << <512, 512 >> >(d_a, d_b, d_c);
//Copy result back to host memory from device memory
cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
cudaEventRecord(e_stop, 0);
cudaEventSynchronize(e_stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
printf("Time to add %d numbers: %3.1f ms\n",N, elapsedTime);
int Correct = 1;
printf("Vector addition on GPU \n");
//Printing result on console
for (int i = 0; i < N; i++) {
if ((h_a[i] + h_b[i] != h_c[i]))
{
Correct = 0;
}
}
if (Correct == 1)
{
printf("GPU has computed Sum Correctly\n");
}
else
{
printf("There is an Error in GPU Computation\n");
}
//Free up memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
- 根据报错寻找问题,goto到报错位置
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//Defining number of elements in Array
#define N 50000
//Defining Kernel function for vector addition
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
//Getting Thread index of current kernel
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N)
{
d_c[tid] = d_a[tid] + d_b[tid];
tid += blockDim.x * gridDim.x;
}
}
int main(void) {
//Defining host arrays
int *h_a, *h_b, *h_c;
//Defining device pointers for stream 0
int *d_a0, *d_b0, *d_c0;
//Defining device pointers for stream 1
int *d_a1, *d_b1, *d_c1;
cudaStream_t stream0, stream1;
cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);
cudaEvent_t e_start, e_stop;
cudaEventCreate(&e_start);
cudaEventCreate(&e_stop);
cudaEventRecord(e_start, 0);
cudaHostAlloc((void**)&h_a,N *2* sizeof(int),cudaHostAllocDefault);
cudaHostAlloc((void**)&h_b, N *2* sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void**)&h_c, N *2*sizeof(int), cudaHostAllocDefault);
// allocate the memory
cudaMalloc((void**)&d_a0, N * sizeof(int));
cudaMalloc((void**)&d_b0, N * sizeof(int));
cudaMalloc((void**)&d_c0, N * sizeof(int));
cudaMalloc((void**)&d_a1, N * sizeof(int));
cudaMalloc((void**)&d_b1, N * sizeof(int));
cudaMalloc((void**)&d_c1, N * sizeof(int));
//Initializing Arrays
for (int i = 0; i < N*2; i++) {
h_a[i] = 2 * i*i;
h_b[i] = i;
}
cudaMemcpyAsync(d_a0, h_a , N * sizeof(int), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(d_a1, h_a+ N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
cudaMemcpyAsync(d_b0, h_b , N * sizeof(int), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(d_b1, h_b + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
//Calling kernels passing device pointers as parameters
gpuAdd << <512, 512, 0, stream0 >> > (d_a0, d_b0, d_c0);
gpuAdd << <512, 512, 0, stream1 >> > (d_a1, d_b1, d_c1);
//Copy result back to host memory from device memory
cudaMemcpyAsync(h_c , d_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
cudaMemcpyAsync(h_c + N, d_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
cudaDeviceSynchronize();
cudaStreamSynchronize(stream0);
cudaStreamSynchronize(stream1);
cudaEventRecord(e_stop, 0);
cudaEventSynchronize(e_stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
printf("Time to add %d numbers: %3.1f ms\n",2* N, elapsedTime);
int Correct = 1;
printf("Vector addition on GPU \n");
//Printing result on console
for (int i = 0; i < 2*N; i++) {
if ((h_a[i] + h_b[i] != h_c[i]))
{
Correct = 0;
}
}
if (Correct == 1)
{
printf("GPU has computed Sum Correctly\n");
}
else
{
printf("There is an Error in GPU Computation\n");
}
//Free up memory
cudaFree(d_a0);
cudaFree(d_b0);
cudaFree(d_c0);
cudaFree(d_a0);
cudaFree(d_b0);
cudaFree(d_c0);
cudaFreeHost(h_a);
cudaFreeHost(h_b);
cudaFreeHost(h_c);
return 0;
}
4.2 性能提升
- 多流处理,进行并行计算
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//Defining number of elements in Array
#define N 50000
//Defining Kernel function for vector addition
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
//Getting Thread index of current kernel
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N)
{
d_c[tid] = d_a[tid] + d_b[tid];
tid += blockDim.x * gridDim.x;
}
}
int main(void) {
//Defining host arrays
int *h_a, *h_b, *h_c;
//Defining device pointers for stream 0
int *d_a0, *d_b0, *d_c0;
//Defining device pointers for stream 1
int *d_a1, *d_b1, *d_c1;
cudaStream_t stream0, stream1;
cudaStreamCreate(&stream0);
cudaStreamCreate(&stream1);
cudaEvent_t e_start, e_stop;
cudaEventCreate(&e_start);
cudaEventCreate(&e_stop);
cudaEventRecord(e_start, 0);
cudaHostAlloc((void**)&h_a,N *2* sizeof(int),cudaHostAllocDefault);
cudaHostAlloc((void**)&h_b, N *2* sizeof(int), cudaHostAllocDefault);
cudaHostAlloc((void**)&h_c, N *2*sizeof(int), cudaHostAllocDefault);
// allocate the memory
cudaMalloc((void**)&d_a0, N * sizeof(int));
cudaMalloc((void**)&d_b0, N * sizeof(int));
cudaMalloc((void**)&d_c0, N * sizeof(int));
cudaMalloc((void**)&d_a1, N * sizeof(int));
cudaMalloc((void**)&d_b1, N * sizeof(int));
cudaMalloc((void**)&d_c1, N * sizeof(int));
//Initializing Arrays
for (int i = 0; i < N*2; i++) {
h_a[i] = 2 * i*i;
h_b[i] = i;
}
cudaMemcpyAsync(d_a0, h_a , N * sizeof(int), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(d_a1, h_a+ N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
cudaMemcpyAsync(d_b0, h_b , N * sizeof(int), cudaMemcpyHostToDevice, stream0);
cudaMemcpyAsync(d_b1, h_b + N, N * sizeof(int), cudaMemcpyHostToDevice, stream1);
//Calling kernels passing device pointers as parameters
gpuAdd << <512, 512, 0, stream0 >> > (d_a0, d_b0, d_c0);
gpuAdd << <512, 512, 0, stream1 >> > (d_a1, d_b1, d_c1);
//Copy result back to host memory from device memory
cudaMemcpyAsync(h_c , d_c0, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
cudaMemcpyAsync(h_c + N, d_c1, N * sizeof(int), cudaMemcpyDeviceToHost, stream0);
cudaDeviceSynchronize();
cudaStreamSynchronize(stream0);
cudaStreamSynchronize(stream1);
cudaEventRecord(e_stop, 0);
cudaEventSynchronize(e_stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, e_start, e_stop);
printf("Time to add %d numbers: %3.1f ms\n",2* N, elapsedTime);
int Correct = 1;
printf("Vector addition on GPU \n");
//Printing result on console
for (int i = 0; i < 2*N; i++) {
if ((h_a[i] + h_b[i] != h_c[i]))
{
Correct = 0;
}
}
if (Correct == 1)
{
printf("GPU has computed Sum Correctly\n");
}
else
{
printf("There is an Error in GPU Computation\n");
}
//Free up memory
cudaFree(d_a0);
cudaFree(d_b0);
cudaFree(d_c0);
cudaFree(d_a0);
cudaFree(d_b0);
cudaFree(d_c0);
cudaFreeHost(h_a);
cudaFreeHost(h_b);
cudaFreeHost(h_c);
return 0;
}
- 使用多线程计算排序问题
- 用tid变量保存块中的当前线程索引。
- 而ttid变量则用来表示所有的块中的当前线程的唯一索引,或者说整个Grid中的当前线程的索引。
- 使用共享内存来减少直接访问全局内存的时间。
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define arraySize 5
#define threadPerBlock 5
__global__ void addKernel(int *d_a, int *d_b)
{
int count = 0;
int tid = threadIdx.x;
int ttid = blockIdx.x * threadPerBlock + tid;
int val = d_a[ttid];
__shared__ int cache[threadPerBlock]; // 加速计算
for (int i = tid; i < arraySize; i += threadPerBlock) {
cache[tid] = d_a[i];
__syncthreads();
for (int j = 0; j < threadPerBlock; ++j)
if (val > cache[j])
count++;
__syncthreads();
}
d_b[count] = val;
}
int main()
{
int h_a[arraySize] = { 5, 9, 3, 4, 8 };
int h_b[arraySize];
int *d_a, *d_b;
cudaMalloc((void**)&d_b, arraySize * sizeof(int));
cudaMalloc((void**)&d_a, arraySize * sizeof(int));
// Copy input vector from host memory to GPU buffers.
cudaMemcpy(d_a, h_a, arraySize * sizeof(int), cudaMemcpyHostToDevice);
// Launch a kernel on the GPU with one thread for each element.
addKernel<<<arraySize/threadPerBlock, threadPerBlock>>>(d_a, d_b);
cudaDeviceSynchronize();
// Copy output vector from GPU buffer to host memory.
cudaMemcpy(h_b, d_b, arraySize * sizeof(int), cudaMemcpyDeviceToHost);
printf("The Enumeration sorted Array is: \n");
for (int i = 0; i < arraySize; i++) {
printf("%d\n", h_b[i]);
}
cudaFree(d_a);
cudaFree(d_b);
return 0;
}
- 直方图统计代码,使用原子操作,以及使用共享内存加速
#include <stdio.h>
#include <cuda_runtime.h>
#define SIZE 1000
#define NUM_BIN 16
__global__ void histogram_shared_memory(int *d_b, int *d_a)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int offset = blockDim.x * gridDim.x;
__shared__ int cache[256];
cache[threadIdx.x] = 0;
__syncthreads();
while (tid < SIZE)
{
atomicAdd(&(cache[d_a[tid]]), 1);
tid += offset;
}
__syncthreads();
atomicAdd(&(d_b[threadIdx.x]), cache[threadIdx.x]);
}
__global__ void histogram_without_atomic(int *d_b, int *d_a)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int item = d_a[tid];
if (tid < SIZE)
{
d_b[item]++;
}
}
__global__ void histogram_atomic(int *d_b, int *d_a)
{
int tid = threadIdx.x + blockDim.x * blockIdx.x;
int item = d_a[tid];
if (tid < SIZE)
{
atomicAdd(&(d_b[item]), 1);
}
}
int main()
{
int h_a[SIZE];
for (int i = 0; i < SIZE; i++) {
h_a[i] = i % NUM_BIN;
}
int h_b[NUM_BIN];
for (int i = 0; i < NUM_BIN; i++) {
h_b[i] = 0;
}
// declare GPU memory pointers
int * d_a;
int * d_b;
// allocate GPU memory
cudaMalloc((void **)&d_a, SIZE * sizeof(int));
cudaMalloc((void **)&d_b, NUM_BIN * sizeof(int));
// transfer the arrays to the GPU
cudaMemcpy(d_a, h_a, SIZE * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, NUM_BIN * sizeof(int), cudaMemcpyHostToDevice);
// launch the kernel
// histogram_without_atomic << <((SIZE+NUM_BIN-1) / NUM_BIN), NUM_BIN >> >(d_b, d_a);
//histogram_atomic << <((SIZE+NUM_BIN-1) / NUM_BIN), NUM_BIN >> >(d_b, d_a);
histogram_shared_memory << <((SIZE+NUM_BIN-1) / NUM_BIN), NUM_BIN >> >(d_b, d_a);
// copy back the sum from GPU
cudaMemcpy(h_b, d_b, NUM_BIN * sizeof(int), cudaMemcpyDeviceToHost);
printf("Histogram using 16 bin without shared Memory is: \n");
for (int i = 0; i < NUM_BIN; i++) {
printf("bin %d: count %d\n", i, h_b[i]);
}
// free GPU memory allocation
cudaFree(d_a);
cudaFree(d_b);
return 0;
}