3 线程、同步、存储器
3.1 线程与存储
- tid=blockIdx.x(当前块的ID)*blockDim.x(当前块里面的线程数量)+threadIdx.x(当前线程在块中的ID)
- gridDim.x*blockDim.x来计算,前者代表了本次启动的块的数量,而后者代表了每个块里面的线程数量,然后每次while循环,tid变量加上这个值,向后偏移以得到下个任务的索引
- 所有线程都有一个寄存器堆,它是最快的。
- 共享内存只能被块中的线程访问,但比全局内存块。
- 全局内存是最慢的,但可以被所有的块访问。
- 常量和纹理内存用于特殊用途
- 所有通过cudaMalloc分配的存储器都是全局内存
- 本地内存和寄存器堆对每个线程都是唯一的。
- 寄存器是每个线程可用的最快存储器。
- 当内核中使用的变量在寄存器堆中装不下的时候,将会使用本地内存存储它们,这叫寄存器溢出
- “读取旧值-累加-回写新值”操作是不可被其他线程扰乱的原子性的整体完成的。
- 使用atomicAdd进行原子累加的内核函数
- 使用原子操作后程序具有更大的执行代价。可以通过使用共享内存来加速这些原子累加操作
GPU卡从逻辑上对用户提供了64KB的常量内存空间,可以用来存储内核执行期间所需要的恒定数据 - 常量内存有助于节省全局内存的访问带宽
- warp整体进行一次常量内存的读取,结果广播给warp里的32个线程。同时,常量内存具有cache缓冲。当后续的在邻近位置上访问,将不会发生额外的从显存过来的传输。每个warp里的32个线程,进行一致性的相同常量内存位置读取的时候,这种广播效果和cache命中效果可以节省执行时间
- 当程序进行具有很大程度上的空间邻近性的访存的时候,纹理变得非常高效。空间邻近性的意思是,每个线程的读取位置都和其他线程的读取位置邻近。
- 请一定要确保纹理引用被定义成全局静态变量,同时还要确保它不能作为参数传递给任何其他函数。
- 原子操作-求和
#include <stdio.h>
#define NUM_THREADS 10000
#define SIZE 10
#define BLOCK_WIDTH 100
__global__ void gpu_increment_without_atomic(int *d_a)
{
// Calculate thread id for current thread
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// each thread increments elements wrapping at SIZE variable
tid = tid % SIZE;
d_a[tid] += 1;
}
__global__ void gpu_increment_atomic(int *d_a)
{
// Calculate thread id for current thread
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// each thread increments elements wrapping at SIZE variable
tid = tid % SIZE;
atomicAdd(&d_a[tid], 1);
}
int main(int argc, char **argv)
{
printf("%d total threads in %d blocks writing into %d array elements\n",
NUM_THREADS, NUM_THREADS / BLOCK_WIDTH, SIZE);
// declare and allocate host memory
int h_a[SIZE];
const int ARRAY_BYTES = SIZE * sizeof(int);
// declare and allocate GPU memory
int *d_a, *d_aA;
cudaMalloc((void **)&d_a, ARRAY_BYTES);
//Initialize GPU memory to zero
cudaMemset((void *)d_a, 0, ARRAY_BYTES);
gpu_increment_without_atomic << <NUM_THREADS / BLOCK_WIDTH, BLOCK_WIDTH >> >(d_a);
// copy back the array to host memory
cudaMemcpy(h_a, d_a, ARRAY_BYTES, cudaMemcpyDeviceToHost);
printf("Number of times a particular Array index has been incremented without atomic add is: \n");
for (int i = 0; i < SIZE; i++)
{
printf("index: %d --> %d times\n ", i, h_a[i]);
}
cudaFree(d_a);
cudaMalloc((void **)&d_aA, ARRAY_BYTES);
//Initialize GPU memory to zero
cudaMemset((void *)d_aA, 0, ARRAY_BYTES);
gpu_increment_atomic << <NUM_THREADS / BLOCK_WIDTH, BLOCK_WIDTH >> >(d_aA);
// copy back the array to host memory
cudaMemcpy(h_a, d_aA, ARRAY_BYTES, cudaMemcpyDeviceToHost);
printf("Number of times a particular Array index has been incremented is: \n");
for (int i = 0; i < SIZE; i++)
{
printf("index: %d --> %d times\n ", i, h_a[i]);
}
cudaFree(d_aA);
return 0;
}
- 常量内存与纹理内存
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//Defining two constants
__constant__ int constant_f;
__constant__ int constant_g;
#define N 5
//Kernel function for using constant memory
__global__ void gpu_constant_memory(float *d_in, float *d_out) {
//Thread index for current kernel
int tid = threadIdx.x;
d_out[tid] = constant_f*d_in[tid] + constant_g;
}
#define NUM_THREADS 5
texture <float, 1, cudaReadModeElementType> textureRef;
__global__ void gpu_texture_memory(int n, float *d_out)
{
int idx = blockIdx.x*blockDim.x + threadIdx.x;
if (idx < n) {
float temp = tex1D(textureRef, float(idx));
d_out[idx] = temp;
}
}
int main(void) {
//Defining Arrays for host
float h_in[N], h_out[N];
//Defining Pointers for device
float *d_in, *d_out;
// 常量内存
int h_f = 2;
int h_g = 20;
// allocate the memory on the cpu
cudaMalloc((void**)&d_in, N * sizeof(float));
cudaMalloc((void**)&d_out, N * sizeof(float));
//Initializing Array
for (int i = 0; i < N; i++) {
h_in[i] = i;
}
//Copy Array from host to device
cudaMemcpy(d_in, h_in, N * sizeof(float), cudaMemcpyHostToDevice);
//Copy constants to constant memory
cudaMemcpyToSymbol(constant_f, &h_f, sizeof(int),0,cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(constant_g, &h_g, sizeof(int));
//Calling kernel with one block and N threads per block
gpu_constant_memory << <1, N >> >(d_in, d_out);
//Coping result back to host from device memory
cudaMemcpy(h_out, d_out, N * sizeof(float), cudaMemcpyDeviceToHost);
//Printing result on console
printf("Use of Constant memory on GPU \n");
for (int i = 0; i < N; i++) {
printf("The expression for input %f is %f\n", h_in[i], h_out[i]);
}
//Free up memory
cudaFree(d_in);
cudaFree(d_out);
// 纹理内存
//Calculate number of blocks to launch
int num_blocks = N / NUM_THREADS + ((N % NUM_THREADS) ? 1 : 0);
//Declare device pointer
float *d_outM;
// allocate space on the device for the result
cudaMalloc((void**)&d_outM, sizeof(float) * N);
// allocate space on the host for the results
float *h_outM = (float*)malloc(sizeof(float)*N);
//Declare and initialize host array
float h_inM[N];
for (int i = 0; i < N; i++) {
h_inM[i] = float(i);
}
//Define CUDA Array
cudaArray *cu_Array;
cudaMallocArray(&cu_Array, &textureRef.channelDesc, N, 1);
//Copy data to CUDA Array
cudaMemcpyToArray(cu_Array, 0, 0, h_inM, sizeof(float)*N, cudaMemcpyHostToDevice);
// bind a texture to the CUDA array
cudaBindTextureToArray(textureRef, cu_Array);
//Call Kernel
gpu_texture_memory << <num_blocks, NUM_THREADS >> >(N, d_outM);
// copy result back to host
cudaMemcpy(h_outM, d_outM, sizeof(float)*N, cudaMemcpyDeviceToHost);
printf("Use of Texture memory on GPU: \n");
for (int i = 0; i < N; i++) {
printf("Texture element at %d is : %f\n",i, h_outM[i]);
}
free(h_outM);
cudaFree(d_outM);
cudaFreeArray(cu_Array);
cudaUnbindTexture(textureRef);
return 0;
}
- 全局、局部、共享内存
#include <stdio.h>
#define N 5
__global__ void gpu_global_memory(float *d_a)
{
// "array" is a pointer into global memory on the device
d_a[threadIdx.x] = threadIdx.x;
}
__global__ void gpu_local_memory(float d_in)
{
int t_local;
t_local = d_in * threadIdx.x;
printf("Value of Local variable in current thread is: %d \n", t_local);
}
__global__ void gpu_shared_memory(float *d_a)
{
// Defining local variables which are private to each thread
int i, index = threadIdx.x;
float average, sum = 0.0f;
//Define shared memory
__shared__ float sh_arr[5];
sh_arr[index] = d_a[index];
__syncthreads(); // This ensures all the writes to shared memory have completed
for (i = 0; i<= index; i++)
{
sum += sh_arr[i];
}
average = sum / (index + 1.0f);
d_a[index] = average;
sh_arr[index] = average;
}
int main(int argc, char **argv)
{
// Define Host Array
float h_a[N];
//Define device pointer
float *d_a;
// 全局内存
cudaMalloc((void **)&d_a, sizeof(float) *N);
// now copy data from host memory to device memory
cudaMemcpy((void *)d_a, (void *)h_a, sizeof(float) *N, cudaMemcpyHostToDevice);
// launch the kernel
gpu_global_memory << <1, N >> >(d_a);
// copy the modified array back to the host memory
cudaMemcpy((void *)h_a, (void *)d_a, sizeof(float) *N, cudaMemcpyDeviceToHost);
printf("Array in Global Memory is: \n");
//Printing result on console
for (int i = 0; i < N; i++) {
printf("At Index: %d --> %f \n", i, h_a[i]);
}
// 本地内存
printf("Use of Local Memory on GPU:\n");
gpu_local_memory << <1, N >> >(5);
cudaDeviceSynchronize();
// 共享内存
for (int i = 0; i < 5; i++) {
h_a[i] = i;
}
// allocate global memory on the device
cudaMalloc((void **)&d_a, sizeof(float) * 5);
// now copy data from host memory to device memory
cudaMemcpy((void *)d_a, (void *)h_a, sizeof(float) * 5, cudaMemcpyHostToDevice);
gpu_shared_memory << <1, 5 >> >(d_a);
// copy the modified array back to the host memory
cudaMemcpy((void *)h_a, (void *)d_a, sizeof(float) * 5, cudaMemcpyDeviceToHost);
printf("Use of Shared Memory on GPU: \n");
//Printing result on console
for (int i = 0; i < 5; i++) {
printf("The running average after %d element is %f \n", i, h_a[i]);
}
return 0;
}
- 多线程
#include "stdio.h"
#include<iostream>
#include <cuda.h>
#include <cuda_runtime.h>
//Defining number of elements in Array
#define N 50000
//Defining Kernel function for vector addition
__global__ void gpuAdd(int *d_a, int *d_b, int *d_c) {
//Getting block index of current kernel
int tid = threadIdx.x + blockIdx.x * blockDim.x;
while (tid < N)
{
d_c[tid] = d_a[tid] + d_b[tid];
tid += blockDim.x * gridDim.x;
}
}
int main(void) {
//Defining host arrays
int h_a[N], h_b[N], h_c[N];
//Defining device pointers
int *d_a, *d_b, *d_c;
// allocate the memory
cudaMalloc((void**)&d_a, N * sizeof(int));
cudaMalloc((void**)&d_b, N * sizeof(int));
cudaMalloc((void**)&d_c, N * sizeof(int));
//Initializing Arrays
for (int i = 0; i < N; i++) {
h_a[i] = 2 * i*i;
h_b[i] = i;
}
// Copy input arrays from host to device memory
cudaMemcpy(d_a, h_a, N * sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, N * sizeof(int), cudaMemcpyHostToDevice);
//Calling kernels with N blocks and one thread per block, passing device pointers as parameters
gpuAdd << <512, 512 >> >(d_a, d_b, d_c);
//Copy result back to host memory from device memory
cudaMemcpy(h_c, d_c, N * sizeof(int), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
int Correct = 1;
printf("Vector addition on GPU \n");
//Printing result on console
for (int i = 0; i < N; i++) {
if ((h_a[i] + h_b[i] != h_c[i]))
{
Correct = 0;
}
}
if (Correct == 1)
{
printf("GPU has computed Sum Correctly\n");
}
else
{
printf("There is an Error in GPU Computation\n");
}
//Free up memory
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
return 0;
}
3.2 向量点乘与矩阵乘法例子
- 向量点乘
- 矩阵乘法
#include "stdio.h"
#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <math.h>
#define TILE_SIZE 2
#define N 1024
#define threadsPerBlock 512
__global__ void gpu_dot(float *d_a, float *d_b, float *d_c) {
//Declare shared memory
__shared__ float partial_sum[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
//Calculate index for shared memory
int index = threadIdx.x;
//Calculate Partial Sum
float sum = 0;
while (tid < N)
{
sum += d_a[tid] * d_b[tid];
tid += blockDim.x * gridDim.x; // 前者代表了本次启动的块的数量,而后者代表了每个块里面的线程数量
}
// Store partial sum in shared memory
partial_sum[index] = sum;
// synchronize threads
__syncthreads();
// Calculating partial sum for whole block in reduce operation
int i = blockDim.x / 2;
while (i != 0) {
if (index < i)
partial_sum[index] += partial_sum[index + i];
__syncthreads();
i /= 2;
}
//Store block partial sum in global memory
if (index == 0)
d_c[blockIdx.x] = partial_sum[0];
}
//Matrix multiplication using non shared kernel
__global__ void gpu_Matrix_Mul_nonshared(float *d_a, float *d_b, float *d_c, const int size)
{
int row, col;
col = TILE_SIZE * blockIdx.x + threadIdx.x;
row = TILE_SIZE * blockIdx.y + threadIdx.y;
for (int k = 0; k< size; k++)
{
d_c[row*size + col] += d_a[row * size + k] * d_b[k * size + col];
}
}
// Matrix multiplication using shared kernel
__global__ void gpu_Matrix_Mul_shared(float *d_a, float *d_b, float *d_c, const int size)
{
int row, col;
//Defining Shared Memory
__shared__ float shared_a[TILE_SIZE][TILE_SIZE];
__shared__ float shared_b[TILE_SIZE][TILE_SIZE];
col = TILE_SIZE * blockIdx.x + threadIdx.x;
row = TILE_SIZE * blockIdx.y + threadIdx.y;
for (int i = 0; i< size / TILE_SIZE; i++)
{
shared_a[threadIdx.y][threadIdx.x] = d_a[row* size + (i*TILE_SIZE + threadIdx.x)];
shared_b[threadIdx.y][threadIdx.x] = d_b[(i*TILE_SIZE + threadIdx.y) * size + col];
__syncthreads();
for (int j = 0; j<TILE_SIZE; j++)
d_c[row*size + col] += shared_a[threadIdx.y][j] * shared_b[j][threadIdx.x];
__syncthreads();
}
}
int main_dot(void) {
//Declare Host Array
float *h_a, *h_b, h_c, *partial_sum;
//Declare device Array
float *d_a, *d_b, *d_partial_sum;
//Calculate total number of blocks per grid
int block_calc = (N + threadsPerBlock - 1) / threadsPerBlock;
int blocksPerGrid = (32 < block_calc ? 32 : block_calc);
// allocate memory on the host side
h_a = (float*)malloc(N * sizeof(float));
h_b = (float*)malloc(N * sizeof(float));
partial_sum = (float*)malloc(blocksPerGrid * sizeof(float));
// allocate the memory on the device
cudaMalloc((void**)&d_a, N * sizeof(float));
cudaMalloc((void**)&d_b, N * sizeof(float));
cudaMalloc((void**)&d_partial_sum, blocksPerGrid * sizeof(float));
// fill the host array with data
for (int i = 0; i<N; i++) {
h_a[i] = i;
h_b[i] = 2;
}
cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);
//Call kernel
gpu_dot << <blocksPerGrid, threadsPerBlock >> >(d_a, d_b, d_partial_sum);
// copy the array back to host memory
cudaMemcpy(partial_sum, d_partial_sum, blocksPerGrid * sizeof(float), cudaMemcpyDeviceToHost);
// Calculate final dot product on host
h_c = 0;
for (int i = 0; i<blocksPerGrid; i++) {
h_c += partial_sum[i];
}
printf("The computed dot product is: %f\n", h_c);
#define cpu_sum(x) (x*(x+1))
if (h_c == cpu_sum((float)(N - 1)))
{
printf("The dot product computed by GPU is correct\n");
}
else
{
printf("Error in dot product computation");
}
// free memory on host and device
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_partial_sum);
free(h_a);
free(h_b);
free(partial_sum);
return 0;
}
int main_matrix()
{
const int size = 4;
//Define Host Array
float h_a[size][size], h_b[size][size],h_result[size][size];
//Defining device Array
float *d_a, *d_b, *d_result;
//Initialize host Array
for (int i = 0; i<size; i++)
{
for (int j = 0; j<size; j++)
{
h_a[i][j] = i;
h_b[i][j] = j;
}
}
cudaMalloc((void **)&d_a, size*size*sizeof(int));
cudaMalloc((void **)&d_b, size*size * sizeof(int));
cudaMalloc((void **)&d_result, size*size* sizeof(int));
//copy host array to device array
cudaMemcpy(d_a, h_a, size*size* sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, size*size* sizeof(int), cudaMemcpyHostToDevice);
//Define grid and block dimensions
dim3 dimGrid(size / TILE_SIZE, size / TILE_SIZE, 1);
dim3 dimBlock(TILE_SIZE, TILE_SIZE, 1);
//gpu_Matrix_Mul_nonshared << <dimGrid, dimBlock >> > (d_a, d_b, d_result, size);
gpu_Matrix_Mul_shared << <dimGrid, dimBlock >> > (d_a, d_b, d_result, size);
cudaMemcpy(h_result, d_result, size*size * sizeof(int), cudaMemcpyDeviceToHost);
printf("The result of Matrix multiplication is: \n");
for (int i = 0; i< size; i++)
{
for (int j = 0; j < size; j++)
{
printf("%f ", h_result[i][j]);
}
printf("\n");
}
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_result);
return 0;
}
int main(){
main_matrix();
}