Module 2
Vector Addition – Traditional C Code
// Compute vector sum C = A + B
void vecAdd(float *h_A, float *h_B, float *h_C, int n)
{
int i;
for (i = 0; i<n; i++) h_C[i] = h_A[i] + h_B[i];
}
int main()
{
vecAdd(h_A, h_B, h_C, N); }
vecAdd CUDA
Host Code
#include <cuda.h>
void vecAdd(float *h_A, float *h_B, float *h_C, int n) {
int size = n* sizeof(float);
float *d_A, *d_B, *d_C;
dim3 DimGrid((n-1)/256 + 1, 1, 1);
dim3 DimBlock(256, 1, 1);
vecAddKernel<<<DimGrid,DimBlock>>>(d_A, d_B, d_C, n);
// Part 1
// Allocate device memory for A, B, and C // copy A and B to device memory
// Part 2
// Kernel launch code – the device performs the actual vector addition
// Part 3
// copy C from the device memory // Free device vectors
cudaMalloc((void **) &d_A, size);
cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
cudaMalloc((void **) &d_B, size);
cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
cudaMalloc((void **) &d_C, size);
// Kernel invocation code – to be shown later
cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
cudaFree(d_A); cudaFree(d_B); cudaFree (d_C);
}
Device Code
// Compute vector sum C = A + B
// Each thread performs one pair-wise addition
__global__void vecAddKernel(float* A, float* B, float* C, int n){
int i = threadIdx.x+blockDim.x*blockIdx.x;
if(i<n) C[i] = A[i] + B[i];
}
Hello World! with Device Code
__global__ void mykernel(void) {
}
int main(void) {
mykernel<<<1,1>>>();
printf("Hello World!\n");
return 0;
}
Module 3 CUDA并行模型
Source Code of a PictureKernel
__global__ void PictureKernel(float* d_Pin, float* d_Pout, int height, int width)
{
// Calculate the row # of the d_Pin and d_Pout element
int Row = blockIdx.y*blockDim.y + threadIdx.y;
// Calculate the column # of the d_Pin and d_Pout element
int Col = blockIdx.x*blockDim.x + threadIdx.x;
// each thread computes one element of d_Pout if in range
if ((Row < height) && (Col < width)) {
d_Pout[Row*width+Col] = 2.0*d_Pin[Row*width+Col];
}
}
RGB转换到灰度图代码
#define CHANNELS 3
// we have 3 channels corresponding to RGB
// The input image is encoded as unsigned characters [0, 255]
__global__ void colorConvert(unsigned char * grayImage,int width, int height) {
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
if (x < width && y < height) {
// get 1D coordinate for the grayscale image
int grayOffset = y*width + x;
// one can think of the RGB image having
// CHANNEL times columns than the gray scale image
int rgbOffset = grayOffset*CHANNELS;
unsigned char r = rgbImage[rgbOffset ];// red value for pixel
unsigned char g = rgbImage[rgbOffset + 1]; // green value for pixel
unsigned char b = rgbImage[rgbOffset + 2]; // blue value for pixel
// perform the rescaling and store it
// We multiply by floating point constants
grayImage[grayOffset] = 0.21f*r + 0.71f*g + 0.07f*b;
}
}
Image Blur as a 2D Kernel(图像模糊)
__global__void blurKernel(unsigned char * in, unsigned char * out, int w, int h) {
int Col = blockIdx.x * blockDim.x + threadIdx.x;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
if (Col < w && Row < h) {
int pixVal = 0;
int pixels = 0;
// Get the average of the surrounding 2xBLUR_SIZE x 2xBLUR_SIZE box
for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE+1; ++blurRow) {
for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE+1; ++blurCol) {
int curRow = Row + blurRow;
int curCol = Col + blurCol;
// Verify we have a valid image pixel
if(curRow > -1 && curRow < h && curCol > -1 && curCol < w) {
pixVal += in[curRow * w + curCol];
pixels++; // Keep track of number of pixels in the accumulated total
}
}
}
// Write our new pixel value out
out[Row * w + Col] = (unsigned char)(pixVal / pixels);
}
}
Module 4
A Basic Matrix Multiplication
__global__ void MatrixMulKernel(float* M, float* N, float* P, int Width) {
// Calculate the row index of the P element and M
int Row = blockIdx.y*blockDim.y+threadIdx.y;
// Calculate the column index of P and N
int Col = blockIdx.x*blockDim.x+threadIdx.x;
if ((Row < Width) && (Col < Width)) {
float Pvalue = 0;
// each thread computes one element of the block sub-matrix
for (int k = 0; k < Width; ++k) {
Pvalue += M[Row*Width+k]*N[k*Width+Col];
}
P[Row*Width+Col] = Pvalue;
}
}
Tiled Matrix Multiplication Kernel
__global__ void MatrixMulKernel(float* M, float* N, float* P, Int Width) {
__shared__ float ds_M[TILE_WIDTH][TILE_WIDTH];
__shared__ float ds_N[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x; int by = blockIdx.y;
int tx = threadIdx.x; int ty = threadIdx.y;
int Row = by * blockDim.y + ty;
int Col = bx * blockDim.x + tx;
float Pvalue = 0;
// Loop over the M and N tiles required to compute the P element
for (int p = 0; p < n/TILE_WIDTH; ++p) {
// Collaborative loading of M and N tiles into shared memory
ds_M[ty][tx] = M[Row*Width + p*TILE_WIDTH+tx];
ds_N[ty][tx] = N[(p*TILE_WIDTH+ty)*Width + Col];
__syncthreads();
for (int i = 0; i < TILE_WIDTH; ++i)
Pvalue += ds_M[ty][i] * ds_N[i][tx];
__synchthreads();
}
P[Row*Width+Col] = Pvalue;
}
Loading Elements – with boundary check
for (int p = 0; p < (Width-1) / TILE_WIDTH + 1; ++p) {
if(Row < Width && p * TILE_WIDTH+tx < Width) {
ds_M[ty][tx] = M[Row * Width + p * TILE_WIDTH + tx];
}
else {
ds_M[ty][tx] = 0.0;
}
if (p*TILE_WIDTH+ty < Width && Col < Width) {
ds_N[ty][tx] = N[(p*TILE_WIDTH + ty) * Width + Col];
}
else {
ds_N[ty][tx] = 0.0;
}
__syncthreads();
Module 7 Histgram
A Basic Histogram Kernel
__global__ void histo_kernel(unsigned int *buffer,long size, unsigned int *histo)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
// stride is total number of threads
int stride = blockDim.x * gridDim.x;
// All threads handle blockDim.x * gridDim.x
// consecutive elements
while (i < size) {
atomicAdd(&(histo[buffer[i]]),1);
i += stride;
}
}
A Basic Text Histogram Kernel
__global__ void histo_kernel(unsigned char *buffer,long size, unsigned int *histo)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
// stride is total number of threads
int stride = blockDim.x * gridDim.x;
// All threads handle blockDim.x * gridDim.x
// consecutive elements
while (i < size) {
int alphabet_position = buffer[i] - "a";
if(alphabet_position >= 0 && alphabet_position < 26){
atomicAdd(&(histo[alphabet_position]),1);
i += stride;
}
}
}
A Basic Text Histogram Kernel(四个字母一个箱子)
__global__ void histo_kernel(unsigned char *buffer,long size, unsigned int *histo)
{
int i = threadIdx.x + blockIdx.x * blockDim.x;
// stride is total number of threads
int stride = blockDim.x * gridDim.x;
// All threads handle blockDim.x * gridDim.x
// consecutive elements
while (i < size) {
int alphabet_position = buffer[i] - "a";
if(alphabet_position >= 0 && alphabet_position < 26){
atomicAdd(&(histo[alphabet_position / 4]),1);
i += stride;
}
}
}
Shared Memory Atomics Requires Privatization
//Create private copies of the histo[] array for each thread block
__global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo)
{
__shared__ unsigned int histo_private[7];
if (threadIdx.x < 7)
histo_private[threadidx.x] = 0;
__syncthreads();
Build Private Histogram
int i = threadIdx.x + blockIdx.x * blockDim.x;
// stride is total number of threads
int stride = blockDim.x * gridDim.x;
while (i < size) {
atomicAdd( &(private_histo[buffer[i]/4), 1);
i += stride;
}
Build Final Histogram
// wait for all other threads in the block to finish
__syncthreads();
if (threadIdx.x < 7) {
atomicAdd(&(histo[threadIdx.x]), private_histo[threadIdx.x] );
}
}
Module 8 Stencil
具有边界条件处理的一维卷积核
__global__ void convolution_1D_basic_kernel(float *N, float *M, float *P, int Mask_Width, int Width) {
int i = blockIdx.x*blockDim.x + threadIdx.x;
float Pvalue = 0;
int N_start_point = i - (Mask_Width/2);
if (i < Width) {
for (int j = 0; j < Mask_Width; j++) {
if (N_start_point + j >= 0 && N_start_point + j < Width) {
Pvalue += N[N_start_point + j]*M[j];
}
}
P[i] = Pvalue;
}
}
具有边界条件处理的二维卷积核
__global__ void convolution_2D_basic_kernel(unsigned char * in, unsigned char * mask, unsigned char * out, int maskwidth, int w, int h) {
int Col = blockIdx.x * blockDim.x + threadIdx.x;
int Row = blockIdx.y * blockDim.y + threadIdx.y;
if (Col < w && Row < h) {
int pixVal = 0;
N_start_col = Col - (maskwidth/2);
N_start_row = Row - (maskwidth/2);
// Get the of the surrounding box
for(int j = 0; j < maskwidth; ++j) {
for(int k = 0; k < maskwidth; ++k) {
int curRow = N_Start_row + j;
int curCol = N_start_col + k;
// Verify we have a valid image pixel
if(curRow > -1 && curRow < h && curCol > -1 && curCol < w) {
pixVal += in[curRow * w + curCol] * mask[j*maskwidth+k];
}
}
}
// Write our new pixel value out
out[Row * w + Col] = (unsigned char)(pixVal);
}
}
分块卷积算法(1D)
All Threads Participate in Loading Input Tiles
float output = 0.0f;
if((index_i >= 0) && (index_i < Width)) {
Ns[tx] = N[index_i];
}
else{
Ns[tx] = 0.0f;
}
//N:global memory
//Ns:shared memory
Some threads do not participate in calculating output
#define O_TILE_WIDTH 1020
#define BLOCK_WIDTH (O_TILE_WIDTH + 4)
dim3 dimBlock(BLOCK_WIDTH,1, 1);
dim3 dimGrid((Width-1)/O_TILE_WIDTH+1, 1, 1)
setting block size
if (threadIdx.x < O_TILE_WIDTH){
output = 0.0f;
for(j = 0; j < Mask_Width; j++) {
output += M[j] * Ns[j+threadIdx.x];
}
P[index_o] = output;
}
1D convolution using shared memory
__global__ void convolution_1D_sm_st2(float *N, float *M, float *P) {
__shared__ float Ns[O_TILE_WIDTH + Mask_Width - 1];
float Pvalue;
int N_start_point;
int tx=threadIdx.x;
int idx_o = blockIdx.x*O_TILE_WIDTH + threadIdx.x;
int idx_i = idx_o - Mask_Width / 2; if ((idx_i >= 0) && (idx_i < Array_Width)){
Ns[tx] = N[idx_i];
}
__syncthreads();
// //仅仅 Threads 0 到 O_TILE_WIDTH - 1 参与计算输出.
if (tx< O_TILE_WIDTH) {
Pvalue = 0;
for (int j = 0; j < Mask_Width; j++) {
Pvalue += Ns[tx + j] * M[j];
}
P[idx_o] = Pvalue;
__syncthreads();
}
}
分块卷积算法(2D)
Image Matrix Type in this Course
// Image Matrix Structure declaration
//
typedef struct {
int width;
int height;
int pitch;
int channels;
float* data;
} * wbImage_t;
seting block size
#define O_TILE_WIDTH 12
#define BLOCK_WIDTH (O_TILE_WIDTH + 4)
dim3 dimBlock(BLOCK_WIDTH,BLOCK_WIDTH);
dim3 dimGrid((wbImage_getWidth(N)-1)/O_TILE_WIDTH+1(wbImage_getHeight(N)-1)/O_TILE_WIDTH+1,1)
Shifting from output coordinates to input coordinate
int tx = threadIdx.x;
int ty = threadIdx.y;
int row_o = blockIdx.y*O_TILE_WIDTH + ty;
int col_o = blockIdx.x*O_TILE_WIDTH + tx;
int row_i = row_o - 2;
int col_i = col_o - 2;
Taking Care of Boundaries (1 channel example)
if((row_i >= 0) && (row_i < height) &&
(col_i >= 0) && (col_i < width)) {
Ns[ty][tx] = data[row_i * width + col_i];
} else{
Ns[ty][tx] = 0.0f;
}
Some threads do not participate in calculating output. (1 channel example)
float output = 0.0f;
if(ty < O_TILE_WIDTH && tx < O_TILE_WIDTH){
for(i = 0; i < MASK_WIDTH; i++) {
for(j = 0; j < MASK_WIDTH; j++) {
output += M[i][j] * Ns[i+ty][j+tx];
}
}
Some threads do not write output (1 channel example)
if(row_o < height && col_o < width)
data[row_o*width + col_o] = output;
Module 9 Reduction
A Basic Reduction Kernel
//A Simple Thread Block Design
//Each thread block takes 2*BlockDim.x input elements
//Each thread loads 2 elements into shared memory
__shared__ float partialSum[2*BLOCK_SIZE];
unsigned int tx = threadIdx.x;
unsigned int start = 2*blockIdx.x*blockDim.x;
partialSum[tx] = input[start + tx];
partialSum[blockDim+tx] = input[start + blockDim.x+tx];
//The Reduction Steps
for (unsigned int stride = 1;stride <= blockDim.x; stride *= 2)
{
__syncthreads();
if (tx % stride == 0)
partialSum[2*tx]+= partialSum[2*tx+stride];
}
A Better Reduction Kernel
for (unsigned int stride = blockDim.x; stride > 0; stride /= 2)
{
__syncthreads();
if (t < stride)
partialSum[t] += partialSum[t+stride];
}
Module 10 Scan
A Work-Inefficient Inclusive Scan Kernel
__global__ void work_inefficient_scan_kernel(float *X, float *Y, int InputSize){
__shared__ float XY[SECTION_SIZE];
int i= blockIdx.x * blockDim.x + threadIdx.x;
if (i < InputSize) {
XY[threadIdx.x] = X[i];
}
// the code below performs iterative scan on XY
for (unsigned int stride = 1; stride <= threadIdx.x; stride *= 2) {
__syncthreads();
float in1 = XY[threadIdx.x - stride];
__syncthreads();
XY[threadIdx.x] += in1;
}
__ syncthreads();
If (i < InputSize) {
Y[i] = XY[threadIdx.x];
}
}
A Work-Efficient Inclusive Scan Kernel
__global__ void work_efficient_scan_kernel(float *X, float *Y, int InputSize){
__shared__ float XY[SECTION_SIZE];
int i= blockIdx.x * blockDim.x + threadIdx.x;
if (i < InputSize) {
XY[threadIdx.x] = X[i];
}
for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
__syncthreads();
int index = (threadIdx.x+1) * 2* stride -1;
if (index < SECTION_SIZE) {
XY[index] += XY[index - stride];
}
}
for (int stride = blockDim.x /2; stride > 0; stride /= 2) {
__syncthreads();
int index = (threadIdx.x+1) * stride * 2 -1;
if (index + stride < SECTION_SIZE) {
XY[index + stride] += XY[index];
}
}
__syncthreads();
Y[i] = XY[threadIdx.x];
Module 11 稀疏矩阵
串行SpMV/CSR
for (int row = 0; row < num_rows; row++) {
float dot = 0;
int row_start = row_ptr[row];
int row_end = row_ptr[row+1];
for (int elem = row_start; elem < row_end; elem++) {
dot += data[elem] * x[col_index[elem]];
}
y[row] += dot;
}
并行SpMV/CSR(核函数)
__global__ void SpMV_CSR(int num_rows, float *data, int *col_index, int *row_ptr, float *x,float *y) {
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row < num_rows) {
float dot = 0;
int row_start = row_ptr[row];
int row_end = row_ptr[row+1];
for (int elem = row_start; elem < row_end; elem++) {
dot += data[elem] * x[col_index[elem]];
}
y[row] += dot;
}
}
并行SpMV/ELL(核函数)
__global__ void SpMV_ELL(int num_rows, float *data, int *col_index, int num_elem, float *x, float *y)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
int col;
if (row < num_rows) {
float dot = 0;
for (int i = 0; i < num_elem; i++) {
col = col_index[row+i*num_rows];
dot += data[row+i*num_rows] * x[col];
}
y[row] += dot;
}
}
SpMV kernel using ELL-COO hybrid format
__global__ void SpMV_Hybrid(int num_rows, float *data_ell, int *col_index_ell, int num_elem_ell, float *data_coo, int *row_index_coo, int *col_index_coo, int num_elem_coo, float *x, float *y) {
int row = blockIdx.x * blockDim.x + threadIdx.x;
int i;
if (row < num_rows) {
float dot = 0;
// Process using ELL
for (i = 0; i < num_elem_ell; i++) {
// Assuming -1 is used to indicate no entry
int col = col_index_ell[row + i * num_rows];
if (col != -1) {
// Assuming -1 is used to indicate no entry
dot += data_ell[row+i*num_rows] * x[col];
}
}
// Process using COO
for (i = 0; i < num_elem_coo; i++) {
if (row_index_coo[i] == row) {
dot += data_coo[i] * x[col_index_coo[i]];
}
}
}
y[row] += dot;
}
SpMV kernel using JDS format
__global__ void spmv_jds (
float * data, // Non-zero values in JDS format
int * col_index, // Column indices in JDS format
int*row_perm, //Rowpermutationarray
int * row_nnz, // The number of non-zero elements per row
float* x, //Input vector x
float* y,//Output vector y
int num_rows// Number of rows in the matrix
){
int row = blockDim.x * blockIdx.x + threadIdx.x;
if (row < num_rows) {
float dot = 0.0f;
int row_start = 0;
for (int j = 0; j < row; j++) {
row_start += row_nnz[j];
}
int row_end = row_start + row_nnz[row];
for (int j = row_start; j < row_end; j++) {
dot += data[j] * x[col_index[j]];
}
y[row_perm[row]] = dot;
}
}
代码来自PPT,仅供期末复习参考