CUDA C 代码

Koiwan

已于 2024-01-05 11:14:42 修改

阅读量490

点赞数 8

文章标签：开发语言

于 2023-12-30 23:22:44 首次发布

本文链接：https://blog.csdn.net/qq_64381400/article/details/135310944

版权

Module 2

Vector Addition – Traditional C Code

// Compute vector sum C = A + B
void vecAdd(float *h_A, float *h_B, float *h_C, int n)
{
int i;
    for (i = 0; i<n; i++) h_C[i] = h_A[i] + h_B[i];
}
int main()
{
vecAdd(h_A, h_B, h_C, N); }

vecAdd CUDA

Host Code

#include <cuda.h>
void vecAdd(float *h_A, float *h_B, float *h_C, int n) {
    int size = n* sizeof(float);
    float *d_A, *d_B, *d_C;
    dim3 DimGrid((n-1)/256 + 1, 1, 1);
    dim3 DimBlock(256, 1, 1);
    vecAddKernel<<<DimGrid,DimBlock>>>(d_A, d_B, d_C, n);
// Part 1
// Allocate device memory for A, B, and C // copy A and B to device memory
// Part 2
// Kernel launch code – the device performs the actual vector addition
// Part 3
// copy C from the device memory // Free device vectors
    cudaMalloc((void **) &d_A, size);
    cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &d_B, size);
    cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice);
    cudaMalloc((void **) &d_C, size);
// Kernel invocation code – to be shown later
    cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost);
    cudaFree(d_A); cudaFree(d_B); cudaFree (d_C); 
}

Device Code

// Compute vector sum C = A + B
// Each thread performs one pair-wise addition
__global__void vecAddKernel(float* A, float* B, float* C, int n){
    int i = threadIdx.x+blockDim.x*blockIdx.x;
    if(i<n) C[i] = A[i] + B[i];
}

Hello World! with Device Code

__global__ void mykernel(void) {
    }
    int main(void) {
        mykernel<<<1,1>>>();
        printf("Hello World!\n");
        return 0;
}

Module 3 CUDA并行模型

Source Code of a PictureKernel

__global__ void PictureKernel(float* d_Pin, float* d_Pout, int height, int width)
{
// Calculate the row # of the d_Pin and d_Pout element 
    int Row = blockIdx.y*blockDim.y + threadIdx.y;
// Calculate the column # of the d_Pin and d_Pout element
    int Col = blockIdx.x*blockDim.x + threadIdx.x;
// each thread computes one element of d_Pout if in range 
    if ((Row < height) && (Col < width)) {
        d_Pout[Row*width+Col] = 2.0*d_Pin[Row*width+Col]; 
    }
}

RGB转换到灰度图代码

#define CHANNELS 3 
// we have 3 channels corresponding to RGB 
// The input image is encoded as unsigned characters [0, 255] 
__global__ void colorConvert(unsigned char * grayImage,int width, int height) {
    int x = threadIdx.x + blockIdx.x * blockDim.x; 
    int y = threadIdx.y + blockIdx.y * blockDim.y;
    if (x < width && y < height) {
        // get 1D coordinate for the grayscale image
        int grayOffset = y*width + x;
        // one can think of the RGB image having
        // CHANNEL times columns than the gray scale image
        int rgbOffset = grayOffset*CHANNELS;
        unsigned char r = rgbImage[rgbOffset ];// red value for pixel 
        unsigned char g = rgbImage[rgbOffset + 1]; // green value for pixel  
        unsigned char b = rgbImage[rgbOffset + 2]; // blue value for pixel 
        // perform the rescaling and store it
        // We multiply by floating point constants
        grayImage[grayOffset] = 0.21f*r + 0.71f*g + 0.07f*b;
    } 
}

Image Blur as a 2D Kernel（图像模糊）

__global__void blurKernel(unsigned char * in, unsigned char * out, int w, int h) {
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    if (Col < w && Row < h) { 
        int pixVal = 0;
        int pixels = 0;
        // Get the average of the surrounding 2xBLUR_SIZE x 2xBLUR_SIZE box
        for(int blurRow = -BLUR_SIZE; blurRow < BLUR_SIZE+1; ++blurRow) { 
            for(int blurCol = -BLUR_SIZE; blurCol < BLUR_SIZE+1; ++blurCol) {
                int curRow = Row + blurRow;
                int curCol = Col + blurCol;
                // Verify we have a valid image pixel
                if(curRow > -1 && curRow < h && curCol > -1 && curCol < w) {
                    pixVal += in[curRow * w + curCol];
                    pixels++; // Keep track of number of pixels in the accumulated total 
                }
            } 
        }
// Write our new pixel value out
out[Row * w + Col] = (unsigned char)(pixVal / pixels); 
    }
}

Module 4

A Basic Matrix Multiplication

__global__ void MatrixMulKernel(float* M, float* N, float* P, int Width) {
    // Calculate the row index of the P element and M 
    int Row = blockIdx.y*blockDim.y+threadIdx.y;
    // Calculate the column index of P and N
    int Col = blockIdx.x*blockDim.x+threadIdx.x;
    if ((Row < Width) && (Col < Width)) {
        float Pvalue = 0;
        // each thread computes one element of the block sub-matrix 
            for (int k = 0; k < Width; ++k) {
                Pvalue += M[Row*Width+k]*N[k*Width+Col]; 
            }
            P[Row*Width+Col] = Pvalue; 
    }
}

Tiled Matrix Multiplication Kernel

__global__ void MatrixMulKernel(float* M, float* N, float* P, Int Width) {
    __shared__ float ds_M[TILE_WIDTH][TILE_WIDTH]; 
    __shared__ float ds_N[TILE_WIDTH][TILE_WIDTH];
    int bx = blockIdx.x; int by = blockIdx.y; 
    int tx = threadIdx.x; int ty = threadIdx.y;
    int Row = by * blockDim.y + ty; 
    int Col = bx * blockDim.x + tx; 
    float Pvalue = 0;
    // Loop over the M and N tiles required to compute the P element 
    for (int p = 0; p < n/TILE_WIDTH; ++p) {
        // Collaborative loading of M and N tiles into shared memory 
        ds_M[ty][tx] = M[Row*Width + p*TILE_WIDTH+tx];
        ds_N[ty][tx] = N[(p*TILE_WIDTH+ty)*Width + Col]; 
        __syncthreads();
        for (int i = 0; i < TILE_WIDTH; ++i)
            Pvalue += ds_M[ty][i] * ds_N[i][tx];
        __synchthreads();
    }
    P[Row*Width+Col] = Pvalue; 
}

Loading Elements – with boundary check

for (int p = 0; p < (Width-1) / TILE_WIDTH + 1; ++p) {
    if(Row < Width && p * TILE_WIDTH+tx < Width) { 
        ds_M[ty][tx] = M[Row * Width + p * TILE_WIDTH + tx];
    } 
    else {
        ds_M[ty][tx] = 0.0;
    }
    if (p*TILE_WIDTH+ty < Width && Col < Width) {
        ds_N[ty][tx] = N[(p*TILE_WIDTH + ty) * Width + Col];
    }
    else {
        ds_N[ty][tx] = 0.0; 
    }
__syncthreads();

Module 7 Histgram

A Basic Histogram Kernel


__global__ void histo_kernel(unsigned int *buffer,long size, unsigned int *histo)
{
     int i = threadIdx.x + blockIdx.x * blockDim.x;
     // stride is total number of threads
     int stride = blockDim.x * gridDim.x;
     // All threads handle blockDim.x * gridDim.x
    // consecutive elements
    while (i < size) {
        atomicAdd(&(histo[buffer[i]]),1);
            i += stride;
    }
}

A Basic Text Histogram Kernel

__global__ void histo_kernel(unsigned char *buffer,long size, unsigned int *histo)
{
     int i = threadIdx.x + blockIdx.x * blockDim.x;
     // stride is total number of threads
     int stride = blockDim.x * gridDim.x;
     // All threads handle blockDim.x * gridDim.x
    // consecutive elements
    while (i < size) {
        int alphabet_position = buffer[i] - "a";
        if(alphabet_position >= 0 && alphabet_position < 26){
            atomicAdd(&(histo[alphabet_position]),1);
            i += stride;
        }
    }
}

A Basic Text Histogram Kernel(四个字母一个箱子）


__global__ void histo_kernel(unsigned char *buffer,long size, unsigned int *histo)
{
     int i = threadIdx.x + blockIdx.x * blockDim.x;
     // stride is total number of threads
     int stride = blockDim.x * gridDim.x;
     // All threads handle blockDim.x * gridDim.x
    // consecutive elements
    while (i < size) {
        int alphabet_position = buffer[i] - "a";
        if(alphabet_position >= 0 && alphabet_position < 26){
            atomicAdd(&(histo[alphabet_position / 4]),1);
            i += stride;
        }
    }
}

Shared Memory Atomics Requires Privatization

//Create private copies of the histo[] array for each thread block
__global__ void histo_kernel(unsigned char *buffer, long size, unsigned int *histo)
{
    __shared__ unsigned int histo_private[7];
    if (threadIdx.x < 7) 
        histo_private[threadidx.x] = 0;
     __syncthreads();

Build Private Histogram

int i = threadIdx.x + blockIdx.x * blockDim.x; 
// stride is total number of threads
int stride = blockDim.x * gridDim.x; 
while (i < size) {
    atomicAdd( &(private_histo[buffer[i]/4), 1);
    i += stride;
}

Build Final Histogram

// wait for all other threads in the block to finish 
__syncthreads();
if (threadIdx.x < 7) {
    atomicAdd(&(histo[threadIdx.x]), private_histo[threadIdx.x] );
} 
}

Module 8 Stencil

具有边界条件处理的一维卷积核

__global__ void convolution_1D_basic_kernel(float *N, float *M, float *P, int Mask_Width, int Width) {
	int i = blockIdx.x*blockDim.x + threadIdx.x;
    
	float Pvalue = 0;
	int N_start_point = i - (Mask_Width/2);
	if (i < Width) {
		for (int j = 0; j < Mask_Width; j++) {
			if (N_start_point + j >= 0 && N_start_point + j < Width) {
				Pvalue += N[N_start_point + j]*M[j];
			}
		}
		P[i] = Pvalue;
    }
}

具有边界条件处理的二维卷积核

__global__ void convolution_2D_basic_kernel(unsigned char * in, unsigned char * mask, unsigned char * out, int maskwidth, int w, int h) {
	int Col  = blockIdx.x * blockDim.x + threadIdx.x;
	int Row  = blockIdx.y * blockDim.y + threadIdx.y;
    
	if (Col < w && Row < h) {
		int pixVal = 0;
        
		N_start_col = Col - (maskwidth/2);
		N_start_row = Row - (maskwidth/2);
		// Get the of the surrounding box
		for(int j = 0; j < maskwidth; ++j) {
			for(int k = 0; k < maskwidth; ++k) {
				int curRow = N_Start_row + j;
				int curCol = N_start_col + k;
				// Verify we have a valid image pixel
				if(curRow > -1 && curRow < h && curCol > -1 && curCol < w) {
					pixVal += in[curRow * w + curCol] * mask[j*maskwidth+k];
				}
			}
		}
		// Write our new pixel value out
		out[Row * w + Col] = (unsigned char)(pixVal);
	}
}

分块卷积算法(1D)

All Threads Participate in Loading Input Tiles

float output = 0.0f;
     if((index_i >= 0) && (index_i < Width)) {
       Ns[tx] = N[index_i];
}
else{
       Ns[tx] = 0.0f;
     }
//N:global memory
//Ns:shared memory

Some threads do not participate in calculating output

#define O_TILE_WIDTH 1020
#define BLOCK_WIDTH (O_TILE_WIDTH + 4)
dim3 dimBlock(BLOCK_WIDTH,1, 1); 
dim3 dimGrid((Width-1)/O_TILE_WIDTH+1, 1, 1)

setting block size

if (threadIdx.x < O_TILE_WIDTH){ 
    output = 0.0f; 
    for(j = 0; j < Mask_Width; j++) {
        output += M[j] * Ns[j+threadIdx.x];
        }
    P[index_o] = output;
}

1D convolution using shared memory

__global__ void convolution_1D_sm_st2(float *N, float *M, float *P) {
    __shared__ float Ns[O_TILE_WIDTH + Mask_Width - 1]; 
    float Pvalue;
    int N_start_point;
    int tx=threadIdx.x;
    int idx_o = blockIdx.x*O_TILE_WIDTH + threadIdx.x; 
    int idx_i = idx_o - Mask_Width / 2; if ((idx_i >= 0) && (idx_i < Array_Width)){
        Ns[tx] = N[idx_i];
    }
    __syncthreads();
// //仅仅 Threads 0 到 O_TILE_WIDTH - 1 参与计算输出. 
    if (tx< O_TILE_WIDTH) {
        Pvalue = 0;
        for (int j = 0; j < Mask_Width; j++) {
            Pvalue += Ns[tx + j] * M[j];
        }
        P[idx_o] = Pvalue;
        __syncthreads();
    }
}

分块卷积算法(2D)

Image Matrix Type in this Course

   // Image Matrix Structure declaration
   //
   typedef struct {
      int width;
      int height;
      int pitch;
      int channels;
      float* data;
   } * wbImage_t;

seting block size

#define O_TILE_WIDTH 12 
#define BLOCK_WIDTH (O_TILE_WIDTH + 4)
dim3 dimBlock(BLOCK_WIDTH,BLOCK_WIDTH);
dim3 dimGrid((wbImage_getWidth(N)-1)/O_TILE_WIDTH+1(wbImage_getHeight(N)-1)/O_TILE_WIDTH+1,1)

Shifting from output coordinates to input coordinate

int tx = threadIdx.x;
int ty = threadIdx.y;
int row_o = blockIdx.y*O_TILE_WIDTH + ty;
int col_o = blockIdx.x*O_TILE_WIDTH + tx;
int row_i = row_o - 2;
int col_i = col_o - 2;

Taking Care of Boundaries (1 channel example)

   if((row_i >= 0) && (row_i < height) &&
      (col_i >= 0)  && (col_i < width)) {
     Ns[ty][tx] = data[row_i * width + col_i];
   } else{
     Ns[ty][tx] = 0.0f;
   }

Some threads do not participate in calculating output. (1 channel example)

   float output = 0.0f;
   if(ty < O_TILE_WIDTH && tx < O_TILE_WIDTH){
      for(i = 0; i < MASK_WIDTH; i++) {
        for(j = 0; j < MASK_WIDTH; j++) {
          output += M[i][j] * Ns[i+ty][j+tx];
      }
}

Some threads do not write output (1 channel example)

   if(row_o < height && col_o < width)
     data[row_o*width + col_o] = output;

Module 9 Reduction

A Basic Reduction Kernel

//A Simple Thread Block Design
//Each thread block takes 2*BlockDim.x input elements 
//Each thread loads 2 elements into shared memory
__shared__ float partialSum[2*BLOCK_SIZE];
unsigned int tx = threadIdx.x; 
unsigned int start = 2*blockIdx.x*blockDim.x; 
partialSum[tx] = input[start + tx]; 
partialSum[blockDim+tx] = input[start + blockDim.x+tx];

//The Reduction Steps
for (unsigned int stride = 1;stride <= blockDim.x;  stride *= 2)
{
     __syncthreads();
     if (tx % stride == 0)
        partialSum[2*tx]+= partialSum[2*tx+stride]; 
}

A Better Reduction Kernel

for (unsigned int stride = blockDim.x; stride > 0; stride /= 2)
{
    __syncthreads();
    if (t < stride)
        partialSum[t] += partialSum[t+stride]; 
}

Module 10 Scan

A Work-Inefficient Inclusive Scan Kernel

__global__ void work_inefficient_scan_kernel(float *X, float *Y, int InputSize){
     __shared__ float XY[SECTION_SIZE];
    int i= blockIdx.x * blockDim.x + threadIdx.x;
    if (i < InputSize) {
        XY[threadIdx.x] = X[i];
    }
// the code below performs iterative scan on XY
    for (unsigned int stride = 1; stride <= threadIdx.x; stride *= 2) {
         __syncthreads();
        float in1 = XY[threadIdx.x - stride];
        __syncthreads();
        XY[threadIdx.x] += in1; 
    }
    __ syncthreads(); 
    If (i < InputSize) {
        Y[i] = XY[threadIdx.x];
    } 
}

A Work-Efficient Inclusive Scan Kernel

__global__ void work_efficient_scan_kernel(float *X, float *Y, int InputSize){
     __shared__ float XY[SECTION_SIZE];
    int i= blockIdx.x * blockDim.x + threadIdx.x;
    if (i < InputSize) {
        XY[threadIdx.x] = X[i];
    }
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) { 
    __syncthreads();
    int index = (threadIdx.x+1) * 2* stride -1;
    if (index < SECTION_SIZE) {
        XY[index] += XY[index - stride]; 
    }
}

for (int stride = blockDim.x /2; stride > 0; stride /= 2) { 
    __syncthreads();
    int index = (threadIdx.x+1) * stride * 2 -1;
    if (index + stride < SECTION_SIZE) {
    XY[index + stride] += XY[index]; 
    }
}
__syncthreads();
Y[i] = XY[threadIdx.x];

Module 11 稀疏矩阵

串行SpMV/CSR

for (int row = 0; row < num_rows; row++) {
    float dot = 0;
    int row_start = row_ptr[row];
    int row_end = row_ptr[row+1];
    for (int elem = row_start; elem < row_end; elem++) { 
        dot += data[elem] * x[col_index[elem]];
    }
    y[row] += dot;
}

并行SpMV/CSR（核函数）

__global__ void SpMV_CSR(int num_rows, float *data, int *col_index, int *row_ptr, float *x,float *y) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < num_rows) {
        float dot = 0;
        int row_start = row_ptr[row];
        int row_end = row_ptr[row+1];
        for (int elem = row_start; elem < row_end; elem++) {
            dot += data[elem] * x[col_index[elem]]; 
        }
        y[row] += dot;
     }
}

并行SpMV/ELL（核函数）

__global__ void SpMV_ELL(int num_rows, float *data, int *col_index, int num_elem, float *x, float *y)
{
    int row = blockIdx.x * blockDim.x + threadIdx.x; 
    int col;
    if (row < num_rows) { 
        float dot = 0;
        for (int i = 0; i < num_elem; i++) {
            col = col_index[row+i*num_rows];
            dot += data[row+i*num_rows] * x[col];
        }
        y[row] += dot; 
    }
}

SpMV kernel using ELL-COO hybrid format

__global__ void SpMV_Hybrid(int num_rows, float *data_ell, int *col_index_ell, int num_elem_ell, float *data_coo, int *row_index_coo, int *col_index_coo, int num_elem_coo, float *x, float *y) {
    int row = blockIdx.x * blockDim.x + threadIdx.x; 
    int i;
    if (row < num_rows) {
        float dot = 0;
// Process using ELL
        for (i = 0; i < num_elem_ell; i++) {
// Assuming -1 is used to indicate no entry
            int col = col_index_ell[row + i * num_rows];
            if (col != -1) { 
// Assuming -1 is used to indicate no entry
                dot += data_ell[row+i*num_rows] * x[col]; 
            }
        }
// Process using COO
        for (i = 0; i < num_elem_coo; i++) { 
            if (row_index_coo[i] == row) {
                dot += data_coo[i] * x[col_index_coo[i]]; 
            }
        }
    }
    y[row] += dot;
}

SpMV kernel using JDS format

__global__ void spmv_jds (
    float * data, // Non-zero values in JDS format 
    int * col_index, // Column indices in JDS format 
    int*row_perm, //Rowpermutationarray
    int * row_nnz, // The number of non-zero elements per row 
    float* x, //Input vector x
    float* y,//Output vector y
    int num_rows// Number of rows in the matrix
){
    int row = blockDim.x * blockIdx.x + threadIdx.x; 
    if (row < num_rows) {
        float dot = 0.0f;
        int row_start = 0;
        for (int j = 0; j < row; j++) {
            row_start += row_nnz[j]; 
        }
        int row_end = row_start + row_nnz[row];
        for (int j = row_start; j < row_end; j++) { 
            dot += data[j] * x[col_index[j]];
        }
        y[row_perm[row]] = dot; 
    }
}

代码来自PPT，仅供期末复习参考