Optimizing Matrix Multiplication in CUDA

1. Optimize parallel matrix multiplication in the following ways (launch sketches follow the relevant kernels below):

Naive implementation
Thread-block tile + shared memory
Thread tile
Transposed, vectorized memory access
Double buffering
/*
Parallel matrix multiplication
Usage example: ./run.sh 4096
*/
#include <stdio.h>
#include <stdlib.h>
// cuBLAS requires this header
#include <cublas_v2.h>

// Global index of the element at row `row`, column `col` of a row-major matrix;
// ld is the leading dimension (the row width)
#define OFFSET(row, col, ld) ((row) * (ld) + (col))

// Reinterpret four consecutive floats as a single float4 (one 128-bit access)
#define FETCH_FLOAT4(pointer) (reinterpret_cast<float4*>(&(pointer))[0])
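
// A minimal sketch of how FETCH_FLOAT4 is used (copy4 is a hypothetical kernel,
// not part of the original code): each thread moves four consecutive floats with
// one 128-bit load and one 128-bit store. Assumes 16-byte-aligned pointers and a
// length that is a multiple of 4.
__global__ void copy4(float *in, float *out)
{
    int idx = 4 * (blockDim.x * blockIdx.x + threadIdx.x);
    FETCH_FLOAT4(out[idx]) = FETCH_FLOAT4(in[idx]);
}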

// SGEMM reference implementation on the host
void SgemmonHost(float *A, float *B, float *C, int M, int N, int K)
{
    for(int i = 0; i < M; i++)
    {
        for(int j = 0; j < N; j++)
        {
            float tmp = 0;
            for(int k = 0; k < K; k++)
            {
                tmp += A[i * K + k] * B[k * N + j]; //A[i][k] * B[k][j]
            }
            C[i * N + j] = tmp; //C[i][j]
        }
    }
}

// SGEMM with a single block (assumes blockDim == (N, M))
__global__ void SGEMMBlockKernel(float *A, float *B, float *C, int M, int N, int K)
{
    float tmp = 0.0;
    int tx = threadIdx.x , ty = threadIdx.y;
    for(int i = 0; i < K; i++)
    {
        tmp += A[ty * K + i] * B[i * N + tx]; //A[ty][i] * B[i][tx]
    }
    C[ty * N + tx] = tmp; //C[ty][tx]
}
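
// Launch sketch for the single-block kernel (an assumption; the original main()
// only launches the final kernel):
//     SGEMMBlockKernel<<<1, dim3(N, M)>>>(d_A, d_B, d_C, M, N, K);
// This only works while M*N <= 1024 (the per-block thread limit), which is why
// the multi-block version below is needed.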

// SGEMM with multiple blocks, naive implementation
__global__ void SGEMMKernelNaive(float *A, float *B, float *C, int M, int N, int K)
{
    int tx = blockIdx.x * blockDim.x + threadIdx.x;
    int ty = blockIdx.y * blockDim.y + threadIdx.y;
    if(tx < N && ty < M)  // tx is the column index (< N), ty the row index (< M)
    {
        float tmp = 0.0;
        for(int i = 0; i < K; i++)
        {
            tmp += A[ty * K + i] * B[i * N + tx]; //A[ty][i] * B[i][tx]
        }
        C[ty * N + tx] = tmp; //C[ty][tx]
    }
}
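
// Launch sketch for the naive kernel (an assumption): one thread per output
// element, e.g.
//     dim3 block(16, 16);
//     dim3 grid((N + 15) / 16, (M + 15) / 16);
//     SGEMMKernelNaive<<<grid, block>>>(d_A, d_B, d_C, M, N, K);
// Each thread streams a full row of A and a full column of B from global memory
// with no reuse, so this version is bound by global-memory bandwidth.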

// SGEMM with thread-block tiling + shared memory
template <
    const int BLOCK_SIZE_M,  // height of the A tile computed by one thread block (bm)
    const int BLOCK_SIZE_K,  // width of the A tile / height of the B tile per iteration (bk)
    const int BLOCK_SIZE_N  // width of the B tile computed by one thread block (bn)
    > 
__global__ void SGEMMBlockTileKernel(float *A, float *B, float *C, int M, int N, int K)
{
    __shared__ float As[BLOCK_SIZE_M][BLOCK_SIZE_K];            
    __shared__ float Bs[BLOCK_SIZE_K][BLOCK_SIZE_N];            
    int ty = blockIdx.y * BLOCK_SIZE_M + threadIdx.y; // row index in C (= row in A)
    int tx = blockIdx.x * BLOCK_SIZE_N + threadIdx.x; // column index in C (= column in B)
    if (tx < N && ty < M)
    {
        float tmp = 0.0;
        for (int i = 0; i < (K / BLOCK_SIZE_K); i++)
        {
            // row index * width of A + tile index * tile width + horizontal offset within the tile
            As[threadIdx.y][threadIdx.x] = A[ty * K + (i * BLOCK_SIZE_K + threadIdx.x)];
            // column index + (tile index * tile height + vertical offset within the tile) * width of B
            Bs[threadIdx.y][threadIdx.x] = B[tx + (i * BLOCK_SIZE_K + threadIdx.y) * N];
            __syncthreads();
            for (int k = 0; k < BLOCK_SIZE_K; k++)
                tmp += As[threadIdx.y][k] * Bs[k][threadIdx.x];
            __syncthreads();
        }
        C[ty * N + tx] = tmp; // C[ty][tx]
    }
}
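
// Launch sketch for the block-tile kernel (an assumption): the kernel writes
// As[threadIdx.y][threadIdx.x] and Bs[threadIdx.y][threadIdx.x] with the same
// thread coordinates, so it implicitly requires square tiles,
// BLOCK_SIZE_M == BLOCK_SIZE_K == BLOCK_SIZE_N == blockDim.x == blockDim.y,
// and M, N, K divisible by the tile size:
//     dim3 block(32, 32);
//     dim3 grid(N / 32, M / 32);
//     SGEMMBlockTileKernel<32, 32, 32><<<grid, block>>>(d_A, d_B, d_C, M, N, K);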



// SGEMM with all of the optimizations above: thread-block tile, thread tile,
// transposed + vectorized memory access, and double buffering
template <
    const int BLOCK_SIZE_M,  // height of the A tile computed by one thread block
    const int BLOCK_SIZE_K,  // width of the A tile / height of the B tile per iteration
    const int BLOCK_SIZE_N,  // width of the B tile computed by one thread block
    const int THREAD_SIZE_Y, // height of the C sub-tile computed by one thread
    const int THREAD_SIZE_X  // width of the C sub-tile computed by one thread
    > 
__global__ void SGEMMBestKernel( 
    float * __restrict__ A,
    float * __restrict__ B,
    float * __restrict__ C, 
    const int M,
    const int N,
    const int K) {  
    int bx = blockIdx.x;   
    int by = blockIdx.y;  
    int tx = threadIdx.x;   
    int ty = threadIdx.y;  
    const int THREAD_X_PER_BLOCK = BLOCK_SIZE_N / THREAD_SIZE_X; 
    const int THREAD_Y_PER_BLOCK = BLOCK_SIZE_M / THREAD_SIZE_Y; 
    const int THREAD_NUM_PER_BLOCK = THREAD_X_PER_BLOCK * THREAD_Y_PER_BLOCK; 
    const int tid = ty * THREAD_X_PER_BLOCK + tx;   
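    // Double-buffered shared-memory tiles: the leading [2] index selects which
    // buffer is read while the other is being filled. As is stored transposed
    // (BLOCK_SIZE_K major) so a thread's THREAD_SIZE_Y-element column of A can
    // be fetched with FETCH_FLOAT4 along the M dimension.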
    __shared__ float As[2][BLOCK_SIZE_K][BLOCK_SIZE_M];
    __shared__ float Bs[2][BLOCK_SIZE_K][BLOCK_SIZE_N]; 
    float tmpC[THREAD_SIZE_Y][THREAD_SIZE_X] = {0};
    float tmpA[2][THREAD_SIZE_Y];
    float tmpB[2][THREAD_SIZE_X];
    const int a_load_count = BLOCK_SIZE_M * BLOCK_SIZE_K / (THREAD_NUM_PER_BLOCK * 4);
    const int b_load_count = BLOCK_SIZE_K * BLOCK_SIZE_N / (THREAD_NUM_PER_BLOCK * 4);
    float load_global_a[4 * a_load_count];
    float load_global_b[4 * b_load_count];
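    // Cooperative tile loading with float4 transfers: a row of the A tile needs
    // BLOCK_SIZE_K / 4 float4 loads, so tid / A_TILE_THREAD_PER_ROW selects the
    // starting row and (tid % A_TILE_THREAD_PER_ROW) * 4 the starting column;
    // *_ROW_STRIDE is how many rows the whole block covers per pass.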
    const int A_TILE_THREAD_PER_ROW = BLOCK_SIZE_K / 4;
    const int B_TILE_THREAD_PER_ROW = BLOCK_SIZE_N / 4;
    const int A_TILE_ROW_START = tid / A_TILE_THREAD_PER_ROW;
    const int A_TILE_COL_START = tid % A_TILE_THREAD_PER_ROW * 4; 
    const int B_TILE_ROW_START = tid / B_TILE_THREAD_PER_ROW;
    const int B_TILE_COL_START = tid % B_TILE_THREAD_PER_ROW * 4;
    const int A_TILE_ROW_STRIDE = THREAD_NUM_PER_BLOCK / A_TILE_THREAD_PER_ROW;  
    const int B_TILE_ROW_STRIDE = THREAD_NUM_PER_BLOCK / B_TILE_THREAD_PER_ROW;
    A = &A[(BLOCK_SIZE_M * by)* K];
    B = &B[BLOCK_SIZE_N * bx];
    #pragma unroll
    for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) {
        int load_global_idx = i / A_TILE_ROW_STRIDE * 4; 
        FETCH_FLOAT4(load_global_a[load_global_idx]) = FETCH_FLOAT4(A[OFFSET(
            A_TILE_ROW_START + i,
            A_TILE_COL_START,
            K )]);
        As[0][A_TILE_COL_START][A_TILE_ROW_START + i]=load_global_a[load_global_idx];
        As[0][A_TILE_COL_START+1][A_TILE_ROW_START + i]=load_global_a[load_global_idx+1];
        As[0][A_TILE_COL_START+2][A_TILE_ROW_START + i]=load_global_a[load_global_idx+2];
        As[0][A_TILE_COL_START+3][A_TILE_ROW_START + i]=load_global_a[load_global_idx+3];
    }
    #pragma unroll
    for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) {
        FETCH_FLOAT4(Bs[0][B_TILE_ROW_START + i][B_TILE_COL_START]) = FETCH_FLOAT4(B[OFFSET(
                B_TILE_ROW_START + i, 
                B_TILE_COL_START, 
                N )]);
    }
    __syncthreads();
    #pragma unroll
    for (int i = 0; i < THREAD_SIZE_Y; i += 4) { 
        FETCH_FLOAT4(tmpA[0][i]) = FETCH_FLOAT4(As[0][0][THREAD_SIZE_Y * ty + i]);
    }
    #pragma unroll
    for (int i = 0; i < THREAD_SIZE_X; i += 4) {
        FETCH_FLOAT4(tmpB[0][i]) = FETCH_FLOAT4(Bs[0][0][THREAD_SIZE_X * tx + i]);
    }
    int share_write_flag = 1;  
    int tile_idx = 0;  
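    // Double-buffer pipeline: while the fragments of the current tile are being
    // multiplied out of shared buffer share_write_flag ^ 1, the next tile is
    // prefetched from global memory into registers (load_global_a/b) and then
    // written into buffer share_write_flag.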
    while(tile_idx < K){
        tile_idx += BLOCK_SIZE_K;  
        #pragma unroll
        for (int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) {
            int load_global_idx = i / A_TILE_ROW_STRIDE * 4;
            FETCH_FLOAT4(load_global_a[load_global_idx]) = FETCH_FLOAT4(A[OFFSET(
                A_TILE_ROW_START + i, // row
                A_TILE_COL_START + tile_idx, // col
                K )]);
        }
        #pragma unroll
        for (int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) {
            int load_global_idx = i / B_TILE_ROW_STRIDE * 4;
            FETCH_FLOAT4(load_global_b[load_global_idx]) = FETCH_FLOAT4(B[OFFSET(
                tile_idx + B_TILE_ROW_START + i, // row
                B_TILE_COL_START, // col
                N )]);
        }
        

        int share_read_flag = share_write_flag ^ 1;   
        #pragma unroll
        for(int j = 0; j < BLOCK_SIZE_K - 1; ++j){    
            #pragma unroll
            for (int i = 0; i < THREAD_SIZE_Y; i += 4) {   
                FETCH_FLOAT4(tmpA[(j+1)%2][i]) = FETCH_FLOAT4(As[share_read_flag][j+1][THREAD_SIZE_Y * ty + i]);
            }
            #pragma unroll
            for (int i = 0; i < THREAD_SIZE_X; i += 4) { 
                FETCH_FLOAT4(tmpB[(j+1)%2][i]) = FETCH_FLOAT4(Bs[share_read_flag][j+1][THREAD_SIZE_X * tx + i]);
            }
            #pragma unroll
            for (int ia = 0; ia < THREAD_SIZE_Y; ia++) 
            {  
                #pragma unroll
                for (int ib = 0; ib < THREAD_SIZE_X; ib++) 
                {   
                    tmpC[ia][ib] += tmpA[j % 2][ia] * tmpB[j % 2][ib]; 
                }
            }
        }

        #pragma unroll
        for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) {
            int load_global_idx = i / A_TILE_ROW_STRIDE * 4;
            As[share_write_flag][A_TILE_COL_START][A_TILE_ROW_START + i] = load_global_a[load_global_idx];
            As[share_write_flag][A_TILE_COL_START+1][A_TILE_ROW_START + i] = load_global_a[load_global_idx+1];
            As[share_write_flag][A_TILE_COL_START+2][A_TILE_ROW_START + i] = load_global_a[load_global_idx+2];
            As[share_write_flag][A_TILE_COL_START+3][A_TILE_ROW_START + i] = load_global_a[load_global_idx+3];
        }
        #pragma unroll
        for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) {
            int load_global_idx = i / B_TILE_ROW_STRIDE * 4;
            FETCH_FLOAT4(Bs[share_write_flag][B_TILE_ROW_START + i][B_TILE_COL_START]) = FETCH_FLOAT4(load_global_b[load_global_idx]);
        }
        __syncthreads();
        
        #pragma unroll
        for (int i = 0; i < THREAD_SIZE_Y; i += 4) 
        {
            FETCH_FLOAT4(tmpA[0][i]) = FETCH_FLOAT4(As[share_read_flag^1][0][THREAD_SIZE_Y * ty + i]);
        }
        #pragma unroll
        for (int i = 0; i < THREAD_SIZE_X; i += 4) 
        {
            FETCH_FLOAT4(tmpB[0][i]) = FETCH_FLOAT4(Bs[share_read_flag^1][0][THREAD_SIZE_X * tx + i]);
        }
        #pragma unroll
        for (int ia = 0; ia < THREAD_SIZE_Y; ia++) {
            #pragma unroll
            for (int ib = 0; ib < THREAD_SIZE_X; ib++) {
                tmpC[ia][ib] += tmpA[1][ia] * tmpB[1][ib];
            }
        }
        share_write_flag ^= 1;

    }
    #pragma unroll
    for (int i = 0; i < THREAD_SIZE_Y; ++i) {
        #pragma unroll
        for (int j = 0; j < THREAD_SIZE_X; j+=4) {
            FETCH_FLOAT4(C[OFFSET(
                BLOCK_SIZE_M * by + ty * THREAD_SIZE_Y + i,
                BLOCK_SIZE_N * bx + tx * THREAD_SIZE_X + j,
                N)]) = FETCH_FLOAT4(tmpC[i][j]);
        }
    }
}
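
// Note on the structure above: the inner j-loop runs BLOCK_SIZE_K - 1 times and
// prefetches the register fragments for step j + 1 while computing step j; the
// last step of each tile is computed only after __syncthreads(), once the next
// shared-memory buffer has been filled. Global-memory latency is thus hidden
// behind the FMA work instead of stalling it.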

int main(int argc, char** argv) {
    // Matrix A (M*K), Matrix B (K*N), Matrix C (M*N)
    // Computes C = A * B
    size_t M = 4096;   // note: M, N, K must be divisible by the tile sizes below (the kernel has no bounds checks)
    size_t K = 4096;
    size_t N = 4096;

    if (argc == 4) {
        M = atoi(argv[1]);
        K = atoi(argv[2]);
        N = atoi(argv[3]);
    }

    float *h_A = (float *)malloc(M * K * sizeof(float));
    float *h_B = (float *)malloc(K * N * sizeof(float));
    float *h_C = (float *)malloc(M * N * sizeof(float));
    float *h_C_base = (float *)malloc(M * N * sizeof(float));

    float *d_A;
    float *d_B;
    float *d_C;

    cudaMalloc(&d_A, M * K * sizeof(float));
    cudaMalloc(&d_B, K * N * sizeof(float));
    cudaMalloc(&d_C, M * N * sizeof(float));

    double msPerMatrixMul[2] = {0, 0};    // average time per matrix multiply: [0] our kernel, [1] cuBLAS
    double gflops[2] = {0, 0};  // throughput: [0] our kernel, [1] cuBLAS
    double flopsPerMatrixMul = 2.0 * M * N * K; // FLOP count: each of the M*N outputs needs K multiplies and K-1 adds, i.e. (2K-1)*M*N ≈ 2*M*N*K operations
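    // Example: for M = N = K = 4096 this is 2 * 4096^3 ≈ 1.37e11 floating-point operations.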

    // These tile parameters depend on the SM's register file and shared-memory capacity,
    // since both affect occupancy (the ratio of active warps to the maximum concurrent warps).
    // The best tiling found with the CUDA Toolkit profiling tools is:
    const int BLOCK_SIZE_M = 128;
    const int BLOCK_SIZE_K = 8;
    const int BLOCK_SIZE_N = 128;
    const int THREAD_SIZE_X = 8;
    const int THREAD_SIZE_Y = 8;

    // Initialize arrays A and B
    // generate A
    for( int i = 0; i < M * K; i++ ){
        h_A[i] = i / 13;
    }

    // generate B
    for( int i = 0; i < K * N; i++ ) {
        h_B[i] = i % 13;
    }

    // Initialize device memory
    cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemset(d_C, 0, M * N * sizeof(float)); // h_C is still uninitialized here, so zero d_C directly


    float msTotal = 0;    // total time (ms)
    int nIter = 10;   // number of timed repetitions
    // Start timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start);

    for (int i = 0 ; i < nIter; i++ ) {
        dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_X, BLOCK_SIZE_M / THREAD_SIZE_Y);
        dim3 dimGrid(N / BLOCK_SIZE_N, M / BLOCK_SIZE_M);
        SGEMMBestKernel<BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N, THREAD_SIZE_Y, THREAD_SIZE_X> 
        <<< dimGrid, dimBlock >>>(d_A, d_B, d_C, M, N, K);
    }

    // Stop timing and copy back the result
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&msTotal, start, stop);
    cudaMemcpy( h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);

    msPerMatrixMul[0] = msTotal / nIter;    // average time
    gflops[0] = (flopsPerMatrixMul * 1.0e-9f) / (msPerMatrixMul[0] / 1000.0f); // GFlop/s

    printf("Matrix A: %zu*%zu, Matrix B: %zu*%zu, Matrix C: %zu*%zu\n", M, K, K, N, M, N);
    printf("Block tile parameters: bm:%d, bk:%d, bn:%d\nThread tile parameters: rm:%d, rn:%d\n",
            BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N, THREAD_SIZE_Y, THREAD_SIZE_X);

    printf( "mygemm performance = %.2f GFlop/s, time = %.3f ms, total FLOPs = %.0f Ops,\n",
        gflops[0],
        msPerMatrixMul[0],
        flopsPerMatrixMul);

    // cublas
    cublasHandle_t blas_handle;
    cublasCreate(&blas_handle);
    float alpha = 1.0;
    float beta = 0;
    cudaMemset(d_C, 0, M * N * sizeof(float)); // reset d_C before the cuBLAS runs (beta = 0 anyway)
    cudaEventRecord(start);
    for (int i = 0 ; i < nIter; i ++ ) {
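        // cuBLAS is column-major; with CUBLAS_OP_T on both row-major buffers the
        // result lands in d_C as the transpose of the row-major C, which the
        // verification below accounts for (the indexing there assumes M == N).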
        cublasSgemm (blas_handle, CUBLAS_OP_T, CUBLAS_OP_T, 
            M, N, K, &alpha, 
            d_A, K, d_B, N, &beta, d_C, N
        );
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&msTotal, start, stop);

    cudaMemcpy( h_C_base, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);

    msPerMatrixMul[1] = msTotal / nIter;
    gflops[1] = (flopsPerMatrixMul * 1.0e-9f) / (msPerMatrixMul[1] / 1000.0f);

    // Print performance results
    printf( "cublas performance = %.2f GFlop/s, time = %.3f ms, total FLOPs = %.0f Ops,\n",
        gflops[1],
        msPerMatrixMul[1],
        flopsPerMatrixMul);

    cublasDestroy(blas_handle); 
    
    // Check the floating-point error
    // test relative error by the formula
    //     |<x, y>_mygemm - <x,y>_cublas| / <|x|, |y|> < eps
    double eps = 1.e-6;  // machine zero
    bool correct = true;
    for (int i = 0; i < M * N; i++) {
        int row = i / N;
        int col = i % N;
        double abs_err = fabs(h_C[i] - h_C_base[col * M + row]);
        double dot_length = M;
        double abs_val = fabs(h_C[i]);
        double rel_err = abs_err / abs_val / dot_length;
        if (rel_err > eps) {
            printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
                    i, h_C[i], h_C_base[col * M + row], eps);
            correct = false;
            break;
        }
    }

    printf("%s\n", correct ? "Result verification PASSED" : "Result verification FAILED");
    printf("Reached %f%% of cuBLAS performance\n", gflops[0] / gflops[1] * 100.0);
    
    // Free memory
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    
    free(h_A);
    free(h_B);
    free(h_C);
    free(h_C_base);
}

2. Optimize parallel matrix-vector multiplication in the following ways (a note on why these matter follows the list):

Naive implementation
Coalesced memory access
Constant memory
Shared memory
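
All four kernels below perform the same 2MN floating-point operations but must read at least the 4MN bytes of matrix A, an arithmetic intensity of roughly 0.5 FLOP/byte, so SGEMV is memory-bandwidth bound: the optimizations target memory access patterns and reuse of X rather than the arithmetic itself.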
/*
Single-precision matrix-vector multiplication
Usage: chmod +x run.sh && ./run.sh 13 13
*/

#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// Naive implementation
__global__ void sgemvNaive(float *A, float *X, float *Y, int M, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;	// row index of A (and of the output Y)
    if (i < M)
    {
        float tmp = 0;
        for (int j = 0; j < N; j++)
            tmp += A[i * N + j] * X[j];
        Y[i] = tmp;
    }
}
// Optimized with coalesced memory access
__global__ void sgemvCoalesced(float *At, float *X, float *Y, int M, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;	// column index of the transposed matrix At (= row index of the original A)
    if (i < M)
    {
        float tmp = 0;
        for (int j = 0; j < N; j++)
            tmp += At[j * M + i] * X[j];
        Y[i] = tmp;
    }
}
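
// Why this coalesces: in iteration j, neighboring threads i and i+1 read the
// adjacent addresses At[j * M + i] and At[j * M + i + 1], so a warp's 32 loads
// merge into a few wide transactions. In sgemvNaive, thread i reads A[i * N + j],
// leaving a stride of N floats between adjacent lanes.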
// Optimized with constant memory
__constant__ float d_CX[(1 << 14)]; // 16K floats = 64 KB, the constant-memory limit
__global__ void sgemvConstant(float *At, float *X, float *Y, int M, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < M)
    {
        float tmp = 0;
        for (int j = 0; j < N; j++)
            tmp += At[j * M + i] * d_CX[j];
        Y[i] = tmp;
    }
}
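
// Why constant memory helps here: in a given iteration every thread of a warp
// reads the same d_CX[j], and the constant cache serves such uniform reads as a
// single broadcast, leaving L1/L2 bandwidth to the At accesses.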

// Optimized with shared memory
// block_size: size of each X tile
template <size_t block_size>
__global__ void sgemvSharedMemory(float *At, float *X, float *Y, int M, int N)
{
    __shared__ float s_X[block_size];
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float tmp = 0;
    int j_last_block_index = N / block_size * block_size;
    // All threads of the block take part in the loads and barriers, even those
    // with i >= M; otherwise __syncthreads() would be reached by only part of
    // the block, which is undefined behavior.
    for(int j = 0; j < j_last_block_index; j += block_size)
    {
        s_X[threadIdx.x] = X[j + threadIdx.x];
        __syncthreads();
        if(i < M)
        {
            #pragma unroll
            for(int k = 0; k < block_size; k++)
            {
                tmp += At[(k + j) * M + i] * s_X[k]; //A[i][j+k] = At[j+k][i]
            }
        }
        __syncthreads();
    }
    // Tail tile when N is not a multiple of block_size; guard the load so we
    // never read past the end of X.
    if(j_last_block_index != N)
    {
        if(j_last_block_index + threadIdx.x < N)
            s_X[threadIdx.x] = X[j_last_block_index + threadIdx.x];
        __syncthreads();
        if(i < M)
        {
            for(int k = 0; k < (N - j_last_block_index); k++)
            {
                tmp += At[(k + j_last_block_index) * M + i] * s_X[k];
            }
        }
    }
    if(i < M)
        Y[i] = tmp;
}
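
// Design note: each block stages block_size elements of X in shared memory per
// iteration, so X is read from global memory once per block rather than once
// per thread, cutting X traffic by a factor of block_size.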

bool verifyResult(float *h_Y_base , float *h_Y , int M)
{
	// Check the floating-point error
	double eps = 1.e-6;
	bool correct = true;
	for (int i = 0; i < M; i++)
	{
		double abs_err = fabs(h_Y[i] - h_Y_base[i]);
		double dot_length = M;
		double abs_val = fabs(h_Y[i]);
		double rel_err = abs_err / abs_val / dot_length;
		if (rel_err > eps)
		{
			printf("Error! Y[%d]=%.8f, ref=%.8f error term is > %E\n", i, h_Y[i], h_Y_base[i], eps);
			correct = false;
			break;
		}
	}
	return correct;
}


int main(int argc, char **argv)
{
    size_t M = 1 << 14;
    size_t N = 1 << 14;
    if (argc == 3)
    {
        M = 1 << (atoi(argv[1]));
        N = 1 << (atoi(argv[2]));
    }

    float *h_A = (float *)malloc(M * N * sizeof(float));
    float *h_At = (float *)malloc(N * M * sizeof(float));
    float *h_X = (float *)malloc(N * sizeof(float));
    float *h_Y = (float *)malloc(M * sizeof(float));
    float *h_Y_base = (float *)malloc(M * sizeof(float));

    float *d_A;
    float *d_At;
    float *d_X;
    float *d_Y;
    cudaMalloc(&d_A, M * N * sizeof(float));
    cudaMalloc(&d_At, M * N * sizeof(float));
    cudaMalloc(&d_X, N * sizeof(float));
    cudaMalloc(&d_Y, M * sizeof(float));

    double msPerMatrixVectorMul[5] = {0, 0, 0, 0, 0};   // timings for the 5 variants
    double gflops[5] = {0, 0, 0, 0, 0};     // GFLOPS for the 5 variants
    double flopsPerMatrixVectorMul = 2.0 * M * N;   // FLOP count: M outputs, each needing N multiplies and N-1 adds, ≈ 2*M*N

    // Initialize arrays A and X
    for (int j = 0; j < N; j++)
    {
        h_X[j] = (float)j/N;
        for (int i = 0; i < M; i++)
        {
            h_At[j * M + i] = h_A[i * N + j] = 1.0;
        }
    }
	printf("Matrix A: %zu*%zu, vector X length: %zu, vector Y length: %zu\n", M, N, N, M);
    // Initialize device memory
    cudaMemcpy(d_A, h_A, sizeof(float) * M * N, cudaMemcpyHostToDevice);
    cudaMemcpy(d_At, h_At, sizeof(float) * N * M, cudaMemcpyHostToDevice);
    cudaMemcpy(d_X, h_X, sizeof(float) * N, cudaMemcpyHostToDevice);
    

    float msTotal = 0; // total time per experiment (ms)
    int nIter = 10;    // number of timed repetitions

    const int block_size = 1 << 7;
    int grid_size = (M + block_size - 1) / block_size;

    // Start timing
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    //(1) cublas
    cublasHandle_t blas_handle;
    cublasCreate(&blas_handle);
    float alpha = 1.0, beta = 0;
    cudaMemset(d_Y, 0, M * sizeof(float)); // h_Y_base is still uninitialized here, so zero d_Y directly
    cudaEventRecord(start);
    for (int i = 0; i < nIter; i++)
    {
        cublasSgemv(blas_handle, CUBLAS_OP_T,
                    N, M, &alpha, d_A,
                    N, d_X, 1, &beta, d_Y, 1);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&msTotal, start, stop);
    cublasDestroy(blas_handle);
    cudaMemcpy(h_Y_base, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
    msPerMatrixVectorMul[0] = msTotal / nIter;
    gflops[0] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[0] / 1000.0f);
    // Print performance results
    printf("cublas performance = %.2f GFlop/s, time = %.3f ms\n\n",
           gflops[0],
           msPerMatrixVectorMul[0]);


	//(2) Naive implementation
	cudaMemset(d_Y, 0, M * sizeof(float)); // h_Y is still uninitialized here, so zero d_Y directly
	cudaEventRecord(start);
    for (int i = 0; i < nIter; i++)
    {
        sgemvNaive<<<grid_size, block_size>>>(d_A, d_X, d_Y, M, N);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&msTotal, start, stop);
    cudaMemcpy(h_Y, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
    msPerMatrixVectorMul[1] = msTotal / nIter;                                             // average time
    gflops[1] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[1] / 1000.0f); // GFlop/s
    printf("sgemvNaive performance = %.2f GFlop/s, time = %.3f ms, ",
           gflops[1],
           msPerMatrixVectorMul[1]);
	printf("%s\n", verifyResult(h_Y_base , h_Y , M) ? "Result verification PASSED" : "Result verification FAILED");
	printf("Reached %f%% of cuBLAS performance\n\n", gflops[1] / gflops[0] * 100.0);
    

	//(3) Coalesced
	memset(h_Y , 0 , M * sizeof(float));
	cudaMemcpy(d_Y, h_Y, M * sizeof(float), cudaMemcpyHostToDevice);
	cudaEventRecord(start);
    for (int i = 0; i < nIter; i++)
    {
        sgemvCoalesced<<<grid_size, block_size>>>(d_At, d_X, d_Y, M, N);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&msTotal, start, stop);
    cudaMemcpy(h_Y, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
    msPerMatrixVectorMul[2] = msTotal / nIter;                                             // average time
    gflops[2] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[2] / 1000.0f); // GFlop/s
    printf("sgemvCoalesced performance = %.2f GFlop/s, time = %.3f ms, ",
           gflops[2],
           msPerMatrixVectorMul[2]);
	printf("%s\n", verifyResult(h_Y_base , h_Y , M) ? "Result verification PASSED" : "Result verification FAILED");
	printf("Reached %f%% of cuBLAS performance\n\n", gflops[2] / gflops[0] * 100.0);

    //(4) Constant memory
	if(N <= (1 << 14))	// skip the constant-memory variant if X would exceed constant memory
	{
		cudaMemcpyToSymbol(d_CX, h_X, sizeof(float) * N);
		memset(h_Y , 0 , M * sizeof(float));
		cudaMemcpy(d_Y, h_Y, M * sizeof(float), cudaMemcpyHostToDevice);
		cudaEventRecord(start);
		for (int i = 0; i < nIter; i++)
		{
			sgemvConstant<<<grid_size, block_size>>>(d_At, d_X, d_Y, M, N);
		}
		cudaEventRecord(stop);
		cudaEventSynchronize(stop);
		cudaEventElapsedTime(&msTotal, start, stop);
		cudaMemcpy(h_Y, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
		msPerMatrixVectorMul[3] = msTotal / nIter;                                             // average time
		gflops[3] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[3] / 1000.0f); // GFlop/s
		printf("sgemvConstant performance = %.2f GFlop/s, time = %.3f ms, ",
			gflops[3],
			msPerMatrixVectorMul[3]);
		printf("%s\n", verifyResult(h_Y_base , h_Y , M) ? "Result verification PASSED" : "Result verification FAILED");
		printf("Reached %f%% of cuBLAS performance\n\n", gflops[3] / gflops[0] * 100.0);
	}

    //(5) Shared memory
	memset(h_Y , 0 , M * sizeof(float));
	cudaMemcpy(d_Y, h_Y, M * sizeof(float), cudaMemcpyHostToDevice);
	cudaEventRecord(start);
    for (int i = 0; i < nIter; i++)
    {
        sgemvSharedMemory<block_size><<<grid_size, block_size>>>(d_At, d_X, d_Y, M, N);
    }
    cudaEventRecord(stop);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&msTotal, start, stop);
    cudaMemcpy(h_Y, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
    msPerMatrixVectorMul[4] = msTotal / nIter;                                             // average time
    gflops[4] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[4] / 1000.0f); // GFlop/s
    printf("sgemvSharedMemory performance = %.2f GFlop/s, time = %.3f ms, ",
           gflops[4],
           msPerMatrixVectorMul[4]);
	printf("%s\n", verifyResult(h_Y_base , h_Y , M) ? "Result verification PASSED" : "Result verification FAILED");
	printf("Reached %f%% of cuBLAS performance\n\n", gflops[4] / gflops[0] * 100.0);

    cudaFree(d_A);
    cudaFree(d_At);
    cudaFree(d_X);
    cudaFree(d_Y);  // note: d_CX is a __constant__ symbol and must not be passed to cudaFree
    free(h_A);
    free(h_At);
    free(h_X);
    free(h_Y);
    free(h_Y_base);
}
