1. Optimize parallel matrix multiplication (SGEMM) in the following stages:
• Naive implementation
• Thread-block tile + shared memory
• Thread tile (see the sketch kernel after the block-tile kernel in the code below)
• Transposed A tile + vectorized (float4) memory access
• Double buffering
/*
Parallel matrix multiplication (SGEMM)
Usage example: ./run.sh 4096
*/
#include <stdio.h>
#include <stdlib.h>
// this header is required to use cuBLAS
#include <cublas_v2.h>
// global index of element (row, col) in a row-major matrix; ld is the matrix width (leading dimension)
#define OFFSET(row, col, ld) ((row) * (ld) + (col))
// access memory in float4 (128-bit) units
#define FETCH_FLOAT4(pointer) (reinterpret_cast<float4*>(&(pointer))[0])
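// e.g. FETCH_FLOAT4(dst[0]) = FETCH_FLOAT4(src[4]); copies src[4..7] into
// dst[0..3] as a single 128-bit transaction (both addresses must be 16-byte aligned)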
// SGEMM - host reference implementation
void SgemmOnHost(float *A, float *B, float *C, int M, int N, int K)
{
for(int i = 0; i < M; i++)
{
for(int j = 0; j < N; j++)
{
float tmp = 0;
for(int k = 0; k < K; k++)
{
tmp += A[i * K + k] * B[k * N + j]; //A[i][k] * B[k][j]
}
C[i * N + j] = tmp; //C[i][j]
}
}
}
// SGEMM - single thread block (assumes gridDim = (1,1) and blockDim = (N, M))
__global__ void SGEMMBlockKernel(float *A, float *B, float *C, int M, int N, int K)
{
float tmp = 0.0;
int tx = threadIdx.x , ty = threadIdx.y;
for(int i = 0; i < K; i++)
{
tmp += A[ty * K + i] * B[i * N + tx]; //A[ty][i] * B[i][tx]
}
C[ty * N + tx] = tmp; //C[ty][tx]
}
// SGEMM - multiple blocks, naive implementation: one thread per element of C
__global__ void SGEMMKernelNaive(float *A, float *B, float *C, int M, int N, int K)
{
int tx = blockIdx.x * blockDim.x + threadIdx.x;
int ty = blockIdx.y * blockDim.y + threadIdx.y;
    if (ty < M && tx < N)
{
float tmp = 0.0;
for(int i = 0; i < K; i++)
{
            tmp += A[ty * K + i] * B[i * N + tx]; // A[ty][i] * B[i][tx]
}
C[ty * N + tx] = tmp; //C[ty][tx]
}
}
// SGEMM - thread-block tile + shared memory.
// Assumes blockDim = (BLOCK_SIZE_N, BLOCK_SIZE_M) and, since every thread
// loads exactly one element of each tile, BLOCK_SIZE_M == BLOCK_SIZE_K ==
// BLOCK_SIZE_N; M, N and K are assumed to be multiples of the tile sizes.
template <
    const int BLOCK_SIZE_M, // height of the C tile computed by one thread block (bm)
    const int BLOCK_SIZE_K, // width of the A tile / height of the B tile consumed per iteration (bk)
    const int BLOCK_SIZE_N  // width of the C tile computed by one thread block (bn)
>
__global__ void SGEMMBlockTileKernel(float *A, float *B, float *C, int M, int N, int K)
{
    __shared__ float As[BLOCK_SIZE_M][BLOCK_SIZE_K];
    __shared__ float Bs[BLOCK_SIZE_K][BLOCK_SIZE_N];
    int ty = blockIdx.y * BLOCK_SIZE_M + threadIdx.y; // row in C = row in A
    int tx = blockIdx.x * BLOCK_SIZE_N + threadIdx.x; // column in C = column in B
    if (ty < M && tx < N)
{
float tmp = 0.0;
for (int i = 0; i < (K / BLOCK_SIZE_K); i++)
{
            // row index * width of A + (tile i * tile width + horizontal offset inside the tile)
As[threadIdx.y][threadIdx.x] = A[ty * K + (i * BLOCK_SIZE_K + threadIdx.x)];
            // column index + (tile i * tile height + vertical offset inside the tile) * width of B
Bs[threadIdx.y][threadIdx.x] = B[tx + (i * BLOCK_SIZE_K + threadIdx.y) * N];
__syncthreads();
for (int k = 0; k < BLOCK_SIZE_K; k++)
tmp += As[threadIdx.y][k] * Bs[k][threadIdx.x];
__syncthreads();
}
C[ty * N + tx] = tmp; // C[ty][tx]
}
}
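// SGEMM - thread tile (a minimal sketch; this kernel is not part of the
// original program, and the names BM/BK/BN/TM/TN are illustrative). It adds
// the thread-tile step between the block-tile kernel above and the combined
// kernel below: each thread computes a TM x TN sub-block of C instead of a
// single element, so every shared-memory value it loads is reused across a
// TM x TN outer product, raising arithmetic intensity.
// Assumes blockDim = (BN/TN, BM/TM) and that BM, BN, BK divide M, N, K.
template <const int BM, const int BK, const int BN, const int TM, const int TN>
__global__ void SGEMMThreadTileKernel(float *A, float *B, float *C, int M, int N, int K)
{
    __shared__ float As[BM][BK];
    __shared__ float Bs[BK][BN];
    float acc[TM][TN] = {0};
    int row0 = blockIdx.y * BM + threadIdx.y * TM; // top-left corner of this thread's C tile
    int col0 = blockIdx.x * BN + threadIdx.x * TN;
    for (int t = 0; t < K; t += BK)
    {
        // cooperative load: each thread stages the tile rows/columns it owns
        for (int i = 0; i < TM; i++)
            for (int k = threadIdx.x; k < BK; k += blockDim.x)
                As[threadIdx.y * TM + i][k] = A[(row0 + i) * K + t + k];
        for (int k = threadIdx.y; k < BK; k += blockDim.y)
            for (int j = 0; j < TN; j++)
                Bs[k][threadIdx.x * TN + j] = B[(t + k) * N + col0 + j];
        __syncthreads();
        // TM x TN outer product per k, accumulated in registers
        for (int k = 0; k < BK; k++)
            for (int i = 0; i < TM; i++)
                for (int j = 0; j < TN; j++)
                    acc[i][j] += As[threadIdx.y * TM + i][k] * Bs[k][threadIdx.x * TN + j];
        __syncthreads();
    }
    for (int i = 0; i < TM; i++)
        for (int j = 0; j < TN; j++)
            C[(row0 + i) * N + col0 + j] = acc[i][j];
}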
template <
    const int BLOCK_SIZE_M,  // height of the C tile computed by one thread block
    const int BLOCK_SIZE_K,  // width of the A tile / height of the B tile consumed per iteration
    const int BLOCK_SIZE_N,  // width of the C tile computed by one thread block
    const int THREAD_SIZE_Y, // height of the C sub-block computed by one thread
    const int THREAD_SIZE_X  // width of the C sub-block computed by one thread
>
__global__ void SGEMMBestKernel(
float * __restrict__ A,
float * __restrict__ B,
float * __restrict__ C,
const int M,
const int N,
const int K) {
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
const int THREAD_X_PER_BLOCK = BLOCK_SIZE_N / THREAD_SIZE_X;
const int THREAD_Y_PER_BLOCK = BLOCK_SIZE_M / THREAD_SIZE_Y;
const int THREAD_NUM_PER_BLOCK = THREAD_X_PER_BLOCK * THREAD_Y_PER_BLOCK;
const int tid = ty * THREAD_X_PER_BLOCK + tx;
    // double-buffered shared-memory tiles (As is stored transposed: [K][M])
    __shared__ float As[2][BLOCK_SIZE_K][BLOCK_SIZE_M];
    __shared__ float Bs[2][BLOCK_SIZE_K][BLOCK_SIZE_N];
    float tmpC[THREAD_SIZE_Y][THREAD_SIZE_X] = {0}; // per-thread accumulator
    float tmpA[2][THREAD_SIZE_Y]; // double-buffered register fragments of As
    float tmpB[2][THREAD_SIZE_X]; // double-buffered register fragments of Bs
    // staging registers for the global-memory prefetch (float4 granularity)
    const int a_load_count = BLOCK_SIZE_M * BLOCK_SIZE_K / (THREAD_NUM_PER_BLOCK * 4);
    const int b_load_count = BLOCK_SIZE_K * BLOCK_SIZE_N / (THREAD_NUM_PER_BLOCK * 4);
    float load_global_a[4 * a_load_count];
    float load_global_b[4 * b_load_count];
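    // Cooperative-load mapping: threads are linearized by tid and each loads
    // one float4 at a time. *_TILE_THREAD_PER_ROW is the number of threads
    // needed to cover one row of a tile, *_ROW/COL_START locate this thread's
    // first element, and *_ROW_STRIDE is how many rows the whole block
    // advances per load iteration.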
const int A_TILE_THREAD_PER_ROW = BLOCK_SIZE_K / 4;
const int B_TILE_THREAD_PER_ROW = BLOCK_SIZE_N / 4;
const int A_TILE_ROW_START = tid / A_TILE_THREAD_PER_ROW;
const int A_TILE_COL_START = tid % A_TILE_THREAD_PER_ROW * 4;
const int B_TILE_ROW_START = tid / B_TILE_THREAD_PER_ROW;
const int B_TILE_COL_START = tid % B_TILE_THREAD_PER_ROW * 4;
const int A_TILE_ROW_STRIDE = THREAD_NUM_PER_BLOCK / A_TILE_THREAD_PER_ROW;
const int B_TILE_ROW_STRIDE = THREAD_NUM_PER_BLOCK / B_TILE_THREAD_PER_ROW;
    A = &A[(BLOCK_SIZE_M * by) * K]; // move A to the first row of this block's tile
    B = &B[BLOCK_SIZE_N * bx];       // move B to the first column of this block's tile
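    // Preload tile 0: A is fetched with float4 and stored *transposed* into
    // As (layout [K][M]) so that the compute phase can read a column of the
    // A tile with a single float4; B keeps its row-major layout and can be
    // stored with one float4 directly.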
#pragma unroll
for ( int i = 0 ; i < BLOCK_SIZE_M ; i += A_TILE_ROW_STRIDE) {
int load_global_idx = i / A_TILE_ROW_STRIDE * 4;
FETCH_FLOAT4(load_global_a[load_global_idx]) = FETCH_FLOAT4(A[OFFSET(
A_TILE_ROW_START + i,
A_TILE_COL_START,
K )]);
As[0][A_TILE_COL_START][A_TILE_ROW_START + i]=load_global_a[load_global_idx];
As[0][A_TILE_COL_START+1][A_TILE_ROW_START + i]=load_global_a[load_global_idx+1];
As[0][A_TILE_COL_START+2][A_TILE_ROW_START + i]=load_global_a[load_global_idx+2];
As[0][A_TILE_COL_START+3][A_TILE_ROW_START + i]=load_global_a[load_global_idx+3];
}
#pragma unroll
for ( int i = 0 ; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) {
FETCH_FLOAT4(Bs[0][B_TILE_ROW_START + i][B_TILE_COL_START]) = FETCH_FLOAT4(B[OFFSET(
B_TILE_ROW_START + i,
B_TILE_COL_START,
N )]);
}
__syncthreads();
#pragma unroll
for (int i = 0; i < THREAD_SIZE_Y; i += 4) {
FETCH_FLOAT4(tmpA[0][i]) = FETCH_FLOAT4(As[0][0][THREAD_SIZE_Y * ty + i]);
}
#pragma unroll
for (int i = 0; i < THREAD_SIZE_X; i += 4) {
FETCH_FLOAT4(tmpB[0][i]) = FETCH_FLOAT4(Bs[0][0][THREAD_SIZE_X * tx + i]);
}
    int share_write_flag = 1; // which shared-memory buffer to write next (ping-pong)
    int tile_idx = 0;
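    // Main loop over K-tiles: while the current shared-memory buffer is being
    // consumed, the next tile is prefetched from global memory into registers
    // and then stored into the other buffer; the buffers swap roles each
    // iteration (shared-memory double buffering).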
    while (tile_idx < K) {
        tile_idx += BLOCK_SIZE_K;
        // prefetch the next K-tile from global memory into registers;
        // guarded because the last iteration has no next tile (otherwise
        // these loads would run past the end of A and B)
        if (tile_idx < K) {
            #pragma unroll
            for (int i = 0; i < BLOCK_SIZE_M; i += A_TILE_ROW_STRIDE) {
                int load_global_idx = i / A_TILE_ROW_STRIDE * 4;
                FETCH_FLOAT4(load_global_a[load_global_idx]) = FETCH_FLOAT4(A[OFFSET(
                    A_TILE_ROW_START + i,        // row
                    A_TILE_COL_START + tile_idx, // col
                    K )]);
            }
            #pragma unroll
            for (int i = 0; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) {
                int load_global_idx = i / B_TILE_ROW_STRIDE * 4;
                FETCH_FLOAT4(load_global_b[load_global_idx]) = FETCH_FLOAT4(B[OFFSET(
                    tile_idx + B_TILE_ROW_START + i, // row
                    B_TILE_COL_START,                // col
                    N )]);
            }
        }
        int share_read_flag = share_write_flag ^ 1; // buffer currently being consumed
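        // Inner product over the current tile: while computing the outer
        // product for fragment j (register buffer j%2), prefetch fragment j+1
        // from shared memory into the other register buffer.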
#pragma unroll
for(int j = 0; j < BLOCK_SIZE_K - 1; ++j){
#pragma unroll
for (int i = 0; i < THREAD_SIZE_Y; i += 4) {
FETCH_FLOAT4(tmpA[(j+1)%2][i]) = FETCH_FLOAT4(As[share_read_flag][j+1][THREAD_SIZE_Y * ty + i]);
}
#pragma unroll
for (int i = 0; i < THREAD_SIZE_X; i += 4) {
FETCH_FLOAT4(tmpB[(j+1)%2][i]) = FETCH_FLOAT4(Bs[share_read_flag][j+1][THREAD_SIZE_X * tx + i]);
}
#pragma unroll
for (int ia = 0; ia < THREAD_SIZE_Y; ia++)
{
#pragma unroll
for (int ib = 0; ib < THREAD_SIZE_X; ib++)
{
tmpC[ia][ib] += tmpA[j % 2][ia] * tmpB[j % 2][ib];
}
}
}
        // store the prefetched tile into the other shared-memory buffer;
        // also guarded, since there is nothing to store on the last iteration
        if (tile_idx < K) {
            #pragma unroll
            for (int i = 0; i < BLOCK_SIZE_M; i += A_TILE_ROW_STRIDE) {
                int load_global_idx = i / A_TILE_ROW_STRIDE * 4;
                As[share_write_flag][A_TILE_COL_START][A_TILE_ROW_START + i] = load_global_a[load_global_idx];
                As[share_write_flag][A_TILE_COL_START + 1][A_TILE_ROW_START + i] = load_global_a[load_global_idx + 1];
                As[share_write_flag][A_TILE_COL_START + 2][A_TILE_ROW_START + i] = load_global_a[load_global_idx + 2];
                As[share_write_flag][A_TILE_COL_START + 3][A_TILE_ROW_START + i] = load_global_a[load_global_idx + 3];
            }
            #pragma unroll
            for (int i = 0; i < BLOCK_SIZE_K; i += B_TILE_ROW_STRIDE) {
                int load_global_idx = i / B_TILE_ROW_STRIDE * 4;
                FETCH_FLOAT4(Bs[share_write_flag][B_TILE_ROW_START + i][B_TILE_COL_START]) = FETCH_FLOAT4(load_global_b[load_global_idx]);
            }
            // with double buffering, one synchronization per tile suffices
            __syncthreads();
            share_write_flag ^= 1;
        }
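        // preload fragment 0 of the next tile from the freshly written buffer
        // (on the last iteration this reads stale data that is never used)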
#pragma unroll
for (int i = 0; i < THREAD_SIZE_Y; i += 4)
{
FETCH_FLOAT4(tmpA[0][i]) = FETCH_FLOAT4(As[share_read_flag^1][0][THREAD_SIZE_Y * ty + i]);
}
#pragma unroll
for (int i = 0; i < THREAD_SIZE_X; i += 4)
{
FETCH_FLOAT4(tmpB[0][i]) = FETCH_FLOAT4(Bs[share_read_flag^1][0][THREAD_SIZE_X * tx + i]);
}
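        // compute the last fragment (k = BLOCK_SIZE_K - 1) of the current
        // tile; it sits in register buffer 1 (the code assumes BLOCK_SIZE_K
        // is even, e.g. 8)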
#pragma unroll
for (int ia = 0; ia < THREAD_SIZE_Y; ia++) {
#pragma unroll
for (int ib = 0; ib < THREAD_SIZE_X; ib++) {
tmpC[ia][ib] += tmpA[1][ia] * tmpB[1][ib];
}
}
}
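    // epilogue: write the per-thread THREAD_SIZE_Y x THREAD_SIZE_X accumulator
    // tile back to C with float4 stores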
#pragma unroll
for (int i = 0; i < THREAD_SIZE_Y; ++i) {
#pragma unroll
for (int j = 0; j < THREAD_SIZE_X; j+=4) {
FETCH_FLOAT4(C[OFFSET(
BLOCK_SIZE_M * by + ty * THREAD_SIZE_Y + i,
BLOCK_SIZE_N * bx + tx * THREAD_SIZE_X + j,
N)]) = FETCH_FLOAT4(tmpC[i][j]);
}
}
}
int main(int argc, char** argv) {
    // Matrix A (M*K), Matrix B (K*N), Matrix C (M*N)
    // Computes C = A * B
    size_t M = 4096; // M and N must be multiples of BLOCK_SIZE_M/N (128), K of BLOCK_SIZE_K (8)
    size_t K = 4096;
    size_t N = 4096;
if (argc == 4) {
M = atoi(argv[1]);
K = atoi(argv[2]);
N = atoi(argv[3]);
}
float *h_A = (float *)malloc(M * K * sizeof(float));
float *h_B = (float *)malloc(K * N * sizeof(float));
float *h_C = (float *)malloc(M * N * sizeof(float));
float *h_C_base = (float *)malloc(M * N * sizeof(float));
float *d_A;
float *d_B;
float *d_C;
cudaMalloc(&d_A, M * K * sizeof(float));
cudaMalloc(&d_B, K * N * sizeof(float));
cudaMalloc(&d_C, M * N * sizeof(float));
    double msPerMatrixMul[2] = {0, 0}; // average time per GEMM: [0] our kernel, [1] cuBLAS
    double gflops[2] = {0, 0};         // throughput: [0] our kernel, [1] cuBLAS
    double flopsPerMatrixMul = 2.0 * M * N * K; // each C element takes K multiplies and K-1 adds, counted as ~2K flops, so ~2*M*N*K in total
    // The tile sizes below interact with the per-SM register file and shared
    // memory capacity, which bound occupancy (the ratio of active warps to the
    // hardware maximum). These values were the best found with the profiling
    // tools of the CUDA Toolkit.
const int BLOCK_SIZE_M = 128;
const int BLOCK_SIZE_K = 8;
const int BLOCK_SIZE_N = 128;
const int THREAD_SIZE_X = 8;
const int THREAD_SIZE_Y = 8;
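    // Optional sanity check (a minimal sketch, not part of the original flow,
    // assuming CUDA 6.5+): ask the runtime how many blocks of this kernel
    // instantiation fit on one SM given its register/shared-memory usage.
    int maxBlocksPerSM = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &maxBlocksPerSM,
        SGEMMBestKernel<BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N, THREAD_SIZE_Y, THREAD_SIZE_X>,
        (BLOCK_SIZE_N / THREAD_SIZE_X) * (BLOCK_SIZE_M / THREAD_SIZE_Y), // 256 threads per block
        0); // no dynamic shared memory
    printf("occupancy check: %d resident block(s) per SM\n", maxBlocksPerSM);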
    // initialize the host arrays A and B
// generate A
for( int i = 0; i < M * K; i++ ){
h_A[i] = i / 13;
}
// generate B
for( int i = 0; i < K * N; i++ ) {
h_B[i] = i % 13;
}
    // copy the inputs to the device; C is simply zero-initialized
    cudaMemcpy(d_A, h_A, M * K * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, h_B, K * N * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemset(d_C, 0, M * N * sizeof(float));
    float msTotal = 0; // total elapsed time (ms)
    int nIter = 10;    // number of timed repetitions
    // start timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
cudaEventRecord(start);
for (int i = 0 ; i < nIter; i++ ) {
dim3 dimBlock(BLOCK_SIZE_N / THREAD_SIZE_X, BLOCK_SIZE_M / THREAD_SIZE_Y);
dim3 dimGrid(N / BLOCK_SIZE_N, M / BLOCK_SIZE_M);
        SGEMMBestKernel<BLOCK_SIZE_M, BLOCK_SIZE_K, BLOCK_SIZE_N, THREAD_SIZE_Y, THREAD_SIZE_X>
<<< dimGrid, dimBlock >>>(d_A, d_B, d_C, M, N, K);
}
    // stop timing and copy back the result
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&msTotal, start, stop);
cudaMemcpy( h_C, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);
    msPerMatrixMul[0] = msTotal / nIter; // average time
    gflops[0] = (flopsPerMatrixMul * 1.0e-9f) / (msPerMatrixMul[0] / 1000.0f); // GFlop/s
printf("矩阵A大小: %d*%d, 矩阵B大小: %d*%d, 矩阵C大小: %d*%d\n" , M , K , K , N , M , N);
printf("线程块tile参数: bm:%d, bk:%d, bn:%d\n线程tile参数: rm:%d, rn:%d\n" ,
BLOCK_SIZE_M , BLOCK_SIZE_K , BLOCK_SIZE_N , THREAD_SIZE_Y , THREAD_SIZE_X);
printf( "mygemm 性能= %.2f GFlop/s, 运行时间= %.3f ms, 浮点数计算总次数= %.0f Ops,\n",
gflops[0],
msPerMatrixMul[0],
flopsPerMatrixMul);
    // cuBLAS reference. cuBLAS is column-major, so the row-major d_A and d_B
    // are passed with CUBLAS_OP_T; the result then comes back transposed
    // (column-major), which the verification below accounts for. With
    // ldc = N this is only consistent when M == N (true for the defaults).
cublasHandle_t blas_handle;
cublasCreate(&blas_handle);
float alpha = 1.0;
float beta = 0;
cudaMemcpy(d_C, h_C, M * N * sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(start);
for (int i = 0 ; i < nIter; i ++ ) {
cublasSgemm (blas_handle, CUBLAS_OP_T, CUBLAS_OP_T,
M, N, K, &alpha,
d_A, K, d_B, N, &beta, d_C, N
);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&msTotal, start, stop);
cudaMemcpy( h_C_base, d_C, M * N * sizeof(float), cudaMemcpyDeviceToHost);
msPerMatrixMul[1] = msTotal / nIter;
gflops[1] = (flopsPerMatrixMul * 1.0e-9f) / (msPerMatrixMul[1] / 1000.0f);
    // print performance results
    printf("cublas performance= %.2f GFlop/s, time= %.3f ms, total flops= %.0f Ops\n",
gflops[1],
msPerMatrixMul[1],
flopsPerMatrixMul);
cublasDestroy(blas_handle);
    // Check the floating-point error using the relative-error formula
    //   |<x, y>_mygemm - <x, y>_cublas| / <|x|, |y|> < eps
    // cuBLAS wrote its result transposed (column-major), hence the
    // h_C_base[col * M + row] indexing below.
    double eps = 1.e-6; // machine zero
    bool correct = true;
for (int i = 0; i < M * N; i++) {
int row = i / N;
int col = i % N;
double abs_err = fabs(h_C[i] - h_C_base[col * M + row]);
double dot_length = M;
double abs_val = fabs(h_C[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps) {
printf("Error! Matrix[%05d]=%.8f, ref=%.8f error term is > %E\n",
i, h_C[i], h_C_base[col * M + row], eps);
correct = false;
break;
}
}
printf("%s\n", correct ? "精度校验通过" : "精度校验未通过");
printf("达到了cublas库 %f%的计算性能\n", gflops[0] / gflops[1] * 100.0);
// 释放内存
cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
free(h_A);
free(h_B);
free(h_C);
free(h_C_base);
}
2. Optimize parallel matrix-vector multiplication (SGEMV) in the following ways:
• Naive implementation
• Coalesced memory access
• Constant memory
• Shared memory
/*
Single-precision matrix-vector multiplication (SGEMV)
Usage: chmod +x run.sh && ./run.sh 13 13   (the arguments are log2(M) and log2(N))
*/
#include <stdio.h>
#include <math.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
// naive implementation: one thread per row of A
__global__ void sgemvNaive(float *A, float *X, float *Y, int M, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x; // row index of A
if (i < M)
{
float tmp = 0;
for (int j = 0; j < N; j++)
tmp += A[i * N + j] * X[j];
Y[i] = tmp;
}
}
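// In sgemvNaive, threads of a warp read A[i*N + j] for consecutive i, i.e.
// addresses N floats apart, so each warp triggers many separate memory
// transactions per j. Storing A transposed (At, N*M) makes consecutive
// threads read consecutive addresses At[j*M + i], which coalesces into few
// wide transactions.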
// coalesced-access version: operates on At (the transpose of A, N*M)
__global__ void sgemvCoalesced(float *At, float *X, float *Y, int M, int N)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x; // column index of At = row index of A
if (i < M)
{
float tmp = 0;
for (int j = 0; j < N; j++)
tmp += At[j * M + i] * X[j];
Y[i] = tmp;
}
}
// constant-memory version: X is kept in constant memory
__constant__ float d_CX[(1 << 14)]; // constant memory is 64 KB, i.e. at most 16384 floats
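// All threads of a warp read the same d_CX[j] in a given iteration, which the
// constant cache serves as a single broadcast.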
__global__ void sgemvConstant(float *At, float *X, float *Y, int M, int N)
{
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < M)
{
float tmp = 0;
for (int j = 0; j < N; j++)
tmp += At[j * M + i] * d_CX[j];
Y[i] = tmp;
}
}
// shared-memory version: X is staged tile by tile into shared memory
// block_size: tile size (equal to the thread-block size)
template <size_t block_size>
__global__ void sgemvSharedMemory(float *At, float *X, float *Y, int M, int N)
{
    __shared__ float s_X[block_size];
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    float tmp = 0;
    int j_last_block_index = N / block_size * block_size;
    // Full tiles: every thread of the block helps stage a tile of X into
    // shared memory, and __syncthreads() must be reached by all threads of
    // the block, so the i < M guard covers only the At loads and the final
    // store.
    for (int j = 0; j < j_last_block_index; j += block_size)
    {
        s_X[threadIdx.x] = X[j + threadIdx.x];
        __syncthreads();
        if (i < M)
        {
            #pragma unroll
            for (int k = 0; k < block_size; k++)
            {
                tmp += At[(j + k) * M + i] * s_X[k]; // A[i][j+k] == At[j+k][i]
            }
        }
        __syncthreads();
    }
    // tail tile (when N is not a multiple of block_size)
    if (j_last_block_index != N)
    {
        if (j_last_block_index + threadIdx.x < N) // don't read past the end of X
            s_X[threadIdx.x] = X[j_last_block_index + threadIdx.x];
        __syncthreads();
        if (i < M)
        {
            for (int k = 0; k < (N - j_last_block_index); k++)
            {
                tmp += At[(j_last_block_index + k) * M + i] * s_X[k];
            }
        }
    }
    if (i < M)
        Y[i] = tmp;
}
bool verifyResult(float *h_Y_base , float *h_Y , int M)
{
    // check the relative floating-point error
    double eps = 1.e-6;
bool correct = true;
for (int i = 0; i < M; i++)
{
double abs_err = fabs(h_Y[i] - h_Y_base[i]);
double dot_length = M;
double abs_val = fabs(h_Y[i]);
double rel_err = abs_err / abs_val / dot_length;
if (rel_err > eps)
{
printf("Error! Y[%d]=%.8f, ref=%.8f error term is > %E\n", i, h_Y[i], h_Y_base[i], eps);
correct = false;
break;
}
}
return correct;
}
int main(int argc, char **argv)
{
size_t M = 1 << 14;
size_t N = 1 << 14;
if (argc == 3)
{
M = 1 << (atoi(argv[1]));
N = 1 << (atoi(argv[2]));
}
float *h_A = (float *)malloc(M * N * sizeof(float));
float *h_At = (float *)malloc(N * M * sizeof(float));
float *h_X = (float *)malloc(N * sizeof(float));
float *h_Y = (float *)malloc(M * sizeof(float));
float *h_Y_base = (float *)malloc(M * sizeof(float));
float *d_A;
float *d_At;
float *d_X;
float *d_Y;
cudaMalloc(&d_A, M * N * sizeof(float));
cudaMalloc(&d_At, M * N * sizeof(float));
cudaMalloc(&d_X, N * sizeof(float));
cudaMalloc(&d_Y, M * sizeof(float));
    double msPerMatrixVectorMul[5] = {0, 0, 0, 0, 0}; // average times of the 5 variants
    double gflops[5] = {0, 0, 0, 0, 0};               // throughput of the 5 variants
    double flopsPerMatrixVectorMul = 2.0 * M * N; // M outputs, each taking N multiplies and N-1 adds (~2N flops)
    // initialize the host arrays A (plus its transpose At) and X
for (int j = 0; j < N; j++)
{
h_X[j] = (float)j/N;
for (int i = 0; i < M; i++)
{
h_At[j * M + i] = h_A[i * N + j] = 1.0;
}
}
printf("矩阵A大小: %d*%d, 向量X大小: %d, 向量Y大小: %d\n", M, N, N, M);
    // copy the inputs to the device
cudaMemcpy(d_A, h_A, sizeof(float) * M * N, cudaMemcpyHostToDevice);
cudaMemcpy(d_At, h_At, sizeof(float) * N * M, cudaMemcpyHostToDevice);
cudaMemcpy(d_X, h_X, sizeof(float) * N, cudaMemcpyHostToDevice);
    float msTotal = 0; // elapsed time per experiment (ms)
    int nIter = 10;    // number of timed repetitions
    const int block_size = 1 << 7;
    int grid_size = (M + block_size - 1) / block_size;
    // set up timing
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
//(1) cublas
cublasHandle_t blas_handle;
cublasCreate(&blas_handle);
float alpha = 1.0, beta = 0;
    cudaMemset(d_Y, 0, M * sizeof(float)); // beta = 0, so Y just needs a defined state
cudaEventRecord(start);
for (int i = 0; i < nIter; i++)
{
cublasSgemv(blas_handle, CUBLAS_OP_T,
N, M, &alpha, d_A,
N, d_X, 1, &beta, d_Y, 1);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&msTotal, start, stop);
cublasDestroy(blas_handle);
cudaMemcpy(h_Y_base, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
msPerMatrixVectorMul[0] = msTotal / nIter;
gflops[0] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[0] / 1000.0f);
    // print performance results
    printf("cublas performance= %.2f GFlop/s, time= %.3f ms\n\n",
gflops[0],
msPerMatrixVectorMul[0]);
    //(2) naive implementation
    cudaMemset(d_Y, 0, M * sizeof(float));
cudaEventRecord(start);
for (int i = 0; i < nIter; i++)
{
sgemvNaive<<<grid_size, block_size>>>(d_A, d_X, d_Y, M, N);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&msTotal, start, stop);
cudaMemcpy(h_Y, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
    msPerMatrixVectorMul[1] = msTotal / nIter; // average time
    gflops[1] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[1] / 1000.0f); // GFlop/s
    printf("sgemvNaive performance= %.2f GFlop/s, time= %.3f ms, ",
gflops[1],
msPerMatrixVectorMul[1]);
printf("%s\n", verifyResult(h_Y_base , h_Y , M) ? "精度校验通过" : "精度校验未通过");
printf("达到了cublas库 %f%的计算性能\n\n", gflops[1] / gflops[0] * 100.0);
//(3) Coalesced
    memset(h_Y, 0, M * sizeof(float));
cudaMemcpy(d_Y, h_Y, M * sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(start);
for (int i = 0; i < nIter; i++)
{
sgemvCoalesced<<<grid_size, block_size>>>(d_At, d_X, d_Y, M, N);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&msTotal, start, stop);
cudaMemcpy(h_Y, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
    msPerMatrixVectorMul[2] = msTotal / nIter; // average time
    gflops[2] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[2] / 1000.0f); // GFlop/s
    printf("sgemvCoalesced performance= %.2f GFlop/s, time= %.3f ms, ",
gflops[2],
msPerMatrixVectorMul[2]);
printf("%s\n", verifyResult(h_Y_base , h_Y , M) ? "精度校验通过" : "精度校验未通过");
printf("达到了cublas库 %f%的计算性能\n\n", gflops[2] / gflops[0] * 100.0);
//(4) Constant memory
    if (N <= (1 << 14)) // skip the constant-memory variant if X does not fit in constant memory
{
cudaMemcpyToSymbol(d_CX, h_X, sizeof(float) * N);
        memset(h_Y, 0, M * sizeof(float));
cudaMemcpy(d_Y, h_Y, M * sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(start);
for (int i = 0; i < nIter; i++)
{
sgemvConstant<<<grid_size, block_size>>>(d_At, d_X, d_Y, M, N);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&msTotal, start, stop);
cudaMemcpy(h_Y, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
        msPerMatrixVectorMul[3] = msTotal / nIter; // average time
        gflops[3] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[3] / 1000.0f); // GFlop/s
        printf("sgemvConstant performance= %.2f GFlop/s, time= %.3f ms, ",
gflops[3],
msPerMatrixVectorMul[3]);
printf("%s\n", verifyResult(h_Y_base , h_Y , M) ? "精度校验通过" : "精度校验未通过");
printf("达到了cublas库 %f%的计算性能\n\n", gflops[3] / gflops[0] * 100.0);
}
//(5) shared memory
    memset(h_Y, 0, M * sizeof(float));
cudaMemcpy(d_Y, h_Y, M * sizeof(float), cudaMemcpyHostToDevice);
cudaEventRecord(start);
for (int i = 0; i < nIter; i++)
{
sgemvSharedMemory<block_size><<<grid_size, block_size>>>(d_At, d_X, d_Y, M, N);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);
cudaEventElapsedTime(&msTotal, start, stop);
cudaMemcpy(h_Y, d_Y, M * sizeof(float), cudaMemcpyDeviceToHost);
    msPerMatrixVectorMul[4] = msTotal / nIter; // average time
    gflops[4] = (flopsPerMatrixVectorMul * 1.0e-9f) / (msPerMatrixVectorMul[4] / 1000.0f); // GFlop/s
    printf("sgemvSharedMemory performance= %.2f GFlop/s, time= %.3f ms, ",
gflops[4],
msPerMatrixVectorMul[4]);
printf("%s\n", verifyResult(h_Y_base , h_Y , M) ? "精度校验通过" : "精度校验未通过");
printf("达到了cublas库 %f%的计算性能\n\n", gflops[4] / gflops[0] * 100.0);
cudaFree(d_A);
cudaFree(d_At);
cudaFree(d_X);
cudaFree(d_Y);
    free(h_A);
    free(h_At);
free(h_X);
free(h_Y);
free(h_Y_base);
}