cublas vs nervana


Benchmarking the performance of nervana against cublas:

Matrix A: [M, K]

Matrix B: [K, N]

Matrix C: [M, N]

C = A * B
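
As a reference for what both libraries are asked to compute, a naive row-major CPU version of this product looks like the following (a minimal sketch for sanity-checking results; the name sgemm_reference is just for illustration and is not part of the benchmark):

// Naive row-major reference: C[M,N] = A[M,K] * B[K,N]
void sgemm_reference(const float *A, const float *B, float *C,
                     int M, int N, int K)
{
    for (int i = 0; i < M; ++i) {
        for (int j = 0; j < N; ++j) {
            float sum = 0.0f;
            for (int p = 0; p < K; ++p) {
                sum += A[i * K + p] * B[p * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}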

code:

test_nervana_vs_cublas.cpp

#include <iostream>
#include <nervana_c_api.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>

using namespace std;

// stringify the NERVANA_PATH macro passed in from the Makefile
#define S(x) #x
#define TOSTRING(x) S(x)

#define H2D cudaMemcpyHostToDevice
#define D2H cudaMemcpyDeviceToHost

const char* nervana_path = nullptr;

int test_cublas(cublasHandle_t cublas_state, float *A, float *B, float *C, int m, int n, int k);
int test_nervana(float *A, float *B, float *C, int m, int n, int k);

int main(int argc, char** argv)
{

    int M, N, K;
    M = 1024;
    N = 1024;
    K = 1024;
    cudaSetDevice(0);
    cudaError_t res = cudaFree(0);
    if (res != cudaSuccess) {
        std::cout << "CUDA did not initialize correctly" << std::endl;
        exit(1);
    }

    //nervana load
    if (!nervana_loadKernels(nervana_path != nullptr? nervana_path: TOSTRING(NERVANA_PATH))){
        std::cerr << "Couldn't load all kernels" << std::endl;
        exit(1);
    }

    //init cublas handle
    cublasHandle_t cublas_state;
    cublasCreate(&cublas_state);
    cublasSetStream(cublas_state, nullptr);

    float *d_a, *d_b, *d_c;
    float *h_a, *h_b, *h_c;

    cudaMalloc(&d_a, sizeof(float) * M * K);
    cudaMalloc(&d_b, sizeof(float) * K * N);
    cudaMalloc(&d_c, sizeof(float) * M * N);

    h_a = (float *)malloc(sizeof(float) * M * K);
    h_b = (float *)malloc(sizeof(float) * K * N);
    h_c = (float *)malloc(sizeof(float) * M * N);

    //init host matrices
    std::cerr << "init matrix a" << std::endl;
    for(int i=0; i<M; ++i){
        for(int j=0; j<K; ++j){
            h_a[i*K + j] = i*K + j;
        }
    }
    std::cerr << "init matrix b" << std::endl;
    for(int i=0; i<K; ++i){
        for(int j=0; j<N; ++j){
            h_b[i*N + j] = i*N + j ;
        }
    }
    for(int i=0; i<M; ++i){
        for(int j=0; j<N; ++j){
            h_c[i*N + j] = 0.0 ;
        }
    }

    cudaMemcpy(d_a, h_a, sizeof(float) * M * K, H2D);
    cudaMemcpy(d_b, h_b, sizeof(float) * K * N, H2D);
    cudaMemcpy(d_c, h_c, sizeof(float) * M * N, H2D);

    int test_num = 1000;

    //nervana gemm
    for(int t=0; t< test_num; ++t){
        test_nervana(d_a, d_b, d_c, M, N, K);
        cudaDeviceSynchronize();
    }

    //cublas gemm
   
    for(int t=0; t< test_num; ++t){
        test_cublas(cublas_state,d_a, d_b, d_c, M, N, K);
        cudaDeviceSynchronize();
    }
   
    cudaMemcpy(h_c, d_c, sizeof(float) * M * N, D2H);
    

    //nervana unload
    if (!nervana_unloadKernels()){
        std::cerr << "unload kernel failed" << std::endl;
        exit(-1);
    }

    //cublas destroy
    cublasDestroy(cublas_state);

    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(h_a);
    free(h_b);
    free(h_c);

    return 0;
}

//cublas assumes column-major storage; with row-major A, B, C we compute
//C^T = B^T * A^T by swapping the operands, which is exactly the row-major C = A * B
int test_cublas(cublasHandle_t cublas_state, float *A, float *B, float *C, int m, int n, int k)
{
    float alpha = 1.0;
    float beta = 0.0;

    cublasSgemm(cublas_state, CUBLAS_OP_N, CUBLAS_OP_N,
                n, m, k, &alpha,
                B, n,
                A, k,
                &beta,
                C, n);
    return 0;
}

//nervana uses row-major storage; k, n, n are the row strides of A, B, C
int test_nervana(float *A, float *B, float *C, int m, int n, int k)
{
    if (!nervana_sgemm(A, B, C,
            false, false,
            m, n, k,
            k, n, n,
            1.0, 0.0,
            NULL,
            false, false,
            0)) {
        std::cerr << "nervana_sgemm run error!!!" << std::endl;
    }

    return 0;
}
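
Note that the two loops in main only launch the kernels back to back; they do not record any timings by themselves. One way to measure the time per GEMM is to wrap each loop in CUDA events. A minimal sketch (the placement and variable names are illustrative, not part of the original code):

// time one benchmark loop with CUDA events (result in milliseconds)
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);

cudaEventRecord(start);
for (int t = 0; t < test_num; ++t) {
    test_nervana(d_a, d_b, d_c, M, N, K);
}
cudaEventRecord(stop);
cudaEventSynchronize(stop);

float ms = 0.0f;
cudaEventElapsedTime(&ms, start, stop);
std::cout << "nervana: " << ms / test_num << " ms per GEMM" << std::endl;

cudaEventDestroy(start);
cudaEventDestroy(stop);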

Makefile:

OBJ = test_nervana_vs_cublas
CC = nvcc

FLAGS = -gencode=arch=compute_50,code=sm_50   -std=c++11  -DNERVANA_PATH="../nervana/cubin/"

INCLUDE_DIR = -I../

LIBS = ../libnervana.a -lcuda -lcublas

all:
    $(CC) $(FLAGS) test_nervana_vs_cublas.cpp -o $(OBJ)  $(INCLUDE_DIR) $(LIBS)
clean:
    rm -rf $(OBJ)
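
Assuming libnervana.a sits one directory up and the precompiled cubins live under ../nervana/cubin/ (the paths the flags above expect), running make should produce the test_nervana_vs_cublas binary, which can then be run directly on the target GPU. The -gencode=arch=compute_50,code=sm_50 flag corresponds to compute capability 5.0, which matches the Quadro K1200 used below.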

Test results:

1. Test platform: GPU Quadro K1200

2. Vary M and measure the performance of both libraries, as shown in the figure below:

[Figure: GEMM performance of nervana and cublas as M varies]

3. Observations:

   1. cublas: when M is not divisible by 32, cublas performance drops sharply.
   2. nervana outperforms cublas.

Notes:

1. cublas stores data in column-major order.

2. nervana stores data in row-major order.
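
Concretely, the element at row i, column j of an m x n matrix sits at a different offset in the two layouts. A small illustration (not part of the test code):

// element (i, j) in the two storage orders, for an m x n matrix of floats
float elem_row_major(const float *data, int i, int j, int n)
{
    return data[i * n + j];   // rows are contiguous (nervana)
}

float elem_col_major(const float *data, int i, int j, int m)
{
    return data[j * m + i];   // columns are contiguous (cublas)
}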

