测试nervana 与 cublas的性能:
矩阵A:[M,K]
矩阵B:[K,N]
矩阵C:[M,N]
C = A * B
code:
test_nervana_vs_cublas.cpp
#include <iostream>
#include <nervana_c_api.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
using namespace std;
#define S(x) #x
#define TOSTRING(x) S(x)
#define H2D cudaMemcpyHostToDevice
#define D2H cudaMemcpyDeviceToHost
const char* nervana_path = nullptr;
int test_cublas(cublasHandle_t cublas_state, float *A ,float *B,float *C, int m, int n, int k);
int test_nervana(float *A ,float *B,float *C, int m, int n, int k);
int main(int argc, char** argv)
{
int M ,N ,K;
M = 1024;
N = 1024;
K = 1024;
cudaSetDevice(0);
cudaError_t res = cudaFree(0);
if (res != cudaSuccess) {
std::cout << "CUDA did not initialize correctly" << std::endl;
exit(1);
}
//nervana load
if (!nervana_loadKernels(nervana_path != nullptr? nervana_path: TOSTRING(NERVANA_PATH))){
std::cerr << "Couldn't load all kernels" << std::endl;
exit(1);
}
//init cublas handle
cublasHandle_t cublas_state;
cublasCreate(&cublas_state);
cublasSetStream(cublas_state, nullptr);
float *d_a, *d_b, *d_c;
float *h_a, *h_b, *h_c;
cudaMalloc(&d_a, sizeof(float) * M * K);
cudaMalloc(&d_b, sizeof(float) * K * N);
cudaMalloc(&d_c, sizeof(float) * M * N);
h_a = (float *)malloc(sizeof(float) * M * K);
h_b = (float *)malloc(sizeof(float) * K * N);
h_c = (float *)malloc(sizeof(float) * M * N);
//init
std::cerr<<"matrix a :"<<std::endl;
for(int i=0; i<M; ++i){
for(int j=0; j<K; ++j){
h_a[i*K + j] = i*K + j;
}
}
std::cerr<<"matrix b :"<<std::endl;
for(int i=0; i<K; ++i){
for(int j=0; j<N; ++j){
h_b[i*N + j] = i*N + j ;
}
}
for(int i=0; i<M; ++i){
for(int j=0; j<N; ++j){
h_c[i*N + j] = 0.0 ;
}
}
cudaMemcpy(d_a, h_a, sizeof(float) * M * K, H2D);
cudaMemcpy(d_b, h_b, sizeof(float) * K * N, H2D);
cudaMemcpy(d_c, h_c, sizeof(float) * M * N, H2D);
int test_num = 1000;
//nervana gemm,
for(int t=0; t< test_num; ++t){
test_nervana(d_a, d_b, d_c, M, N, K);
cudaDeviceSynchronize();
}
//cublas gemm
for(int t=0; t< test_num; ++t){
test_cublas(cublas_state,d_a, d_b, d_c, M, N, K);
cudaDeviceSynchronize();
}
cudaMemcpy(h_c, d_c, sizeof(float) * M * N, D2H);
//nervana unload
if (!nervana_unloadKernels()){
std::cout << "unload kernel failed" << endl;
exit(-1);
}
//cublas destory
cublasDestroy(cublas_state);
cudaFree(d_a);
cudaFree(d_b);
cudaFree(d_c);
free(h_c);
return 0;
}
// Row-major C = A*B computed with the column-major cuBLAS API.
// Trick: a row-major [M,N] matrix is bit-identical to a column-major [N,M]
// matrix, so we ask cuBLAS for C^T = B^T * A^T by passing (B, A) with the
// m/n dimensions swapped; the result lands in C in row-major layout.
// Returns 0 on success, -1 if cublasSgemm reports an error.
int test_cublas(cublasHandle_t cublas_state, float *A ,float *B,float *C, int m, int n, int k)
{
    float alpha = 1.0f;
    float beta = 0.0f;
    // Original ignored the status; cuBLAS errors are otherwise silent.
    cublasStatus_t st = cublasSgemm(cublas_state, CUBLAS_OP_N, CUBLAS_OP_N,
                                    n, m, k, &alpha,
                                    B, n,   // "A" operand = B, ld = n
                                    A, k,   // "B" operand = A, ld = k
                                    &beta,
                                    C, n);  // ld of row-major C is n
    if (st != CUBLAS_STATUS_SUCCESS) {
        std::cerr << "cublasSgemm run error!!! " << std::endl;
        return -1;
    }
    return 0;
}
// Row-major C = A*B via nervana_sgemm (the Nervana kernels take row-major
// data directly, so no transpose trick is needed).
// Leading dimensions for dense row-major matrices: lda=k, ldb=n, ldc=n.
// Returns 0 on success, -1 if nervana_sgemm reports failure.
int test_nervana(float *A ,float *B,float *C, int m, int n, int k)
{
    if (!nervana_sgemm(A, B, C,
                       false, false,   // no transpose of A or B
                       m, n, k,
                       k, n, n,        // lda, ldb, ldc
                       1.0f, 0.0f,     // alpha, beta (float API — use f literals)
                       NULL,           // stream: default
                       false, false,   // NOTE(review): presumably rounding/mode flags — confirm against nervana_c_api.h
                       0)) {
        std::cerr << "nervana_sgemm run error!!! " << std::endl;
        // Original logged the failure but still returned 0, hiding the error
        // from any caller that checks the return value.
        return -1;
    }
    return 0;
}
Makefile文件:
OBJ = test_nervana_vs_cublas
CC = nvcc
# NERVANA_PATH: shell strips the quotes, TOSTRING() in the source re-stringifies it.
FLAGS = -gencode=arch=compute_50,code=sm_50 -std=c++11 -DNERVANA_PATH="../nervana/cubin/"
INCLUDE_DIR = -I../
LIBS = ../libnervana.a -lcuda -lcublas

# Mark pseudo-targets so files named "all"/"clean" cannot shadow them.
.PHONY: all clean

all:
	$(CC) $(FLAGS) test_nervana_vs_cublas.cpp -o $(OBJ) $(INCLUDE_DIR) $(LIBS)
clean:
	rm -f $(OBJ)
测试结果:
1、测试平台GPU Quadro K1200
2、改变M的值,测试两者性能,如下图:
3、测试结果说明:
- 1、cublas:当M 不能被32整除时,cublas的性能急剧下降。
- 2、nervana 性能优于cublas
注意:
1、cublas的数据是列优先存储
2、nervana的数据是行优先存储