Making Faster R-CNN support the TX1's fp16 (half float, float16) capability



Why use float16?

In one sentence: float16 arithmetic runs at twice the speed of float32. That's nVidia's claim, so don't blame me if the number is off (≖ ‿ ≖)✧

Now the downside, also in one sentence: you lose precision. (Well, obviously (*´Д`*))


How much do you lose?

IEEE 754 (wiki) spells out the rules for every float format; let me take this chance to look down on Baidu Baike. 눈_눈

float32: negative values from -3.4028235E+38 to -1.401298E-45, positive values from 1.401298E-45 to 3.4028235E+38

float16: half precision occupies 2 bytes: 1 sign bit, 5 exponent bits, and 10 mantissa bits, so the resolution around 1.0 is roughly 0.001. As for the range — work it out yourself (´・ω・`); if you can't, go ask your teacher (。◕ˇ∀ˇ◕), or just peek at the sketch below.
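
OK, fine, here is a quick back-of-the-envelope check of what those 1/5/10 bits work out to (plain C++, nothing CUDA-specific; this little program is my own illustration, not part of the original test):

#include <cmath>
#include <cstdio>

/* fp16 limits derived from the 1 sign / 5 exponent / 10 mantissa layout */
int main() {
    double max_normal    = (2.0 - std::pow(2.0, -10)) * std::pow(2.0, 15);  /* 65504 */
    double min_normal    = std::pow(2.0, -14);                              /* ~6.10e-5 */
    double min_subnormal = std::pow(2.0, -24);                              /* ~5.96e-8 */
    std::printf("fp16 max normal    : %g\n", max_normal);
    std::printf("fp16 min normal    : %g\n", min_normal);
    std::printf("fp16 min subnormal : %g\n", min_subnormal);
    return 0;
}

So representable magnitudes run from roughly 6e-8 up to 65504 — a much narrower band than float32, which is exactly where the precision loss comes from.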

So it looks like it can still hold its own against float32 (hey, hey, is it really OK to jump to that conclusion?)

Since it's supposedly that good, and I've got a TX1 on hand anyway, let's just try it (fine, no more explanations)! The plan: a giant matrix multiply — which, as it happens, is about the only float16 operation CUDA currently provides anyway _(:3 」∠)_

Code first. Feel free to skip straight past it; it's a bit long and a bit messy, and you're welcome to mind, because I didn't write it (。◕ˇ∀ˇ◕)

#include <algorithm>
#include <iostream>
#include <time.h>
#include <string.h>     /* memcpy */
#include <stdlib.h>     /* malloc, free, rand */
#include <cublas.h>
#include <cublas_v2.h>
#include <assert.h>
#include <stdio.h>
#include <cuda_fp16.h>
using namespace std;


#define  IDX2C(i,j,leading) (((j)*(leading))+(i))


typedef struct _data *PDATA;
typedef struct _data
{
    int _rows;
    int _cols;
    float *data;
} Data;

typedef struct _hdata *PHDATA;
typedef struct _hdata
{
    int _rows;
    int _cols;
    half *data;
} HData;
void free_mat(PDATA mat)
{
    free(mat->data);
    free(mat);
}
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;

half uint16_as_fp16 (uint16_t a)
{
    half res;
#if defined (__cplusplus)
    memcpy (&res, &a, sizeof (res));
#else /* __cplusplus */
    volatile union {
        half f;
        uint16_t i;
    } cvt;
    cvt.i = a;
    res = cvt.f;
#endif /* __cplusplus */
    return res;
}

uint32_t fp32_as_uint32 (float a)
{
    uint32_t res;
#if defined (__cplusplus)
    memcpy (&res, &a, sizeof (res));
#else /* __cplusplus */
    volatile union {
        float f;
        uint32_t i;
    } cvt;
    cvt.f = a;
    res = cvt.i;
#endif /* __cplusplus */
    return res;
}

/* host version of device function __float2half_rn() */
half float2half_rn (float a)
{
    uint32_t ia = fp32_as_uint32 (a);
    uint16_t ir;

    ir = (ia >> 16) & 0x8000;
    if ((ia & 0x7f800000) == 0x7f800000) {
        if ((ia & 0x7fffffff) == 0x7f800000) {
            ir |= 0x7c00; /* infinity */
        } else {
            ir = 0x7fff; /* canonical NaN */
        }
    } else if ((ia & 0x7f800000) >= 0x33000000) {
        int shift = (int)((ia >> 23) & 0xff) - 127;
        if (shift > 15) {
            ir |= 0x7c00; /* infinity */
        } else {
            ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
            if (shift < -14) { /* denormal */  
                ir |= ia >> (-1 - shift);
                ia = ia << (32 - (-1 - shift));
            } else { /* normal */
                ir |= ia >> (24 - 11);
                ia = ia << (32 - (24 - 11));
                ir = ir + ((14 + shift) << 10);
            }
            /* IEEE-754 round to nearest, ties to even */
            if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
                ir++;
            }
        }
    }
    return uint16_as_fp16 (ir);
}

PHDATA mat_product(PHDATA mat1,PHDATA mat2)
{
    if(mat1->_cols!=mat2->_rows)
    {
        printf("matrix dimensions do not match\n");
        return NULL;
    }
    PHDATA mat3=new HData;
    mat3->data=(half *)malloc(sizeof(half)*(mat1->_rows)*(mat2->_cols));
    mat3->_rows=mat1->_rows;
    mat3->_cols=mat2->_cols;
    /*
     * run the multiplication on the GPU;
     * all matrices are stored column-major, as cuBLAS expects
     */
    {
        half *d_a,*d_b,*d_c;
        cublasInit();
        cublasAlloc((mat1->_cols)*(mat1->_rows),sizeof(half),(void **)&d_a);
        cublasAlloc((mat2->_cols)*(mat2->_rows),sizeof(half),(void **)&d_b);
        cublasAlloc((mat3->_rows)*(mat3->_cols),sizeof(half),(void **)&d_c);
        cudaMemcpy(d_a,mat1->data,sizeof(half)*(mat1->_cols)*(mat1->_rows),cudaMemcpyHostToDevice);
        cudaMemcpy(d_b,mat2->data,sizeof(half)*(mat2->_rows)*(mat2->_cols),cudaMemcpyHostToDevice);
        cublasHandle_t handle;
        cublasCreate(&handle);
        half alpha=float2half_rn(float(1.0));
        half beta=float2half_rn(float(0.0));
        cudaEvent_t start,stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start,0);
        cublasHgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,mat1->_rows,mat2->_cols,
            mat2->_rows,&alpha,d_a,mat1->_rows,d_b,mat2->_rows,&beta,d_c,mat1->_rows);
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);

        float ela=0;
        cudaEventElapsedTime(&ela,start,stop);
        cout<<"GPU: "<<ela<<"ms"<<endl;
        cudaMemcpy(mat3->data,d_c,sizeof(half)*(mat3->_rows)*(mat3->_cols),cudaMemcpyDeviceToHost);
        cublasFree(d_a);
        cublasFree(d_b);
        cublasFree(d_c);
        cublasDestroy(handle);
        cublasShutdown();
    }
    /* mat3 is stored column-major; transpose it if a row-major result is needed */
    return mat3;
}

void ele_mat_show(PDATA mat)
{
    for (int i=0;i<mat->_rows;i++){
        for (int j=0;j<mat->_cols;j++){
            cout<<mat->data[IDX2C(i,j,mat->_rows)]<<"\t";
        }
        cout<<endl;
    }
}
float myrand()
{
    return rand()%10;
}
int main()
{
    //clock_t start,end;

#if 0
    for (int i=0;i<M*N;i++)
    {
        cout<<c[i]<<"\t";
    }
    cout<<endl;
#endif

    PDATA mat1,mat2,mat3;
    PHDATA mat4,mat5,mat6;
    /* remember to allocate the structs the pointers refer to */
    mat1=(PDATA)malloc(sizeof(Data));
    mat2=(PDATA)malloc(sizeof(Data));
    mat3=(PDATA)malloc(sizeof(Data));
    mat4=(PHDATA)malloc(sizeof(HData));
    mat5=(PHDATA)malloc(sizeof(HData));
    mat6=(PHDATA)malloc(sizeof(HData));
    
    mat1->_rows=5000;
    mat1->_cols=50000;
    mat4->_rows=5000;
    mat4->_cols=50000;
    mat1->data=(float *)malloc(sizeof(float)*mat1->_rows*mat1->_cols);
    mat4->data=(half *)malloc(sizeof(half)*mat1->_rows*mat1->_cols);
    for (int i=0;i<mat1->_rows;i++)
        for (int j=0;j<mat1->_cols;j++)
            mat1->data[IDX2C(i,j,mat1->_rows)]=i+j;
    for (int i=0;i<mat1->_rows;i++)
        for (int j=0;j<mat1->_cols;j++)
            mat4->data[IDX2C(i,j,mat1->_rows)]=float2half_rn(mat1->data[IDX2C(i,j,mat1->_rows)]);

    mat2->_rows=50000;
    mat2->_cols=2000;
    mat5->_rows=50000;
    mat5->_cols=2000;
    mat2->data=(float *)malloc(sizeof(float)*mat2->_rows*mat2->_cols);
    mat5->data=(half *)malloc(sizeof(half)*mat2->_rows*mat2->_cols);
    for (int i=0;i<mat2->_rows;i++)
        for (int j=0;j<mat2->_cols;j++)
            mat2->data[IDX2C(i,j,mat2->_rows)]=i+j;
    for (int i=0;i<mat2->_rows;i++)
        for (int j=0;j<mat2->_cols;j++)
            mat5->data[IDX2C(i,j,mat2->_rows)]=float2half_rn(mat2->data[IDX2C(i,j,mat2->_rows)]);

    mat6=mat_product(mat4,mat5);

    return 0;
}
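
For the record, a build line along these lines should work (the file name is just whatever you saved it as; -arch=sm_53 targets the TX1's Maxwell GPU and -lcublas links cuBLAS):

nvcc -O2 -arch=sm_53 hgemm_test.cu -o hgemm_test -lcublas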


After a long wait, the run finished. So what's the result? (´・ω・`)

Well damn (╯‵□′)╯︵┻━┻ it really is about twice as fast. nVidia, you win.

With that settled, time to get to work: make Faster R-CNN use the TX1's half-precision arithmetic.


How to do it

1. Environment: CUDA 7.5, Python, a TX1, and so on
2. Get Caffe to support half-precision arithmetic (see the GEMM sketch right after this list)
3. Add the layers Faster R-CNN introduces: roi_pooling_layer and smooth_L1_loss_layer
4. Test (results will be published on this blog once they are in)
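
For step 2, the heart of the change is getting Caffe's GEMM wrapper to call cublasHgemm. Below is a minimal sketch of what such a wrapper could look like — my own sketch, not the actual fp16 Caffe code, and the function name gpu_hgemm is made up. The one trick is the same one Caffe's float/double wrappers already use: Caffe stores blobs row-major while cuBLAS is column-major, so A and B (and M and N) are swapped.

#include <cublas_v2.h>
#include <cuda_fp16.h>

/* Row-major C(MxN) = alpha * A(MxK) * B(KxN) + beta * C, in half precision. */
void gpu_hgemm(cublasHandle_t handle,
               bool trans_a, bool trans_b,
               int M, int N, int K,
               const half* alpha, const half* A,
               const half* B, const half* beta, half* C) {
    /* leading dimensions of the row-major inputs */
    int lda = trans_a ? M : K;
    int ldb = trans_b ? K : N;
    cublasOperation_t opA = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasOperation_t opB = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
    /* swap A and B so cuBLAS's column-major result reads back as our row-major C */
    cublasHgemm(handle, opB, opA, N, M, K,
                alpha, B, ldb, A, lda, beta, C, N);
}

With something like this in place, a half specialization of Caffe's math functions can forward to it; of course the rest of the math functions (gemv, axpy, and friends) need the same treatment.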

Current status

1. Faster R-CNN runs on the TX1.
2. MNIST runs with half-precision arithmetic on the TX1.
3. roi_pooling_layer and smooth_L1_loss_layer are being rewritten (a taste of the kind of change involved is sketched below).
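
The fiddly part of rewriting roi_pooling_layer is that, on CUDA 7.5, the half type has no comparison operators in device code, so the max-pooling compare has to detour through float. A toy sketch of just that idea — my own illustration, not the real layer; the kernel below max-pools fixed-size windows instead of real ROIs:

#include <cuda_fp16.h>

/* pick the larger of two halfs by widening to float,
   since operator> is not defined for half on this toolkit */
__device__ inline half hmax(half a, half b) {
    return (__half2float(a) > __half2float(b)) ? a : b;
}

/* each thread max-pools one fixed-size window of a half-precision input */
__global__ void window_max_half(const half* in, half* out,
                                int num_windows, int window_size) {
    int w = blockIdx.x * blockDim.x + threadIdx.x;
    if (w >= num_windows) return;
    half best = in[w * window_size];            /* windows are never empty */
    for (int i = 1; i < window_size; ++i)
        best = hmax(best, in[w * window_size + i]);
    out[w] = best;
}

smooth_L1_loss_layer needs the same kind of treatment for its absolute-value and threshold comparisons.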

Related links


