Making Faster R-CNN support the TX1's fp16 (half float, float16) capability



Why use float16?

In one sentence: float16 arithmetic runs at twice the speed of float32. That's nVidia's claim, so don't blame me if the number is off (≖ ‿ ≖)✧

Now the downside, also in one sentence: you lose precision. (Well, obviously (*´Д`*))


How much do you lose?

IEEE 754 (wiki) spells out the rules for every float format; let me take this chance to look down on Baidu Baike. 눈_눈

float32: negative values from -3.4028235E+38 to -1.401298E-45, positive values from 1.401298E-45 to 3.4028235E+38

float16: half precision occupies 2 bytes: 1 sign bit, 5 exponent bits, and 10 mantissa bits, so the resolution around 1.0 is roughly 0.001. As for the range — work it out yourself (´・ω・`); if you can't, go ask your teacher (。◕ˇ∀ˇ◕), or just peek at the sketch below.
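
OK, fine, here is a quick back-of-the-envelope check of what those 1/5/10 bits work out to (plain C++, nothing CUDA-specific; this little program is my own illustration, not part of the original test):

#include <cmath>
#include <cstdio>

/* fp16 limits derived from the 1 sign / 5 exponent / 10 mantissa layout */
int main() {
    double max_normal    = (2.0 - std::pow(2.0, -10)) * std::pow(2.0, 15);  /* 65504 */
    double min_normal    = std::pow(2.0, -14);                              /* ~6.10e-5 */
    double min_subnormal = std::pow(2.0, -24);                              /* ~5.96e-8 */
    std::printf("fp16 max normal    : %g\n", max_normal);
    std::printf("fp16 min normal    : %g\n", min_normal);
    std::printf("fp16 min subnormal : %g\n", min_subnormal);
    return 0;
}

So representable magnitudes run from roughly 6e-8 up to 65504 — a much narrower band than float32, which is exactly where the precision loss comes from.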

So it looks like it can still hold its own against float32 (hey, hey, is it really OK to jump to that conclusion?)

Since it's supposedly that good, and I've got a TX1 on hand anyway, let's just try it (fine, no more explanations)! The plan: a giant matrix multiply — which, as it happens, is about the only float16 operation CUDA currently provides anyway _(:3 」∠)_

Code first. Feel free to skip straight past it; it's a bit long and a bit messy, and you're welcome to mind, because I didn't write it (。◕ˇ∀ˇ◕)

#include <algorithm>
#include <iostream>
#include <time.h>
#include <string.h>     /* memcpy */
#include <stdlib.h>     /* malloc, free, rand */
#include <cublas.h>
#include <cublas_v2.h>
#include <assert.h>
#include <stdio.h>
#include <cuda_fp16.h>
using namespace std;


#define  IDX2C(i,j,leading) (((j)*(leading))+(i))


typedef struct _data *PDATA;
typedef struct _data
{
    int _rows;
    int _cols;
    float *data;
} Data;

typedef struct _hdata *PHDATA;
typedef struct _hdata
{
    int _rows;
    int _cols;
    half *data;
} HData;
void free_mat(PDATA mat)
{
    free(mat->data);
    free(mat);
}
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;

half uint16_as_fp16 (uint16_t a)
{
    half res;
#if defined (__cplusplus)
    memcpy (&res, &a, sizeof (res));
#else /* __cplusplus */
    volatile union {
        half f;
        uint16_t i;
    } cvt;
    cvt.i = a;
    res = cvt.f;
#endif /* __cplusplus */
    return res;
}

uint32_t fp32_as_uint32 (float a)
{
    uint32_t res;
#if defined (__cplusplus)
    memcpy (&res, &a, sizeof (res));
#else /* __cplusplus */
    volatile union {
        float f;
        uint32_t i;
    } cvt;
    cvt.f = a;
    res = cvt.i;
#endif /* __cplusplus */
    return res;
}

/* host version of device function __float2half_rn() */
half float2half_rn (float a)
{
    uint32_t ia = fp32_as_uint32 (a);
    uint16_t ir;

    ir = (ia >> 16) & 0x8000;
    if ((ia & 0x7f800000) == 0x7f800000) {
        if ((ia & 0x7fffffff) == 0x7f800000) {
            ir |= 0x7c00; /* infinity */
        } else {
            ir = 0x7fff; /* canonical NaN */
        }
    } else if ((ia & 0x7f800000) >= 0x33000000) {
        int shift = (int)((ia >> 23) & 0xff) - 127;
        if (shift > 15) {
            ir |= 0x7c00; /* infinity */
        } else {
            ia = (ia & 0x007fffff) | 0x00800000; /* extract mantissa */
            if (shift < -14) { /* denormal */  
                ir |= ia >> (-1 - shift);
                ia = ia << (32 - (-1 - shift));
            } else { /* normal */
                ir |= ia >> (24 - 11);
                ia = ia << (32 - (24 - 11));
                ir = ir + ((14 + shift) << 10);
            }
            /* IEEE-754 round to nearest, ties to even */
            if ((ia > 0x80000000) || ((ia == 0x80000000) && (ir & 1))) {
                ir++;
            }
        }
    }
    return uint16_as_fp16 (ir);
}

PHDATA mat_product(PHDATA mat1,PHDATA mat2)
{
    if(mat1->_cols!=mat2->_rows)
    {
        printf("matrix dimensions do not match\n");
        return NULL;
    }
    PHDATA mat3=new HData;
    mat3->data=(half *)malloc(sizeof(half)*(mat1->_rows)*(mat2->_cols));
    mat3->_rows=mat1->_rows;
    mat3->_cols=mat2->_cols;
    /*
     * run the multiplication on the GPU;
     * all matrices are stored column-major, as cuBLAS expects
     */
    {
        half *d_a,*d_b,*d_c;
        cublasInit();
        cublasAlloc((mat1->_cols)*(mat1->_rows),sizeof(half),(void **)&d_a);
        cublasAlloc((mat2->_cols)*(mat2->_rows),sizeof(half),(void **)&d_b);
        cublasAlloc((mat3->_rows)*(mat3->_cols),sizeof(half),(void **)&d_c);
        cudaMemcpy(d_a,mat1->data,sizeof(half)*(mat1->_cols)*(mat1->_rows),cudaMemcpyHostToDevice);
        cudaMemcpy(d_b,mat2->data,sizeof(half)*(mat2->_rows)*(mat2->_cols),cudaMemcpyHostToDevice);
        cublasHandle_t handle;
        cublasCreate(&handle);
        half alpha=float2half_rn(float(1.0));
        half beta=float2half_rn(float(0.0));
        cudaEvent_t start,stop;
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
        cudaEventRecord(start,0);
        cublasHgemm(handle,CUBLAS_OP_N,CUBLAS_OP_N,mat1->_rows,mat2->_cols,
            mat2->_rows,&alpha,d_a,mat1->_rows,d_b,mat2->_rows,&beta,d_c,mat1->_rows);
        cudaEventRecord(stop,0);
        cudaEventSynchronize(stop);

        float ela=0;
        cudaEventElapsedTime(&ela,start,stop);
        cout<<"GPU: "<<ela<<"ms"<<endl;
        cudaMemcpy(mat3->data,d_c,sizeof(half)*(mat3->_rows)*(mat3->_cols),cudaMemcpyDeviceToHost);
        cublasFree(d_a);
        cublasFree(d_b);
        cublasFree(d_c);
        cublasDestroy(handle);
        cublasShutdown();
    }
    /* mat3 is stored column-major; transpose it if a row-major result is needed */
    return mat3;
}

void ele_mat_show(PDATA mat)
{
    for (int i=0;i<mat->_rows;i++){
        for (int j=0;j<mat->_cols;j++){
            cout<<mat->data[IDX2C(i,j,mat->_rows)]<<"\t";
        }
        cout<<endl;
    }
}
float myrand()
{
    return rand()%10;
}
int main()
{
    //clock_t start,end;

#if 0
    for (int i=0;i<M*N;i++)
    {
        cout<<c[i]<<"\t";
    }
    cout<<endl;
#endif

    PDATA mat1,mat2,mat3;
    PHDATA mat4,mat5,mat6;
    /* remember to allocate the structs the pointers refer to */
    mat1=(PDATA)malloc(sizeof(Data));
    mat2=(PDATA)malloc(sizeof(Data));
    mat3=(PDATA)malloc(sizeof(Data));
    mat4=(PHDATA)malloc(sizeof(HData));
    mat5=(PHDATA)malloc(sizeof(HData));
    mat6=(PHDATA)malloc(sizeof(HData));
    
    mat1->_rows=5000;
    mat1->_cols=50000;
    mat4->_rows=5000;
    mat4->_cols=50000;
    mat1->data=(float *)malloc(sizeof(float)*mat1->_rows*mat1->_cols);
    mat4->data=(half *)malloc(sizeof(half)*mat1->_rows*mat1->_cols);
    for (int i=0;i<mat1->_rows;i++)
        for (int j=0;j<mat1->_cols;j++)
            mat1->data[IDX2C(i,j,mat1->_rows)]=i+j;
    for (int i=0;i<mat1->_rows;i++)
        for (int j=0;j<mat1->_cols;j++)
            mat4->data[IDX2C(i,j,mat1->_rows)]=float2half_rn(mat1->data[IDX2C(i,j,mat1->_rows)]);

    mat2->_rows=50000;
    mat2->_cols=2000;
    mat5->_rows=50000;
    mat5->_cols=2000;
    mat2->data=(float *)malloc(sizeof(float)*mat2->_rows*mat2->_cols);
    mat5->data=(half *)malloc(sizeof(half)*mat2->_rows*mat2->_cols);
    for (int i=0;i<mat2->_rows;i++)
        for (int j=0;j<mat2->_cols;j++)
            mat2->data[IDX2C(i,j,mat2->_rows)]=i+j;
    for (int i=0;i<mat2->_rows;i++)
        for (int j=0;j<mat2->_cols;j++)
            mat5->data[IDX2C(i,j,mat2->_rows)]=float2half_rn(mat2->data[IDX2C(i,j,mat2->_rows)]);

    mat6=mat_product(mat4,mat5);

    return 0;
}
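
For the record, a build line along these lines should work (the file name is just whatever you saved it as; -arch=sm_53 targets the TX1's Maxwell GPU and -lcublas links cuBLAS):

nvcc -O2 -arch=sm_53 hgemm_test.cu -o hgemm_test -lcublas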


After a long wait, the run finished. So what's the result? (´・ω・`)

Well damn (╯‵□′)╯︵┻━┻ it really is about twice as fast. nVidia, you win.

With that settled, time to get to work: make Faster R-CNN use the TX1's half-precision arithmetic.


How to do it

1. Environment: CUDA 7.5, Python, a TX1, and so on
2. Get Caffe to support half-precision arithmetic (see the GEMM sketch right after this list)
3. Add the layers Faster R-CNN introduces: roi_pooling_layer and smooth_L1_loss_layer
4. Test (results will be published on this blog once they are in)
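
For step 2, the heart of the change is getting Caffe's GEMM wrapper to call cublasHgemm. Below is a minimal sketch of what such a wrapper could look like — my own sketch, not the actual fp16 Caffe code, and the function name gpu_hgemm is made up. The one trick is the same one Caffe's float/double wrappers already use: Caffe stores blobs row-major while cuBLAS is column-major, so A and B (and M and N) are swapped.

#include <cublas_v2.h>
#include <cuda_fp16.h>

/* Row-major C(MxN) = alpha * A(MxK) * B(KxN) + beta * C, in half precision. */
void gpu_hgemm(cublasHandle_t handle,
               bool trans_a, bool trans_b,
               int M, int N, int K,
               const half* alpha, const half* A,
               const half* B, const half* beta, half* C) {
    /* leading dimensions of the row-major inputs */
    int lda = trans_a ? M : K;
    int ldb = trans_b ? K : N;
    cublasOperation_t opA = trans_a ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasOperation_t opB = trans_b ? CUBLAS_OP_T : CUBLAS_OP_N;
    /* swap A and B so cuBLAS's column-major result reads back as our row-major C */
    cublasHgemm(handle, opB, opA, N, M, K,
                alpha, B, ldb, A, lda, beta, C, N);
}

With something like this in place, a half specialization of Caffe's math functions can forward to it; of course the rest of the math functions (gemv, axpy, and friends) need the same treatment.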

Current status

1. Faster R-CNN runs on the TX1.
2. MNIST runs with half-precision arithmetic on the TX1.
3. roi_pooling_layer and smooth_L1_loss_layer are being rewritten (a taste of the kind of change involved is sketched below).
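
The fiddly part of rewriting roi_pooling_layer is that, on CUDA 7.5, the half type has no comparison operators in device code, so the max-pooling compare has to detour through float. A toy sketch of just that idea — my own illustration, not the real layer; the kernel below max-pools fixed-size windows instead of real ROIs:

#include <cuda_fp16.h>

/* pick the larger of two halfs by widening to float,
   since operator> is not defined for half on this toolkit */
__device__ inline half hmax(half a, half b) {
    return (__half2float(a) > __half2float(b)) ? a : b;
}

/* each thread max-pools one fixed-size window of a half-precision input */
__global__ void window_max_half(const half* in, half* out,
                                int num_windows, int window_size) {
    int w = blockIdx.x * blockDim.x + threadIdx.x;
    if (w >= num_windows) return;
    half best = in[w * window_size];            /* windows are never empty */
    for (int i = 1; i < window_size; ++i)
        best = hmax(best, in[w * window_size + i]);
    out[w] = best;
}

smooth_L1_loss_layer needs the same kind of treatment for its absolute-value and threshold comparisons.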

Related links


