

                                                             f(x)=\begin{cases} x & \text{if } x \geq 0 \\ -x & \text{if } x<0 \end{cases}


/**
 * In-place forward pass of the absolute-value layer: every element x of the
 * blob is replaced by |x|.
 *
 * @param bottom_top_blob  blob that is read and overwritten in place
 * @param opt              run options; opt.num_threads sets the OpenMP thread count
 * @return 0 (this function has no failure path)
 *
 * NOTE(review): the block delimiters and the per-channel accessor
 * `channel(q)` were restored here; confirm against the Mat API.
 */
int AbsVal::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // Width, height and channel count of the input.
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    // Channels are processed independently, one OpenMP iteration each.
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        // Pointer to the q-th channel.
        float* ptr = bottom_top_blob.channel(q);

        // Flip the sign of every negative element.
        for (int i=0; i<size; i++)
        {
            if (ptr[i] < 0)
                ptr[i] = -ptr[i];
        }
    }

    return 0;
}



// 前向传播
int ArgMax::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
    int size =;

    // 默认(topk, 1)
    // 如果是top-1,获取最大值及序号
    // 否则,只获取序号
    if (out_max_val)
        top_blob.create(topk, 2, 4u, opt.blob_allocator);
        top_blob.create(topk, 1, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // 输入的指针
    const float* ptr = bottom_blob;

    // partial sort topk with index
    // optional value
    std::vector< std::pair<float, int> > vec;
    // 绑定对应索引
    for (int i=0; i<size; i++)
        vec[i] = std::make_pair(ptr[i], i);

    // 做一个排序
    std::partial_sort(vec.begin(), vec.begin() + topk, vec.end(),
                        std::greater< std::pair<float, int> >());

    float* outptr = top_blob;
    if (out_max_val)
        float* valptr = outptr + topk;
        // 获取值及序号
        for (int i=0; i<topk; i++)
            outptr[i] = vec[i].first;
            valptr[i] = vec[i].second;
        // 获取序列
        for (int i=0; i<topk; i++)
            outptr[i] = vec[i].second;

    return 0;


    使用 batch normalization 的原因是为了缓解 Internal Covariate Shift(ICS):即在深层网络训练的过程中,由于网络中参数变化而引起内部结点数据分布发生变化的现象。

    对每个特征进行独立的 normalization,使得第 l 层每个特征的输入分布都是 0 均值、方差为 1,从而缓解了 ICS。这样虽然让每一层网络的输入数据分布变得稳定,却导致了数据表达能力的缺失:也就是说,我们通过变换操作改变了原有数据的信息表达(representation ability of the network),使得底层网络学习到的参数信息丢失。另一方面,让每一层的输入分布均值为 0、方差为 1,会使得输入在经过 sigmoid 或 tanh 激活函数时,容易陷入非线性激活函数的线性区域(摘自参考资料[2])。因此,BN 引入两个可学习的参数(对应下面代码中的 slope 和 bias),对规范化后的数据进行线性变换,恢复数据本身的表达能力:

                                                  y = slope \cdot \frac{x - mean}{\sqrt{var + eps}} + bias


// 载入模型
int BatchNorm::load_model(const ModelBin& mb)
    // slope数据
    slope_data = mb.load(channels, 1);
    // 载入失败:返还-100
    if (slope_data.empty())
        return -100;

    // mean数据
    mean_data = mb.load(channels, 1);
    // 载入数据失败,返还-100
    if (mean_data.empty())
        return -100;

    // variance数据
    var_data = mb.load(channels, 1);
    // 载入数据失败,返还-100
    if (var_data.empty())
        return -100;

    // bias数据
    bias_data = mb.load(channels, 1);
    // 载入数据失败,返还-100
    if (bias_data.empty())
        return -100;

    // 创建矩阵
    if (a_data.empty())
        return -100;
    // 创建矩阵
    if (b_data.empty())
        return -100;

    for (int i=0; i<channels; i++)
        // sqrt variance
        float sqrt_var = sqrt(var_data[i] + eps);
        a_data[i] = bias_data[i] - slope_data[i] * mean_data[i] / sqrt_var;
        b_data[i] = slope_data[i] / sqrt_var;

    return 0;

/**
 * In-place forward pass of batch normalization using the folded coefficients:
 *
 *     a = bias - slope * mean / sqrt(var)
 *     b = slope / sqrt(var)
 *     value = b * value + a
 *
 * The coefficient index follows the blob rank: element index for 1-D blobs,
 * row index for 2-D blobs, channel index for 3-D blobs.
 *
 * @param bottom_top_blob  blob that is read and overwritten in place
 * @param opt              run options; opt.num_threads sets the OpenMP thread count
 * @return 0 (this function has no failure path)
 *
 * NOTE(review): the per-channel accessor `channel(q)` in the 3-D branch was
 * restored; confirm against the Mat API.
 */
int BatchNorm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int dims = bottom_top_blob.dims;

    // 1-D blob: one coefficient pair per element.
    if (dims == 1)
    {
        int w = bottom_top_blob.w;

        float* ptr = bottom_top_blob;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i=0; i<w; i++)
        {
            ptr[i] = b_data[i] * ptr[i] + a_data[i];
        }
    }

    // 2-D blob: one coefficient pair per row.
    if (dims == 2)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i=0; i<h; i++)
        {
            float* ptr = bottom_top_blob.row(i);
            float a = a_data[i];
            float b = b_data[i];

            for (int j=0; j<w; j++)
            {
                ptr[j] = b * ptr[j] + a;
            }
        }
    }

    // 3-D blob: one coefficient pair per channel.
    if (dims == 3)
    {
        int w = bottom_top_blob.w;
        int h = bottom_top_blob.h;
        int size = w * h;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);
            float a = a_data[q];
            float b = b_data[q];

            for (int i=0; i<size; i++)
            {
                ptr[i] = b * ptr[i] + a;
            }
        }
    }

    return 0;
}



/**
 * In-place forward pass of the bias layer: adds bias_data[q] to every element
 * of channel q.
 *
 * @param bottom_top_blob  blob that is read and overwritten in place
 * @param opt              run options; opt.num_threads sets the OpenMP thread count
 * @return 0 (this function has no failure path)
 *
 * NOTE(review): the per-channel accessor `channel(q)` was restored; confirm
 * against the Mat API.
 */
int Bias::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        // One bias value per channel.
        float bias = bias_data[q];

        for (int i=0; i<size; i++)
        {
            ptr[i] += bias;
        }
    }

    return 0;
}


5.二项正态对数似然(binomial normal log likelihood):BNLL

                                                              f(x) =log(1+e^{x})


/**
 * In-place forward pass of BNLL: f(x) = log(1 + e^x).
 *
 * For x > 0 the identity log(1 + e^x) = x + log(1 + e^-x) is used so that the
 * exponential is only ever evaluated on a non-positive argument and cannot
 * overflow.
 *
 * @param bottom_top_blob  blob that is read and overwritten in place
 * @param opt              run options; opt.num_threads sets the OpenMP thread count
 * @return 0 (this function has no failure path)
 *
 * NOTE(review): the `else` between the two formulas and the per-channel
 * accessor `channel(q)` were restored; confirm against the Mat API.
 */
int BNLL::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            if (ptr[i] > 0)
                ptr[i] = ptr[i] + log(1.f + exp(-ptr[i]));
            else
                ptr[i] = log(1.f + exp(ptr[i]));
        }
    }

    return 0;
}


int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
    // 不需要进行类型转换
    if (type_from == type_to)
        top_blob = bottom_blob;
        return 0;

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    size_t out_elemsize = elemsize;
    if (type_to == 1)
        // float32
        out_elemsize = 4 * elempack;
    else if (type_to == 2)
        // float16
        out_elemsize = 2 * elempack;
    else if (type_to == 3)
        // int8
        out_elemsize = elempack;

    // 为输出分配内存
    if (dims == 1)
        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
    else if (dims == 2)
        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
    else if (dims == 3)
        top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    int size = w * h * elempack;

    // float32转float16
    if (type_from == 1 && type_to == 2)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
            const float* ptr =;
            unsigned short* outptr =;

            for (int i=0; i<size; i++)
                outptr[i] = float32_to_float16(ptr[i]);

    // float16转float32
    if (type_from == 2 && type_to == 1)
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
            const unsigned short* ptr =;
            float* outptr =;

            for (int i=0; i<size; i++)
                outptr[i] = float16_to_float32(ptr[i]);

    // TODO more cast type

    return 0;


/**
 * In-place forward pass of the Clip layer: clamps every element into
 * the range [min, max].
 *
 * @param bottom_top_blob  blob that is read and overwritten in place
 * @param opt              run options; opt.num_threads sets the OpenMP thread count
 * @return 0 (this function has no failure path)
 *
 * NOTE(review): the per-channel accessor `channel(q)` was restored; confirm
 * against the Mat API.
 */
int Clip::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        // Clamp from below first, then from above.
        for (int i=0; i<size; i++)
        {
            if (ptr[i] < min)
                ptr[i] = min;
            if (ptr[i] > max)
                ptr[i] = max;
        }
    }

    return 0;
}



// 前向传播:数据拼接
int Concat::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
    // 输入的blobs维度
    int dims = bottom_blobs[0].dims;
    // 每个blob的元素大小
    size_t elemsize = bottom_blobs[0].elemsize;

    // 如果是一维的
    if (dims == 1) // axis == 0
        // concat vector
        // total length
        int top_w = 0;
        // 计算拼接后向量大小
        for (size_t b=0; b<bottom_blobs.size(); b++)
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;

        // 创建输出的Mat
        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // 进行concat操作:将输入的数据memcopy到输出
        float* outptr = top_blob;
        for (size_t b=0; b<bottom_blobs.size(); b++)
            const Mat& bottom_blob = bottom_blobs[b];

            int w = bottom_blob.w;

            const float* ptr = bottom_blob;
            memcpy(outptr, ptr, w * elemsize);

            outptr += w;

        return 0;

    // 如果输入blob为二维,设置concat的轴向为height方向
    // 保持width不变,对所有mat进行拼接
    if (dims == 2 && axis == 0)
        // concat image
        int w = bottom_blobs[0].w;

        // total height
        // 计算拼接后height长度
        int top_h = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;

        // 输出Mat
        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        // 使用memcpy的方式将输入blobs数据拷贝到对应concat之后mat中
        float* outptr = top_blob;
        for (size_t b=0; b<bottom_blobs.size(); b++)
            const Mat& bottom_blob = bottom_blobs[b];

            int size = w * bottom_blob.h;

            const float* ptr = bottom_blob;
            memcpy(outptr, ptr, size * elemsize);

            outptr += size;

        return 0;

    // 如果从y轴方向进行concat:和上面类似,沿着width方向展开
    if (dims == 2 && axis == 1)
        // interleave image row
        int h = bottom_blobs[0].h;

        // total width
        int top_w = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i=0; i<h; i++)
            float* outptr = top_blob.row(i);
            for (size_t b=0; b<bottom_blobs.size(); b++)
                const Mat& bottom_blob = bottom_blobs[b];

                const float* ptr = bottom_blob.row(i);
                memcpy(outptr, ptr, bottom_blob.w * elemsize);

                outptr += bottom_blob.w;

        return 0;

    // 如果维度为3,沿着channel方向进行拼接
    if (dims == 3 && axis == 0)
        // concat dim
        int w = bottom_blobs[0].w;
        int h = bottom_blobs[0].h;

        // total channels
        // 计算拼接后总的channel数目
        int top_channels = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
            const Mat& bottom_blob = bottom_blobs[b];
            top_channels += bottom_blob.c;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, h, top_channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        int q = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
            const Mat& bottom_blob = bottom_blobs[b];

            int channels = bottom_blob.c;
            int size = bottom_blob.cstep * channels;

            const float* ptr = bottom_blob;
            float* outptr =;
            memcpy(outptr, ptr, size * elemsize);

            q += channels;

        return 0;

    // 沿着height方向展开
    if (dims == 3 && axis == 1)
        // interleave dim height
        int w = bottom_blobs[0].w;
        int channels = bottom_blobs[0].c;

        // total height
        // 计算拼接后总的height长度
        int top_h = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
            const Mat& bottom_blob = bottom_blobs[b];
            top_h += bottom_blob.h;

        Mat& top_blob = top_blobs[0];
        top_blob.create(w, top_h, channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
            float* outptr =;

            for (size_t b=0; b<bottom_blobs.size(); b++)
                const Mat& bottom_blob = bottom_blobs[b];

                int size = bottom_blob.w * bottom_blob.h;

                const float* ptr =;
                memcpy(outptr, ptr, size * elemsize);

                outptr += size;

        return 0;

    // 沿着width方向展开
    if (dims == 3 && axis == 2)
        // interleave dim width
        int h = bottom_blobs[0].h;
        int channels = bottom_blobs[0].c;

        // total width
        // 计算拼接后总的width
        int top_w = 0;
        for (size_t b=0; b<bottom_blobs.size(); b++)
            const Mat& bottom_blob = bottom_blobs[b];
            top_w += bottom_blob.w;

        Mat& top_blob = top_blobs[0];
        top_blob.create(top_w, h, channels, elemsize, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
            float* outptr =;

            for (int i=0; i<h; i++)
                for (size_t b=0; b<bottom_blobs.size(); b++)
                    const Mat& bottom_blob = bottom_blobs[b];

                    const float* ptr =;
                    memcpy(outptr, ptr, bottom_blob.w * elemsize);

                    outptr += bottom_blob.w;

        return 0;

    return 0;

9. 目标检测输出:DetectionOutput


/**
 * Quicksort of scores[left..right] (both bounds inclusive) into descending
 * order, in place. Every swap is mirrored on `datas`, so datas[k] stays
 * paired with scores[k].
 *
 * @param datas   payload items, permuted together with the scores
 * @param scores  sort keys; on return the range is in non-increasing order
 * @param left    first index of the range to sort
 * @param right   last index of the range to sort
 *
 * A range with fewer than two elements (left >= right) is returned untouched,
 * so an empty range never reads the pivot out of bounds.
 */
template <typename T>
static void qsort_descent_inplace(std::vector<T>& datas, std::vector<float>& scores, int left, int right)
{
    if (left >= right)
        return;

    int i = left;
    int j = right;
    // Pivot: the middle element of the range.
    const float p = scores[(left + right) / 2];

    while (i <= j)
    {
        // Skip elements already on the correct side of the pivot.
        while (scores[i] > p)
            i++;

        while (scores[j] < p)
            j--;

        if (i <= j)
        {
            // swap
            std::swap(datas[i], datas[j]);
            std::swap(scores[i], scores[j]);

            i++;
            j--;
        }
    }

    if (left < j)
        qsort_descent_inplace(datas, scores, left, j);

    if (i < right)
        qsort_descent_inplace(datas, scores, i, right);
}

/**
 * Convenience overload: sorts the whole of `scores` (and `datas` with it) in
 * descending order. Does nothing when either vector is empty.
 */
template <typename T>
static void qsort_descent_inplace(std::vector<T>& datas, std::vector<float>& scores)
{
    if (datas.empty() || scores.empty())
        return;

    qsort_descent_inplace(datas, scores, 0, static_cast<int>(scores.size()) - 1);
}


static void nms_sorted_bboxes(const std::vector<BBoxRect>& bboxes, std::vector<int>& picked, float nms_threshold)

    const int n = bboxes.size();

    std::vector<float> areas(n);
    for (int i = 0; i < n; i++)
        const BBoxRect& r = bboxes[i];

        float width = r.xmax - r.xmin;
        float height = r.ymax - r.ymin;

        areas[i] = width * height;

    for (int i = 0; i < n; i++)
        const BBoxRect& a = bboxes[i];

        int keep = 1;
        for (int j = 0; j < (int)picked.size(); j++)
            const BBoxRect& b = bboxes[picked[j]];

            // intersection over union
            float inter_area = intersection_area(a, b);
            float union_area = areas[i] + areas[picked[j]] - inter_area;
//             float IoU = inter_area / union_area
            if (inter_area / union_area > nms_threshold)
                keep = 0;

        if (keep)


int DetectionOutput::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
    const Mat& location = bottom_blobs[0];
    const Mat& confidence = bottom_blobs[1];
    const Mat& priorbox = bottom_blobs[2];

    bool mxnet_ssd_style = num_class == -233;

    // mxnet-ssd _contrib_MultiBoxDetection
    const int num_prior = mxnet_ssd_style ? priorbox.h : priorbox.w / 4;

    int num_class_copy = mxnet_ssd_style ? confidence.h : num_class;

    // apply location with priorbox
    Mat bboxes;
    bboxes.create(4, num_prior, 4u, opt.workspace_allocator);
    if (bboxes.empty())
        return -100;

    const float* location_ptr = location;
    const float* priorbox_ptr = priorbox.row(0);
    const float* variance_ptr = mxnet_ssd_style ? 0 : priorbox.row(1);

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 0; i < num_prior; i++)
        const float* loc = location_ptr + i * 4;
        const float* pb = priorbox_ptr + i * 4;
        const float* var = variance_ptr ? variance_ptr + i * 4 : variances;

        float* bbox = bboxes.row(i);

        // CENTER_SIZE
        float pb_w = pb[2] - pb[0];
        float pb_h = pb[3] - pb[1];
        float pb_cx = (pb[0] + pb[2]) * 0.5f;
        float pb_cy = (pb[1] + pb[3]) * 0.5f;

        float bbox_cx = var[0] * loc[0] * pb_w + pb_cx;
        float bbox_cy = var[1] * loc[1] * pb_h + pb_cy;
        float bbox_w = exp(var[2] * loc[2]) * pb_w;
        float bbox_h = exp(var[3] * loc[3]) * pb_h;

        bbox[0] = bbox_cx - bbox_w * 0.5f;
        bbox[1] = bbox_cy - bbox_h * 0.5f;
        bbox[2] = bbox_cx + bbox_w * 0.5f;
        bbox[3] = bbox_cy + bbox_h * 0.5f;

    // sort and nms for each class
    std::vector< std::vector<BBoxRect> > all_class_bbox_rects;
    std::vector< std::vector<float> > all_class_bbox_scores;

    // start from 1 to ignore background class
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int i = 1; i < num_class_copy; i++)
        // filter by confidence_threshold
        std::vector<BBoxRect> class_bbox_rects;
        std::vector<float> class_bbox_scores;

        for (int j = 0; j < num_prior; j++)
            // prob data layout
            // caffe-ssd = num_class x num_prior
            // mxnet-ssd = num_prior x num_class
            float score = mxnet_ssd_style ? confidence[i * num_prior + j] : confidence[j * num_class_copy + i];

            if (score > confidence_threshold)
                const float* bbox = bboxes.row(j);
                BBoxRect c = { bbox[0], bbox[1], bbox[2], bbox[3], i };

        // sort inplace
        qsort_descent_inplace(class_bbox_rects, class_bbox_scores);

        // keep nms_top_k
        if (nms_top_k < (int)class_bbox_rects.size())

        // apply nms
        std::vector<int> picked;
        nms_sorted_bboxes(class_bbox_rects, picked, nms_threshold);

        // select
        for (int j = 0; j < (int)picked.size(); j++)
            int z = picked[j];

    // gather all class
    std::vector<BBoxRect> bbox_rects;
    std::vector<float> bbox_scores;

    for (int i = 1; i < num_class_copy; i++)
        const std::vector<BBoxRect>& class_bbox_rects = all_class_bbox_rects[i];
        const std::vector<float>& class_bbox_scores = all_class_bbox_scores[i];

        bbox_rects.insert(bbox_rects.end(), class_bbox_rects.begin(), class_bbox_rects.end());
        bbox_scores.insert(bbox_scores.end(), class_bbox_scores.begin(), class_bbox_scores.end());

    // global sort inplace
    qsort_descent_inplace(bbox_rects, bbox_scores);

    // keep_top_k
    if (keep_top_k < (int)bbox_rects.size())

    // fill result
    int num_detected = bbox_rects.size();
    if (num_detected == 0)
        return 0;

    Mat& top_blob = top_blobs[0];
    top_blob.create(6, num_detected, 4u, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    for (int i = 0; i < num_detected; i++)
        const BBoxRect& r = bbox_rects[i];
        float score = bbox_scores[i];
        float* outptr = top_blob.row(i);

        outptr[0] = r.label;
        outptr[1] = score;
        outptr[2] = r.xmin;
        outptr[3] = r.ymin;
        outptr[4] = r.xmax;
        outptr[5] = r.ymax;

    return 0;



/**
 * In-place forward pass of the Dropout layer: multiplies every element by
 * `scale`. A scale of exactly 1 returns early without touching the blob.
 *
 * @param bottom_top_blob  blob that is read and overwritten in place
 * @param opt              run options; opt.num_threads sets the OpenMP thread count
 * @return 0 (this function has no failure path)
 *
 * NOTE(review): the per-channel accessor `channel(q)` was restored; confirm
 * against the Mat API.
 */
int Dropout::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    // Multiplying by 1 would change nothing.
    if (scale == 1.f)
    {
        return 0;
    }

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            ptr[i] = ptr[i] * scale;
        }
    }

    return 0;
}



                                                  f(x)= \left\{\begin{matrix} x & if\ x > 0\\ \ \alpha *(e^{x}-1) &if\ x\leq 0 \end{matrix}\right.


/**
 * In-place forward pass of ELU: negative elements x are replaced by
 * alpha * (e^x - 1); non-negative elements are left unchanged.
 *
 * @param bottom_top_blob  blob that is read and overwritten in place
 * @param opt              run options; opt.num_threads sets the OpenMP thread count
 * @return 0 (this function has no failure path)
 *
 * NOTE(review): the per-channel accessor `channel(q)` was restored; confirm
 * against the Mat API.
 */
int ELU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        // Only the negative part is transformed.
        for (int i=0; i<size; i++)
        {
            if (ptr[i] < 0.f)
                ptr[i] = alpha * (exp(ptr[i]) - 1.f);
        }
    }

    return 0;
}



                                                                 f(x)= \left\{\begin{matrix} e^{shift+scale*x} & \text{if } base=-1 \\ base^{shift+scale*x} & \text{otherwise} \end{matrix}\right.


/**
 * In-place forward pass of the Exp layer:
 *
 *     base == -1 : f(x) = e^(shift + scale * x)
 *     otherwise  : f(x) = base^(shift + scale * x)
 *
 * @param bottom_top_blob  blob that is read and overwritten in place
 * @param opt              run options; opt.num_threads sets the OpenMP thread count
 * @return 0 (this function has no failure path)
 *
 * NOTE(review): the `else` separating the two loops and the per-channel
 * accessor `channel(q)` were restored; confirm against the Mat API.
 */
int Exp::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    if (base == -1.f)
    {
        // base of -1 selects the natural exponential.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                ptr[i] = exp(shift + ptr[i] * scale);
            }
        }
    }
    else
    {
        // Any other base: general power function.
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            float* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                ptr[i] = pow(base, (shift + ptr[i] * scale));
            }
        }
    }

    return 0;
}



// 增加维度
int ExpandDims::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int dims = bottom_blob.dims;

    top_blob = bottom_blob;

    // 输入为一维
    if (dims == 1)
        // 沿着width方向增加维度
        if (expand_w)
            // 沿着height方向增加维度
            if (expand_h)
                top_blob = bottom_blob.reshape(1, 1, w, opt.blob_allocator);
            // 沿着channel方向增加维度
            else if (expand_c)
                top_blob = bottom_blob.reshape(1, w, 1, opt.blob_allocator);
            // 只在width方向增加维度
                top_blob = bottom_blob.reshape(1, w, opt.blob_allocator);
        // 沿着height方向增加维度
        else if (expand_h)
            // 沿着channel方向增加维度
            if (expand_c)
                top_blob = bottom_blob.reshape(w, 1, 1, opt.blob_allocator);
            // 只沿着height方向增加维度
                top_blob = bottom_blob.reshape(w, 1, opt.blob_allocator);
    // 输入维度为二
    else if (dims == 2)
        // 沿着width方向增加维度
        if (expand_w)
            top_blob = bottom_blob.reshape(1, w, h, opt.blob_allocator);
        // 沿着height方向增加维度
        else if (expand_h)
            top_blob = bottom_blob.reshape(w, 1, h, opt.blob_allocator);
        // 沿着channel方向增加维度
        else if (expand_c)
            top_blob = bottom_blob.reshape(w, h, 1, opt.blob_allocator);

    if (top_blob.empty())
        return -100;

    return 0;



int Flatten::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int size = w * h;

    // 输出blob长度
    top_blob.create(size * channels, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
        const float* ptr =;
        float* outptr = (float*)top_blob + size * q;

        // 将输入按照channel展开
        for (int i=0; i<size; i++)
            outptr[i] = ptr[i];

    return 0;



                                                      f(x)=\left\{\begin{matrix} 0 & x<-2.5\\ 0.2*x+0.5 & -2.5\leq x\leq 2.5\\ 1 & x>2.5 \end{matrix}\right.


/**
 * In-place forward pass of HardSigmoid, a piecewise-linear function:
 *
 *     x < lower          -> 0
 *     x > upper          -> 1
 *     lower <= x <= upper -> alpha * x + beta
 *
 * @param bottom_top_blob  blob that is read and overwritten in place
 * @param opt              run options; opt.num_threads sets the OpenMP thread count
 * @return 0 (this function has no failure path)
 *
 * NOTE(review): the final `else` and the per-channel accessor `channel(q)`
 * were restored; confirm against the Mat API. The "0.2*x + 0.5" remark in
 * the source suggests alpha = 0.2 and beta = 0.5 as typical values — the
 * actual values are set elsewhere.
 */
int HardSigmoid::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        for (int i=0; i<size; i++)
        {
            if (ptr[i] < lower)
                ptr[i] = 0.f;
            else if (ptr[i] > upper)
                ptr[i] = 1.f;
            else
                ptr[i] = ptr[i] * alpha + beta;
        }
    }

    return 0;
}







