参考文献依然是放前面:https://blog.csdn.net/caicaiatnbu/category_9096319.html
darknet版本: https://github.com/AlexeyAB/darknet,与原始的版本还是有一点区别的。
因为第一次读源码,我就直接按照参考文献的顺序来了,到时候再查漏补缺,加油!
今天看的是:activations,主要完成三件事:激活函数的前向计算、激活函数的求导,以及误差的反向传播。
直接放代码注解:比较简单
#include "activations.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <float.h>
/**
 * Map an ACTIVATION enum value to its configuration-file name.
 *
 * The names match the strings accepted by get_activation(), so every type
 * that get_activation() can parse converts back to the same string.
 *
 * @param a  activation type
 * @return   pointer to a string literal (do not modify or free it).
 *           Any value without a dedicated case falls back to "relu".
 */
char *get_activation_string(ACTIVATION a)
{
    switch (a) {
    case LOGISTIC:                 return "logistic";
    case SWISH:                    return "swish";
    case MISH:                     return "mish";
    case NORM_CHAN:                return "normalize_channels";
    case NORM_CHAN_SOFTMAX:        return "normalize_channels_softmax";
    case NORM_CHAN_SOFTMAX_MAXVAL: return "normalize_channels_softmax_maxval";
    case LOGGY:                    return "loggy";
    case RELU:                     return "relu";
    case RELU6:                    return "relu6";
    case ELU:                      return "elu";
    case SELU:                     return "selu";
    case GELU:                     return "gelu";
    case RELIE:                    return "relie";
    case RAMP:                     return "ramp";
    case LINEAR:                   return "linear";
    case TANH:                     return "tanh";
    case PLSE:                     return "plse";
    case LEAKY:                    return "leaky";
    case STAIR:                    return "stair";
    case HARDTAN:                  return "hardtan";
    case LHTAN:                    return "lhtan";
    default:
        break;
    }
    /* Unknown or unnamed value: keep the historical fallback. */
    return "relu";
}
/**
 * Parse an activation name (as written in a configuration file) into its
 * ACTIVATION enum value.
 *
 * @param s  NUL-terminated activation name; must not be NULL
 * @return   the matching ACTIVATION value. If the name is not recognised a
 *           warning is written to stderr and RELU is returned.
 */
ACTIVATION get_activation(char *s)
{
    /* Name -> enum lookup table; all names are distinct, so order is irrelevant. */
    static const struct {
        const char *name;
        ACTIVATION  type;
    } known[] = {
        { "logistic",                          LOGISTIC },
        { "swish",                             SWISH },
        { "mish",                              MISH },
        { "normalize_channels",                NORM_CHAN },
        { "normalize_channels_softmax",        NORM_CHAN_SOFTMAX },
        { "normalize_channels_softmax_maxval", NORM_CHAN_SOFTMAX_MAXVAL },
        { "loggy",                             LOGGY },
        { "relu",                              RELU },
        { "relu6",                             RELU6 },
        { "elu",                               ELU },
        { "selu",                              SELU },
        { "gelu",                              GELU },
        { "relie",                             RELIE },
        { "plse",                              PLSE },
        { "hardtan",                           HARDTAN },
        { "lhtan",                             LHTAN },
        { "linear",                            LINEAR },
        { "ramp",                              RAMP },
        { "leaky",                             LEAKY },
        { "tanh",                              TANH },
        { "stair",                             STAIR },
    };
    const size_t count = sizeof known / sizeof known[0];
    size_t idx;

    for (idx = 0; idx < count; ++idx) {
        if (strcmp(s, known[idx].name) == 0) {
            return known[idx].type;
        }
    }

    fprintf(stderr, "Couldn't find activation function %s, going with ReLU\n", s);
    return RELU;
}
/**
 * Apply the activation function selected by `a` to a single value.
 *
 * @param x  input value
 * @param a  activation type
 * @return   the activated value. Types that have no case below (every
 *           ACTIVATION value not listed in the switch) yield 0.
 */
float activate(float x, ACTIVATION a)
{
    float y = 0;   /* result for types without a per-element function here */

    switch (a) {
    case LINEAR:   y = linear_activate(x);   break;
    case LOGISTIC: y = logistic_activate(x); break;
    case LOGGY:    y = loggy_activate(x);    break;
    case RELU:     y = relu_activate(x);     break;
    case ELU:      y = elu_activate(x);      break;
    case SELU:     y = selu_activate(x);     break;
    case GELU:     y = gelu_activate(x);     break;
    case RELIE:    y = relie_activate(x);    break;
    case RAMP:     y = ramp_activate(x);     break;
    case LEAKY:    y = leaky_activate(x);    break;
    case TANH:     y = tanh_activate(x);     break;
    case PLSE:     y = plse_activate(x);     break;
    case STAIR:    y = stair_activate(x);    break;
    case HARDTAN:  y = hardtan_activate(x);  break;
    case LHTAN:    y = lhtan_activate(x);    break;
    default:
        break;
    }
    return y;
}
/**
 * Apply an activation function to every element of an array, in place.
 *
 * @param x  array of values to activate; each element is overwritten with
 *           its activated value
 * @param n  number of elements in x
 * @param a  activation type
 *
 * LINEAR is the identity, so the array is left untouched. LEAKY and LOGISTIC
 * have dedicated loops that call their per-element function directly and are
 * parallelised with OpenMP; every other type goes through activate() in a
 * plain serial loop.
 */
void activate_array(float *x, const int n, const ACTIVATION a)
{
    int idx;

    switch (a) {
    case LINEAR:
        /* identity: nothing to do */
        break;

    case LEAKY: {
        #pragma omp parallel for
        for (idx = 0; idx < n; ++idx) {
            x[idx] = leaky_activate(x[idx]);
        }
        break;
    }

    case LOGISTIC: {
        #pragma omp parallel for
        for (idx = 0; idx < n; ++idx) {
            x[idx] = logistic_activate(x[idx]);
        }
        break;
    }

    default: {
        /* generic path: dispatch per element */
        for (idx = 0; idx < n; ++idx) {
            x[idx] = activate(x[idx], a);
        }
        break;
    }
    }
}
/**
 * Swish activation over an array: swish(x) = x * sigmoid(x).
 *
 * @param x               input values (n elements)
 * @param n               number of elements
 * @param output_sigmoid  receives sigmoid(x[i]) for every element; the value
 *                        is stored so it is available afterwards (see
 *                        gradient_array_swish(), which takes such an array)
 * @param output          receives x[i] * sigmoid(x[i])
 *
 * Each input element is read before either output is written.
 * The loop is parallelised with OpenMP.
 */
void activate_array_swish(float *x, const int n, float * output_sigmoid, float * output)
{
    int idx;
    #pragma omp parallel for
    for (idx = 0; idx < n; ++idx) {
        const float in = x[idx];
        const float gate = logistic_activate(in);   /* sigmoid(x) */

        output_sigmoid[idx] = gate;
        output[idx] = in * gate;                    /* x * sigmoid(x) */
    }
}
/**
 * Mish activation over an array: mish(x) = x * tanh(ln(1 + e^x)).
 * Reference: https://github.com/digantamisra98/Mish
 *
 * @param x                 input values (n elements)
 * @param n                 number of elements
 * @param activation_input  receives a copy of each input value as it was
 *                          before activation
 * @param output            receives mish(x[i])
 *
 * softplus_activate() is defined outside this file; for reference, the
 * original notes quote its definition as:
 *
 *   static inline float softplus_activate(float x, float threshold) {
 *       if (x > threshold) return x;               // too large
 *       else if (x < -threshold) return expf(x);   // too small
 *       return logf(expf(x) + 1);
 *   }
 *
 * The loop is parallelised with OpenMP.
 */
void activate_array_mish(float *x, const int n, float * activation_input, float * output)
{
    const float MISH_THRESHOLD = 20;   /* threshold passed to softplus_activate() */
    int idx;
    #pragma omp parallel for
    for (idx = 0; idx < n; ++idx) {
        const float in = x[idx];

        /* store the value before activation */
        activation_input[idx] = in;
        output[idx] = in * tanh_activate(softplus_activate(in, MISH_THRESHOLD));
    }
}
/**
 * Channel-wise normalisation of positive values.
 *
 * For every (batch item, spatial position) pair, the positive values across
 * all channels are summed (plus a small epsilon), and each positive value is
 * divided by that sum. Values that are not positive are written out as 0.
 *
 * Element (b, k, p) is stored at index  p + k*wh_step + b*wh_step*channels.
 *
 * Typical call, quoted from the original notes:
 *   activate_array_normalize_channels(l.output, l.outputs*l.batch, l.batch,
 *                                     l.out_c, l.out_w*l.out_h, l.output);
 *
 * @param x         input values (n elements)
 * @param n         total number of elements (batch * channels * wh_step)
 * @param batch     unused; the number of positions is derived from n / channels
 * @param channels  number of channels
 * @param wh_step   number of spatial positions per channel (stride between
 *                  consecutive channels)
 * @param output    receives the normalised values; may be the same buffer as
 *                  x, because each element is read before it is overwritten
 *
 * Non-positive `channels` or `wh_step` would divide by zero, so such calls
 * return without touching `output`.
 */
void activate_array_normalize_channels(float *x, const int n, int batch, int channels, int wh_step, float *output)
{
    int size;
    int i;

    (void)batch;
    if (channels <= 0 || wh_step <= 0) return;

    size = n / channels;   /* number of (batch item, position) pairs */

    #pragma omp parallel for
    for (i = 0; i < size; ++i) {
        const float eps = 0.0001f;             /* keeps the divisor non-zero */
        const int wh_i = i % wh_step;          /* spatial position */
        const int b = i / wh_step;             /* batch item */
        const int base = wh_i + b * wh_step * channels;
        float sum = eps;
        int k;

        /* pass 1: sum of the positive values across channels */
        for (k = 0; k < channels; ++k) {
            const float val = x[base + k * wh_step];
            if (val > 0) sum += val;
        }

        /* pass 2: scale positive values by the sum, zero everything else */
        for (k = 0; k < channels; ++k) {
            const int index = base + k * wh_step;
            const float val = x[index];
            output[index] = (val > 0) ? (val / sum) : 0;
        }
    }
}
// softmax归一化,每个batch做一次
// activate_array_normalize_channels_softmax(l.output, l.outputs*l.batch, l.batch, l.out_c, l.out_w*l.out_h, l.output, 0);
void activate_array_normalize_channels_softmax(float *x, const int n, int batch, int channels, int wh_step, float *output, int use_max_val)
{
int size = n / channels;//n=l.out_c*l.out_w*l.out_h*l.batch
//size=l.out_w*l.out_h*l.batch
int i;
#pragma omp parallel for //并行加速
for (i = 0; i < size; ++i) {
//wh_step=l.out_w*l.out_h
int wh_i = i % wh_step;//wh_i~[0,l.out_w*l.out_h-1]
int b = i / wh_step;//b~[0,l.batch]
const float eps = 0.0001;//防止被除数为0出错
if (i < size) {
float sum = eps;
float max_val = -FLT_MAX;
int k;
//如果使用最大值
if (use_max_val) {
for (k = 0; k < channels; ++k) {
float val = x[wh_i + k * wh_step + b*wh_step*channels];
if (val > max_val || k == 0) max_val = val;//找到最大特征值作为max_val
}
}
else
max_val = 0;
//进行归一化
for (k = 0; k < channels; ++k) {
float val = x[wh_i + k * wh_step + b*wh_step*channels];
sum += expf(val - max_val);//sum=sum+exp(x-max_val)
}
for (k = 0; k < channels; ++k) {
float val = x[wh_i + k * wh_step + b*wh_step*channels];
val = expf(val - max_val) / sum;//val=exp(x-max_val)/sum
output[wh_i + k * wh_step + b*wh_step*channels] = val;
}
}
}
}
/**
 * Backward step paired with activate_array_normalize_channels_softmax().
 *
 * For every (batch item, spatial position) pair this computes
 *   grad = sum over channels of x * delta
 * and then multiplies every channel's delta at that position by grad.
 *
 * Element (b, k, p) is stored at index  p + k*wh_step + b*wh_step*channels.
 *
 * @param x         values read alongside delta (n elements); per the original
 *                  notes these are the layer's outputs
 * @param n         total number of elements (batch * channels * wh_step)
 * @param batch     unused; the number of positions is derived from n / channels
 * @param channels  number of channels
 * @param wh_step   number of spatial positions per channel
 * @param delta     error terms, updated in place
 *
 * Non-positive `channels` or `wh_step` would divide by zero, so such calls
 * return without touching `delta`.
 */
void gradient_array_normalize_channels_softmax(float *x, const int n, int batch, int channels, int wh_step, float *delta)
{
    int size;
    int i;

    (void)batch;
    if (channels <= 0 || wh_step <= 0) return;

    size = n / channels;   /* number of (batch item, position) pairs */

    #pragma omp parallel for
    for (i = 0; i < size; ++i) {
        const int wh_i = i % wh_step;          /* spatial position */
        const int b = i / wh_step;             /* batch item */
        const int base = wh_i + b * wh_step * channels;
        float grad = 0;
        int k;

        /* accumulate x * delta across channels */
        for (k = 0; k < channels; ++k) {
            const int index = base + k * wh_step;
            grad += x[index] * delta[index];
        }

        /* rescale every channel's delta by the accumulated value */
        for (k = 0; k < channels; ++k) {
            const int index = base + k * wh_step;
            delta[index] = delta[index] * grad;
        }
    }
}
/**
 * Backward step paired with activate_array_normalize_channels().
 *
 * For every (batch item, spatial position) pair this computes
 *   grad = sum over channels of x * delta
 * and then multiplies delta by grad, but only for channels whose x value is
 * positive. That condition is the difference from
 * gradient_array_normalize_channels_softmax(), which rescales every channel.
 *
 * Element (b, k, p) is stored at index  p + k*wh_step + b*wh_step*channels.
 *
 * Typical call, quoted from the original notes:
 *   gradient_array_normalize_channels(l.output, l.outputs*l.batch, l.batch,
 *                                     l.out_c, l.out_w*l.out_h, l.delta);
 *
 * @param x         values read alongside delta (n elements)
 * @param n         total number of elements (batch * channels * wh_step)
 * @param batch     unused; the number of positions is derived from n / channels
 * @param channels  number of channels
 * @param wh_step   number of spatial positions per channel
 * @param delta     error terms, updated in place where x > 0
 *
 * Non-positive `channels` or `wh_step` would divide by zero, so such calls
 * return without touching `delta`.
 */
void gradient_array_normalize_channels(float *x, const int n, int batch, int channels, int wh_step, float *delta)
{
    int size;
    int i;

    (void)batch;
    if (channels <= 0 || wh_step <= 0) return;

    size = n / channels;   /* number of (batch item, position) pairs */

    #pragma omp parallel for
    for (i = 0; i < size; ++i) {
        const int wh_i = i % wh_step;          /* spatial position */
        const int b = i / wh_step;             /* batch item */
        const int base = wh_i + b * wh_step * channels;
        float grad = 0;
        int k;

        /* accumulate x * delta across channels */
        for (k = 0; k < channels; ++k) {
            const int index = base + k * wh_step;
            grad += x[index] * delta[index];
        }

        /* only positions that were positive get rescaled */
        for (k = 0; k < channels; ++k) {
            const int index = base + k * wh_step;
            if (x[index] > 0) {
                delta[index] = delta[index] * grad;
            }
        }
    }
}
/**
 * Evaluate the derivative of the activation function selected by `a`.
 *
 * @param x  value passed to the per-type *_gradient() helper
 * @param a  activation type
 * @return   the derivative value. Types that have no case below yield 0.
 *
 * NORM_CHAN, NORM_CHAN_SOFTMAX and NORM_CHAN_SOFTMAX_MAXVAL have no
 * per-element derivative: they need the dedicated array-level gradient
 * functions. Reaching this function with one of them is a fatal error:
 * a message is written to stderr and the process exits with EXIT_FAILURE.
 */
float gradient(float x, ACTIVATION a)
{
    switch (a) {
    case LINEAR:
        return linear_gradient(x);
    case LOGISTIC:
        return logistic_gradient(x);
    case LOGGY:
        return loggy_gradient(x);
    case RELU:
        return relu_gradient(x);
    case RELU6:
        return relu6_gradient(x);
    case NORM_CHAN:
        /* fallthrough */
    case NORM_CHAN_SOFTMAX_MAXVAL:
        /* fallthrough */
    case NORM_CHAN_SOFTMAX:
        fprintf(stderr, " Error: should be used custom NORM_CHAN or NORM_CHAN_SOFTMAX-function for gradient \n");
        exit(EXIT_FAILURE);   /* an error must not report success to the caller's shell */
    case ELU:
        return elu_gradient(x);
    case SELU:
        return selu_gradient(x);
    case GELU:
        return gelu_gradient(x);
    case RELIE:
        return relie_gradient(x);
    case RAMP:
        return ramp_gradient(x);
    case LEAKY:
        return leaky_gradient(x);
    case TANH:
        return tanh_gradient(x);
    case PLSE:
        return plse_gradient(x);
    case STAIR:
        return stair_gradient(x);
    case HARDTAN:
        return hardtan_gradient(x);
    case LHTAN:
        return lhtan_gradient(x);
    default:
        break;
    }
    return 0;
}
/**
 * Multiply each error term by the activation derivative, in place:
 *   delta[i] *= gradient(x[i], a)
 *
 * @param x      values handed to gradient() (n elements). Per the original
 *               notes these are the layer's outputs, with dimension
 *               l.batch * l.out_c * l.out_w * l.out_h.
 * @param n      number of elements in x and delta
 * @param a      activation type
 * @param delta  error terms, same length as x; updated in place
 *
 * Background from the original notes (not verifiable from this file alone):
 *  - The derivative is evaluated from the output value because, for most
 *    activation functions used in neural networks, the derivative can be
 *    written as a function of the output. For the sigmoid f,
 *    f'(x) = f(x) * (1 - f(x)), so given y = f(x) the derivative is
 *    y * (1 - y) and the input x is not needed.
 *  - delta is expected to have been filled in by the following layer before
 *    this call; the last layer of the network (e.g. a cost or region layer)
 *    seeds it, and it is propagated backwards layer by layer.
 *
 * The loop is parallelised with OpenMP.
 */
void gradient_array(const float *x, const int n, const ACTIVATION a, float *delta)
{
    int idx;
    #pragma omp parallel for
    for (idx = 0; idx < n; ++idx) {
        const float slope = gradient(x[idx], a);
        delta[idx] = delta[idx] * slope;
    }
}
/**
 * Backward step for the swish activation: scales each error term by
 *   swish + sigmoid * (1 - swish)
 * Reference implementation:
 * https://github.com/BVLC/caffe/blob/04ab089db018a292ae48d51732dd6c66766b36b6/src/caffe/layers/swish_layer.cpp#L54-L56
 *
 * @param x        swish outputs, i.e. x * sigmoid(x) of the forward input
 *                 (n elements)
 * @param n        number of elements
 * @param sigmoid  sigmoid values of the forward input (n elements)
 * @param delta    error terms, updated in place
 *
 * The loop is parallelised with OpenMP.
 */
void gradient_array_swish(const float *x, const int n, const float * sigmoid, float * delta)
{
    int idx;
    #pragma omp parallel for
    for (idx = 0; idx < n; ++idx) {
        const float y = x[idx];                            /* swish output */
        const float slope = y + sigmoid[idx] * (1 - y);    /* d swish / d input */

        delta[idx] = delta[idx] * slope;
    }
}
/**
 * Backward step for the mish activation (https://github.com/digantamisra98/Mish).
 *
 * Unlike the other gradient functions in this file, this one works from the
 * values as they were BEFORE activation (see the `activation_input` buffer
 * filled by activate_array_mish()), not from the layer outputs.
 *
 * With sp = softplus(inp) and tsp = tanh(sp), each error term is scaled by
 *   inp * (1 - tsp^2) * (1 - exp(-sp)) + tsp
 *
 * The formulation follows:
 *   TensorFlow: https://github.com/tensorflow/addons/commit/093cdfa85d334cbe19a37624c33198f3140109ed
 *   PyTorch:    https://github.com/thomasbrandon/mish-cuda/blob/master/csrc/mish.h#L26-L31
 *
 * The original notes also record an alternative closed form:
 *   d = 2*e^x + e^(2x) + 2
 *   w = 4*(x + 1) + 4*e^(2x) + e^(3x) + e^x * (4x + 6)
 *   derivative = e^x * w / d^2
 *
 * @param n                 number of elements
 * @param activation_input  pre-activation values (n elements)
 * @param delta             error terms, updated in place
 *
 * The loop is parallelised with OpenMP.
 */
void gradient_array_mish(const int n, const float * activation_input, float * delta)
{
    const float MISH_THRESHOLD = 20.0f;   /* threshold passed to softplus_activate() */
    int idx;
    #pragma omp parallel for
    for (idx = 0; idx < n; ++idx) {
        const float inp = activation_input[idx];
        const float sp = softplus_activate(inp, MISH_THRESHOLD);
        const float grad_sp = 1 - exp(-sp);               /* d softplus / d inp */
        const float tsp = tanh(sp);
        const float grad_tsp = (1 - tsp*tsp) * grad_sp;   /* d tanh(softplus) / d inp */
        const float slope = inp * grad_tsp + tsp;         /* product rule on inp * tsp */

        delta[idx] = delta[idx] * slope;
    }
}
简单撒花~