Feed-Forward Neural Network
FFN
Constructor
The main constructor, as declared in the header:
/**
* Implementation of a standard feed forward network.
*
* @tparam OutputLayerType The output layer type used to evaluate the network.
* @tparam InitializationRuleType Rule used to initialize the weight matrix.
* @tparam CustomLayers Any set of custom layers that could be a part of the
* feed forward network.
*/
template<
typename OutputLayerType = NegativeLogLikelihood<>,
typename InitializationRuleType = RandomInitialization,
typename... CustomLayers
>
class FFN
{
public:
//! Convenience typedef for the internal model construction.
using NetworkType = FFN<OutputLayerType, InitializationRuleType>;
/**
* Create the FFN object.
*
* Optionally, specify which initialize rule and performance function should
* be used.
*
* If you want to pass in a parameter and discard the original parameter
* object, be sure to use std::move to avoid unnecessary copy.
*
* @param outputLayer Output layer used to evaluate the network.
* @param initializeRule Optional instantiated InitializationRule object
* for initializing the network parameter.
*/
FFN(OutputLayerType outputLayer = OutputLayerType(),
InitializationRuleType initializeRule = InitializationRuleType());
Implementation:
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::FFN(
OutputLayerType outputLayer, InitializationRuleType initializeRule) :
outputLayer(std::move(outputLayer)),
initializeRule(std::move(initializeRule)),
width(0),
height(0),
reset(false),
numFunctions(0),
deterministic(false)
{
/* Nothing to do here. */
}
The constructor has two main template parameters, OutputLayerType and InitializationRuleType; let's take a look at their default implementations.
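As a quick illustration, both parameters can be left at their defaults or chosen explicitly. A minimal sketch, assuming mlpack 3.x headers (ConstInitialization is just one alternative rule, picked for illustration):
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/init_rules/const_init.hpp>

using namespace mlpack::ann;

// Defaults: NegativeLogLikelihood<> output layer, RandomInitialization rule.
FFN<> net1;

// Explicit choices; the temporaries bind as rvalues, so no std::move is
// needed here (it matters when passing named objects you want to discard).
FFN<NegativeLogLikelihood<>, ConstInitialization> net2(
    NegativeLogLikelihood<>(), ConstInitialization(0.0));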
NegativeLogLikelihood header:
/**
* Implementation of the negative log likelihood layer. The negative log
* likelihood layer expects that the input contains log-probabilities for each
* class. The layer also expects a class index, in the range between 1 and the
* number of classes, as target when calling the Forward function.
*
* @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
* @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
*/
template <
typename InputDataType = arma::mat,
typename OutputDataType = arma::mat
>
class NegativeLogLikelihood
{
public:
/**
* Create the NegativeLogLikelihoodLayer object.
*/
NegativeLogLikelihood();
/**
* Computes the Negative log likelihood.
*
* @param input Input data used for evaluating the specified function.
* @param target The target vector, that contains the class index in the range
* between 1 and the number of classes.
*/
template<typename InputType, typename TargetType>
typename InputType::elem_type Forward(const InputType& input,
const TargetType& target);
/**
* Ordinary feed backward pass of a neural network. The negative log
* likelihood layer expects that the input contains log-probabilities for
* each class. The layer also expects a class index, in the range between 1
* and the number of classes, as target when calling the Forward function.
*
* @param input The propagated input activation.
* @param target The target vector, that contains the class index in the range
* between 1 and the number of classes.
* @param output The calculated error.
*/
template<typename InputType, typename TargetType, typename OutputType>
void Backward(const InputType& input,
const TargetType& target,
OutputType& output);
//! Get the input parameter.
InputDataType& InputParameter() const { return inputParameter; }
//! Modify the input parameter.
InputDataType& InputParameter() { return inputParameter; }
//! Get the output parameter.
OutputDataType& OutputParameter() const { return outputParameter; }
//! Modify the output parameter.
OutputDataType& OutputParameter() { return outputParameter; }
//! Get the delta.
OutputDataType& Delta() const { return delta; }
//! Modify the delta.
OutputDataType& Delta() { return delta; }
/**
* Serialize the layer
*/
template<typename Archive>
void serialize(Archive& /* ar */, const unsigned int /* version */);
private:
//! Locally-stored delta object.
OutputDataType delta;
//! Locally-stored input parameter object.
InputDataType inputParameter;
//! Locally-stored output parameter object.
OutputDataType outputParameter;
}; // class NegativeLogLikelihood
Implementation:
template<typename InputDataType, typename OutputDataType>
NegativeLogLikelihood<InputDataType, OutputDataType>::NegativeLogLikelihood()
{
// Nothing to do here.
}
template<typename InputDataType, typename OutputDataType>
template<typename InputType, typename TargetType>
typename InputType::elem_type
NegativeLogLikelihood<InputDataType, OutputDataType>::Forward(
const InputType& input,
const TargetType& target)
{
typedef typename InputType::elem_type ElemType;
ElemType output = 0;
for (size_t i = 0; i < input.n_cols; ++i)
{
size_t currentTarget = target(i) - 1;
Log::Assert(currentTarget < input.n_rows,
"Target class out of range.");
output -= input(currentTarget, i);
}
return output;
}
template<typename InputDataType, typename OutputDataType>
template<typename InputType, typename TargetType, typename OutputType>
void NegativeLogLikelihood<InputDataType, OutputDataType>::Backward(
const InputType& input,
const TargetType& target,
OutputType& output)
{
output = arma::zeros<OutputType>(input.n_rows, input.n_cols);
for (size_t i = 0; i < input.n_cols; ++i)
{
size_t currentTarget = target(i) - 1;
Log::Assert(currentTarget < input.n_rows,
"Target class out of range.");
output(currentTarget, i) = -1;
}
}
template<typename InputDataType, typename OutputDataType>
template<typename Archive>
void NegativeLogLikelihood<InputDataType, OutputDataType>::serialize(
Archive& /* ar */,
const unsigned int /* version */)
{
// Nothing to do here.
}
The key parts of the negative log likelihood loss are the two methods, Forward and Backward. Let us introduce some notation:
$$
\mathrm{input}: (X_1, \cdots, X_N), \quad X_i \in \mathbb{R}^n \ \ \forall\ i \in [1, N]
\;\Rightarrow\;
\begin{bmatrix}
x_{11} & x_{12} & \cdots & x_{1N} \\
\vdots & & & \vdots \\
x_{n1} & x_{n2} & \cdots & x_{nN}
\end{bmatrix}
$$

$$
\mathrm{target}: (y_1, \cdots, y_N), \quad y_i \in [1, m]
$$
Therefore:
Forward:
$$
output = -\sum_{i=1}^{N} x_{(y_i,\, i)}, \quad y_i \leqslant n
$$
Backward:
$$
(n \times N): \quad output_{(j,\, i)} =
\begin{cases}
-1, & j = y_i \ \ (y_i \leqslant n) \\
0, & \text{otherwise}
\end{cases}
$$
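To make the indexing concrete, here is a small standalone Armadillo sketch that reproduces both formulas by hand (illustrative code with made-up numbers, not the mlpack class itself):
#include <armadillo>

int main()
{
  // Three classes (rows), two samples (columns) of log-probabilities.
  arma::mat input = { { -0.2, -1.6 },
                      { -2.1, -0.3 },
                      { -1.9, -2.2 } };
  arma::rowvec target = { 1, 2 };  // 1-based class indices, as NLL expects.

  // Forward: output = -sum_i x_(y_i, i).
  double output = 0;
  for (size_t i = 0; i < input.n_cols; ++i)
  {
    const arma::uword c = static_cast<arma::uword>(target(i)) - 1;
    output -= input(c, i);  // Picks -0.2 and -0.3, so output == 0.5.
  }

  // Backward: -1 at each sample's target row, 0 elsewhere.
  arma::mat grad = arma::zeros(input.n_rows, input.n_cols);
  for (size_t i = 0; i < input.n_cols; ++i)
    grad(static_cast<arma::uword>(target(i)) - 1, i) = -1;

  grad.print("grad:");
  return 0;
}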
RandomInitialization header:
/**
* This class is used to initialize randomly the weight matrix.
*/
class RandomInitialization
{
public:
/**
* Initialize the random initialization rule with the given lower bound and
* upper bound.
*
* @param lowerBound The number used as lower bound.
* @param upperBound The number used as upper bound.
*/
RandomInitialization(const double lowerBound = -1,
const double upperBound = 1) :
lowerBound(lowerBound), upperBound(upperBound) {
}
/**
* Initialize the random initialization rule with the given bound.
* Using the negative of the bound as lower bound and the positive bound as
* upper bound.
*
* @param bound The number used as lower bound
*/
RandomInitialization(const double bound) :
lowerBound(-std::abs(bound)), upperBound(std::abs(bound)) {
}
/**
* Initialize randomly the elements of the specified weight matrix.
*
* @param W Weight matrix to initialize.
* @param rows Number of rows.
* @param cols Number of columns.
*/
template<typename eT>
void Initialize(arma::Mat<eT>& W, const size_t rows, const size_t cols)
{
if (W.is_empty())
W.set_size(rows, cols);
W.randu();
W *= (upperBound - lowerBound);
W += lowerBound;
}
/**
* Initialize randomly the elements of the specified weight matrix.
*
* @param W Weight matrix to initialize.
*/
template<typename eT>
void Initialize(arma::Mat<eT>& W)
{
if (W.is_empty())
Log::Fatal << "Cannot initialize an empty matrix." << std::endl;
W.randu();
W *= (upperBound - lowerBound);
W += lowerBound;
}
/**
* Initialize randomly the elements of the specified weight 3rd order tensor.
*
* @param W Weight matrix to initialize.
* @param rows Number of rows.
* @param cols Number of columns.
* @param slices Number of slices.
*/
template<typename eT>
void Initialize(arma::Cube<eT>& W,
const size_t rows,
const size_t cols,
const size_t slices)
{
if (W.is_empty())
W.set_size(rows, cols, slices);
for (size_t i = 0; i < slices; ++i)
Initialize(W.slice(i), rows, cols);
}
/**
* Initialize randomly the elements of the specified weight 3rd order tensor.
*
* @param W Weight matrix to initialize.
*/
template<typename eT>
void Initialize(arma::Cube<eT>& W)
{
if (W.is_empty())
Log::Fatal << "Cannot initialize an empty cube." << std::endl;
for (size_t i = 0; i < W.n_slices; ++i)
Initialize(W.slice(i));
}
private:
//! The number used as lower bound.
double lowerBound;
//! The number used as upper bound.
double upperBound;
}; // class RandomInitialization
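A short usage sketch (the bounds are arbitrary illustration values):
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/init_rules/random_init.hpp>

int main()
{
  arma::mat W;
  mlpack::ann::RandomInitialization init(-0.5, 0.5);
  init.Initialize(W, 4, 3);  // W is sized 4x3, entries i.i.d. uniform on [-0.5, 0.5].
  W.print("W:");
}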
.randu(), as described in the official Armadillo documentation:
.randu() uses a uniform distribution in the [0,1] interval
Therefore, this initialization first draws entries from $U(0, 1)$, multiplies them by $(upperBound - lowerBound)$, and adds $lowerBound$, so each weight is uniform on $[lowerBound, upperBound]$.
Hence:
$$
E(W) = \dfrac{upperBound + lowerBound}{2}, \qquad
D(W) = \dfrac{(upperBound - lowerBound)^2}{12}
$$
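These follow from the moments of the standard uniform distribution: for $U \sim U(0, 1)$, $E(U) = \frac{1}{2}$ and $D(U) = \frac{1}{12}$. Writing $a = lowerBound$, $b = upperBound$, the initializer computes $W = (b - a)U + a$, so:

$$
E(W) = (b - a)E(U) + a = \frac{a + b}{2}, \qquad
D(W) = (b - a)^2 D(U) = \frac{(b - a)^2}{12}
$$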
Train
Train header:
/**
* Train the feedforward network on the given input data using the given
* optimizer.
*
* This will use the existing model parameters as a starting point for the
* optimization. If this is not what you want, then you should access the
* parameters vector directly with Parameters() and modify it as desired.
*
* If you want to pass in a parameter and discard the original parameter
* object, be sure to use std::move to avoid unnecessary copy.
*
* @tparam OptimizerType Type of optimizer to use to train the model.
* @tparam CallbackTypes Types of Callback Functions.
* @param predictors Input training variables.
* @param responses Outputs results from input training variables.
* @param optimizer Instantiated optimizer used to train the model.
* @param callbacks Callback function for ensmallen optimizer `OptimizerType`.
* See https://www.ensmallen.org/docs.html#callback-documentation.
* @return The final objective of the trained model (NaN or Inf on error).
*/
template<typename OptimizerType, typename... CallbackTypes>
double Train(arma::mat predictors,
arma::mat responses,
OptimizerType& optimizer,
CallbackTypes&&... callbacks);
Implementation:
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
template<typename OptimizerType, typename... CallbackTypes>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Train(
arma::mat predictors,
arma::mat responses,
OptimizerType& optimizer,
CallbackTypes&&... callbacks)
{
ResetData(std::move(predictors), std::move(responses));
WarnMessageMaxIterations<OptimizerType>(optimizer, this->predictors.n_cols);
// Train the model.
Timer::Start("ffn_optimization");
const double out = optimizer.Optimize(*this, parameter, callbacks...);
Timer::Stop("ffn_optimization");
Log::Info << "FFN::FFN(): final objective of trained model is " << out
<< "." << std::endl;
return out;
}
After constructing the model, training runs on the given dataset and labels. From the implementation, this is not hard to follow:
It hands itself to an optimizer from ensmallen as the objective function to be optimized, along with the parameter matrix parameter.
Recalling the Adam optimization algorithm introduced earlier, we can guess that this model must implement Evaluate and Gradient functions.
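Before looking at those, here is a minimal end-to-end training sketch (layer sizes, data, and optimizer settings are made-up illustration values; assumes mlpack 3.x, which bundles ensmallen):
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <ensmallen.hpp>
#include <iostream>

using namespace mlpack::ann;

int main()
{
  // 100 samples with 10 features each; one column per sample.
  arma::mat X(10, 100, arma::fill::randu);

  // 1-based class labels in [1, 3], as NegativeLogLikelihood expects.
  arma::mat y(1, 100);
  for (size_t i = 0; i < y.n_elem; ++i)
    y(i) = 1 + (i % 3);

  FFN<NegativeLogLikelihood<>, RandomInitialization> model;
  model.Add<Linear<>>(10, 3);
  model.Add<LogSoftMax<>>();  // NLL consumes log-probabilities.

  ens::Adam opt(0.01, 32);    // Step size 0.01, batch size 32.
  const double objective = model.Train(X, y, opt);
  std::cout << "final objective: " << objective << std::endl;
}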
Sure enough:
Evaluate
Evaluate header:
/**
* Evaluate the feedforward network with the given parameters. This function
* is usually called by the optimizer to train the model.
*
* @param parameters Matrix model parameters.
*/
double Evaluate(const arma::mat& parameters);
/**
* Evaluate the feedforward network with the given parameters, but using only
* a number of data points. This is useful for optimizers such as SGD, which
* require a separable objective function.
*
* @param parameters Matrix model parameters.
* @param begin Index of the starting point to use for objective function
* evaluation.
* @param batchSize Number of points to be passed at a time to use for
* objective function evaluation.
* @param deterministic Whether or not to train or test the model. Note some
* layers act differently in training or testing mode.
*/
double Evaluate(const arma::mat& parameters,
const size_t begin,
const size_t batchSize,
const bool deterministic);
Implementation:
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Evaluate(
const arma::mat& parameters)
{
double res = 0;
for (size_t i = 0; i < predictors.n_cols; ++i)
res += Evaluate(parameters, i, 1, true);
return res;
}
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Evaluate(
const arma::mat& /* parameters */,
const size_t begin,
const size_t batchSize,
const bool deterministic)
{
if (parameter.is_empty())
ResetParameters();
if (deterministic != this->deterministic)
{
this->deterministic = deterministic;
ResetDeterministic();
}
Forward(predictors.cols(begin, begin + batchSize - 1));
double res = outputLayer.Forward(
boost::apply_visitor(outputParameterVisitor, network.back()),
responses.cols(begin, begin + batchSize - 1));
for (size_t i = 0; i < network.size(); ++i)
{
res += boost::apply_visitor(lossVisitor, network[i]);
}
return res;
}
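Schematically, a batch optimizer consumes the separable overload in strides of batchSize, much like this hypothetical helper (a sketch of the calling pattern only; real ensmallen optimizers add shuffling, gradient steps, and callbacks):
#include <algorithm>
#include <cstddef>
#include <armadillo>

// FunctionType is any type exposing the separable Evaluate() shown above,
// e.g. FFN; numFunctions is the number of training columns.
template<typename FunctionType>
double FullObjective(FunctionType& f,
                     const arma::mat& parameters,
                     const size_t numFunctions,
                     const size_t batchSize)
{
  double total = 0.0;
  for (size_t begin = 0; begin < numFunctions; begin += batchSize)
  {
    // Clamp the last batch so we never read past the final column.
    const size_t effectiveBatch = std::min(batchSize, numFunctions - begin);
    total += f.Evaluate(parameters, begin, effectiveBatch, false);
  }
  return total;
}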
First, let's look at the two Reset methods:
Reset
ResetDeterministic
template<typename