Feed-Forward Neural Network
FFN
Constructor
The main constructor, as declared in the header:
/**
* Implementation of a standard feed forward network.
*
* @tparam OutputLayerType The output layer type used to evaluate the network.
* @tparam InitializationRuleType Rule used to initialize the weight matrix.
* @tparam CustomLayers Any set of custom layers that could be a part of the
* feed forward network.
*/
template<
typename OutputLayerType = NegativeLogLikelihood<>,
typename InitializationRuleType = RandomInitialization,
typename... CustomLayers
>
class FFN
{
public:
//! Convenience typedef for the internal model construction.
using NetworkType = FFN<OutputLayerType, InitializationRuleType>;
/**
* Create the FFN object.
*
* Optionally, specify which initialize rule and performance function should
* be used.
*
* If you want to pass in a parameter and discard the original parameter
* object, be sure to use std::move to avoid unnecessary copy.
*
* @param outputLayer Output layer used to evaluate the network.
* @param initializeRule Optional instantiated InitializationRule object
* for initializing the network parameter.
*/
FFN(OutputLayerType outputLayer = OutputLayerType(),
InitializationRuleType initializeRule = InitializationRuleType());
Implementation:
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::FFN(
OutputLayerType outputLayer, InitializationRuleType initializeRule) :
outputLayer(std::move(outputLayer)),
initializeRule(std::move(initializeRule)),
width(0),
height(0),
reset(false),
numFunctions(0),
deterministic(false)
{
/* Nothing to do here. */
}
The constructor has two main template parameters, OutputLayerType and InitializationRuleType; let's take a look at their default implementations.
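As a quick illustration, both parameters can be left at their defaults or chosen explicitly. A minimal sketch, assuming mlpack 3.x headers (ConstInitialization is just one alternative rule, picked for illustration):
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/init_rules/const_init.hpp>

using namespace mlpack::ann;

// Defaults: NegativeLogLikelihood<> output layer, RandomInitialization rule.
FFN<> net1;

// Explicit choices; the temporaries bind as rvalues, so no std::move is
// needed here (it matters when passing named objects you want to discard).
FFN<NegativeLogLikelihood<>, ConstInitialization> net2(
    NegativeLogLikelihood<>(), ConstInitialization(0.0));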
NegativeLogLikelihood header:
/**
* Implementation of the negative log likelihood layer. The negative log
* likelihood layer expects that the input contains log-probabilities for each
* class. The layer also expects a class index, in the range between 1 and the
* number of classes, as target when calling the Forward function.
*
* @tparam InputDataType Type of the input data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
* @tparam OutputDataType Type of the output data (arma::colvec, arma::mat,
* arma::sp_mat or arma::cube).
*/
template <
typename InputDataType = arma::mat,
typename OutputDataType = arma::mat
>
class NegativeLogLikelihood
{
public:
/**
* Create the NegativeLogLikelihoodLayer object.
*/
NegativeLogLikelihood();
/**
* Computes the Negative log likelihood.
*
* @param input Input data used for evaluating the specified function.
* @param target The target vector, that contains the class index in the range
* between 1 and the number of classes.
*/
template<typename InputType, typename TargetType>
typename InputType::elem_type Forward(const InputType& input,
const TargetType& target);
/**
* Ordinary feed backward pass of a neural network. The negative log
* likelihood layer expects that the input contains log-probabilities for
* each class. The layer also expects a class index, in the range between 1
* and the number of classes, as target when calling the Forward function.
*
* @param input The propagated input activation.
* @param target The target vector, that contains the class index in the range
* between 1 and the number of classes.
* @param output The calculated error.
*/
template<typename InputType, typename TargetType, typename OutputType>
void Backward(const InputType& input,
const TargetType& target,
OutputType& output);
//! Get the input parameter.
InputDataType& InputParameter() const { return inputParameter; }
//! Modify the input parameter.
InputDataType& InputParameter() { return inputParameter; }
//! Get the output parameter.
OutputDataType& OutputParameter() const { return outputParameter; }
//! Modify the output parameter.
OutputDataType& OutputParameter() { return outputParameter; }
//! Get the delta.
OutputDataType& Delta() const { return delta; }
//! Modify the delta.
OutputDataType& Delta() { return delta; }
/**
* Serialize the layer
*/
template<typename Archive>
void serialize(Archive& /* ar */, const unsigned int /* version */);
private:
//! Locally-stored delta object.
OutputDataType delta;
//! Locally-stored input parameter object.
InputDataType inputParameter;
//! Locally-stored output parameter object.
OutputDataType outputParameter;
}; // class NegativeLogLikelihood
Implementation:
template<typename InputDataType, typename OutputDataType>
NegativeLogLikelihood<InputDataType, OutputDataType>::NegativeLogLikelihood()
{
// Nothing to do here.
}
template<typename InputDataType, typename OutputDataType>
template<typename InputType, typename TargetType>
typename InputType::elem_type
NegativeLogLikelihood<InputDataType, OutputDataType>::Forward(
const InputType& input,
const TargetType& target)
{
typedef typename InputType::elem_type ElemType;
ElemType output = 0;
for (size_t i = 0; i < input.n_cols; ++i)
{
size_t currentTarget = target(i) - 1;
Log::Assert(currentTarget < input.n_rows,
"Target class out of range.");
output -= input(currentTarget, i);
}
return output;
}
template<typename InputDataType, typename OutputDataType>
template<typename InputType, typename TargetType, typename OutputType>
void NegativeLogLikelihood<InputDataType, OutputDataType>::Backward(
const InputType& input,
const TargetType& target,
OutputType& output)
{
output = arma::zeros<OutputType>(input.n_rows, input.n_cols);
for (size_t i = 0; i < input.n_cols; ++i)
{
size_t currentTarget = target(i) - 1;
Log::Assert(currentTarget < input.n_rows,
"Target class out of range.");
output(currentTarget, i) = -1;
}
}
template<typename InputDataType, typename OutputDataType>
template<typename Archive>
void NegativeLogLikelihood<InputDataType, OutputDataType>::serialize(
Archive& /* ar */,
const unsigned int /* version */)
{
// Nothing to do here.
}
The key parts of the negative log likelihood loss are the two methods, Forward and Backward. Let us introduce some notation:
$$
\mathrm{input}: (X_1, \cdots, X_N), \quad X_i \in \mathbb{R}^n \ \ \forall\ i \in [1, N]
\;\Rightarrow\;
\begin{bmatrix}
x_{11} & x_{12} & \cdots & x_{1N} \\
\vdots & & & \vdots \\
x_{n1} & x_{n2} & \cdots & x_{nN}
\end{bmatrix}
$$

$$
\mathrm{target}: (y_1, \cdots, y_N), \quad y_i \in [1, m]
$$
Therefore:
Forward:
$$
output = -\sum_{i=1}^{N} x_{(y_i,\, i)}, \quad y_i \leqslant n
$$
Backward:
$$
(n \times N): \quad output_{(j,\, i)} =
\begin{cases}
-1, & j = y_i \ \ (y_i \leqslant n) \\
0, & \text{otherwise}
\end{cases}
$$
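To make the indexing concrete, here is a small standalone Armadillo sketch that reproduces both formulas by hand (illustrative code with made-up numbers, not the mlpack class itself):
#include <armadillo>

int main()
{
  // Three classes (rows), two samples (columns) of log-probabilities.
  arma::mat input = { { -0.2, -1.6 },
                      { -2.1, -0.3 },
                      { -1.9, -2.2 } };
  arma::rowvec target = { 1, 2 };  // 1-based class indices, as NLL expects.

  // Forward: output = -sum_i x_(y_i, i).
  double output = 0;
  for (size_t i = 0; i < input.n_cols; ++i)
  {
    const arma::uword c = static_cast<arma::uword>(target(i)) - 1;
    output -= input(c, i);  // Picks -0.2 and -0.3, so output == 0.5.
  }

  // Backward: -1 at each sample's target row, 0 elsewhere.
  arma::mat grad = arma::zeros(input.n_rows, input.n_cols);
  for (size_t i = 0; i < input.n_cols; ++i)
    grad(static_cast<arma::uword>(target(i)) - 1, i) = -1;

  grad.print("grad:");
  return 0;
}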
RandomInitialization header:
/**
* This class is used to initialize randomly the weight matrix.
*/
class RandomInitialization
{
public:
/**
* Initialize the random initialization rule with the given lower bound and
* upper bound.
*
* @param lowerBound The number used as lower bound.
* @param upperBound The number used as upper bound.
*/
RandomInitialization(const double lowerBound = -1,
const double upperBound = 1) :
lowerBound(lowerBound), upperBound(upperBound) {
}
/**
* Initialize the random initialization rule with the given bound.
* Using the negative of the bound as lower bound and the positive bound as
* upper bound.
*
* @param bound The number used as lower bound
*/
RandomInitialization(const double bound) :
lowerBound(-std::abs(bound)), upperBound(std::abs(bound)) {
}
/**
* Initialize randomly the elements of the specified weight matrix.
*
* @param W Weight matrix to initialize.
* @param rows Number of rows.
* @param cols Number of columns.
*/
template<typename eT>
void Initialize(arma::Mat<eT>& W, const size_t rows, const size_t cols)
{
if (W.is_empty())
W.set_size(rows, cols);
W.randu();
W *= (upperBound - lowerBound);
W += lowerBound;
}
/**
* Initialize randomly the elements of the specified weight matrix.
*
* @param W Weight matrix to initialize.
*/
template<typename eT>
void Initialize(arma::Mat<eT>& W)
{
if (W.is_empty())
Log::Fatal << "Cannot initialize an empty matrix." << std::endl;
W.randu();
W *= (upperBound - lowerBound);
W += lowerBound;
}
/**
* Initialize randomly the elements of the specified weight 3rd order tensor.
*
* @param W Weight matrix to initialize.
* @param rows Number of rows.
* @param cols Number of columns.
* @param slices Number of slices.
*/
template<typename eT>
void Initialize(arma::Cube<eT>& W,
const size_t rows,
const size_t cols,
const size_t slices)
{
if (W.is_empty())
W.set_size(rows, cols, slices);
for (size_t i = 0; i < slices; ++i)
Initialize(W.slice(i), rows, cols);
}
/**
* Initialize randomly the elements of the specified weight 3rd order tensor.
*
* @param W Weight matrix to initialize.
*/
template<typename eT>
void Initialize(arma::Cube<eT>& W)
{
if (W.is_empty())
Log::Fatal << "Cannot initialize an empty cube." << std::endl;
for (size_t i = 0; i < W.n_slices; ++i)
Initialize(W.slice(i));
}
private:
//! The number used as lower bound.
double lowerBound;
//! The number used as upper bound.
double upperBound;
}; // class RandomInitialization
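A short usage sketch (the bounds are arbitrary illustration values):
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/init_rules/random_init.hpp>

int main()
{
  arma::mat W;
  mlpack::ann::RandomInitialization init(-0.5, 0.5);
  init.Initialize(W, 4, 3);  // W is sized 4x3, entries i.i.d. uniform on [-0.5, 0.5].
  W.print("W:");
}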
.randu(), as described in the official Armadillo documentation:
.randu() uses a uniform distribution in the [0,1] interval
Therefore, this initialization first draws entries from $U(0, 1)$, multiplies them by $(upperBound - lowerBound)$, and adds $lowerBound$, so each weight is uniform on $[lowerBound, upperBound]$.
Hence:
$$
E(W) = \dfrac{upperBound + lowerBound}{2}, \qquad
D(W) = \dfrac{(upperBound - lowerBound)^2}{12}
$$
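These follow from the moments of the standard uniform distribution: for $U \sim U(0, 1)$, $E(U) = \frac{1}{2}$ and $D(U) = \frac{1}{12}$. Writing $a = lowerBound$, $b = upperBound$, the initializer computes $W = (b - a)U + a$, so:

$$
E(W) = (b - a)E(U) + a = \frac{a + b}{2}, \qquad
D(W) = (b - a)^2 D(U) = \frac{(b - a)^2}{12}
$$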
Train
Train header:
/**
* Train the feedforward network on the given input data using the given
* optimizer.
*
* This will use the existing model parameters as a starting point for the
* optimization. If this is not what you want, then you should access the
* parameters vector directly with Parameters() and modify it as desired.
*
* If you want to pass in a parameter and discard the original parameter
* object, be sure to use std::move to avoid unnecessary copy.
*
* @tparam OptimizerType Type of optimizer to use to train the model.
* @tparam CallbackTypes Types of Callback Functions.
* @param predictors Input training variables.
* @param responses Outputs results from input training variables.
* @param optimizer Instantiated optimizer used to train the model.
* @param callbacks Callback function for ensmallen optimizer `OptimizerType`.
* See https://www.ensmallen.org/docs.html#callback-documentation.
* @return The final objective of the trained model (NaN or Inf on error).
*/
template<typename OptimizerType, typename... CallbackTypes>
double Train(arma::mat predictors,
arma::mat responses,
OptimizerType& optimizer,
CallbackTypes&&... callbacks);
Implementation:
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
template<typename OptimizerType, typename... CallbackTypes>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Train(
arma::mat predictors,
arma::mat responses,
OptimizerType& optimizer,
CallbackTypes&&... callbacks)
{
ResetData(std::move(predictors), std::move(responses));
WarnMessageMaxIterations<OptimizerType>(optimizer, this->predictors.n_cols);
// Train the model.
Timer::Start("ffn_optimization");
const double out = optimizer.Optimize(*this, parameter, callbacks...);
Timer::Stop("ffn_optimization");
Log::Info << "FFN::FFN(): final objective of trained model is " << out
<< "." << std::endl;
return out;
}
After constructing the model, training runs on the given dataset and labels. From the implementation, this is not hard to follow:
It hands itself to an optimizer from ensmallen as the objective function to be optimized, along with the parameter matrix parameter.
Recalling the Adam optimization algorithm introduced earlier, we can guess that this model must implement Evaluate and Gradient functions.
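Before looking at those, here is a minimal end-to-end training sketch (layer sizes, data, and optimizer settings are made-up illustration values; assumes mlpack 3.x, which bundles ensmallen):
#include <mlpack/core.hpp>
#include <mlpack/methods/ann/ffn.hpp>
#include <mlpack/methods/ann/layer/layer.hpp>
#include <ensmallen.hpp>
#include <iostream>

using namespace mlpack::ann;

int main()
{
  // 100 samples with 10 features each; one column per sample.
  arma::mat X(10, 100, arma::fill::randu);

  // 1-based class labels in [1, 3], as NegativeLogLikelihood expects.
  arma::mat y(1, 100);
  for (size_t i = 0; i < y.n_elem; ++i)
    y(i) = 1 + (i % 3);

  FFN<NegativeLogLikelihood<>, RandomInitialization> model;
  model.Add<Linear<>>(10, 3);
  model.Add<LogSoftMax<>>();  // NLL consumes log-probabilities.

  ens::Adam opt(0.01, 32);    // Step size 0.01, batch size 32.
  const double objective = model.Train(X, y, opt);
  std::cout << "final objective: " << objective << std::endl;
}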
Sure enough:
Evaluate
Evaluate header:
/**
* Evaluate the feedforward network with the given parameters. This function
* is usually called by the optimizer to train the model.
*
* @param parameters Matrix model parameters.
*/
double Evaluate(const arma::mat& parameters);
/**
* Evaluate the feedforward network with the given parameters, but using only
* a number of data points. This is useful for optimizers such as SGD, which
* require a separable objective function.
*
* @param parameters Matrix model parameters.
* @param begin Index of the starting point to use for objective function
* evaluation.
* @param batchSize Number of points to be passed at a time to use for
* objective function evaluation.
* @param deterministic Whether or not to train or test the model. Note some
* layers act differently in training or testing mode.
*/
double Evaluate(const arma::mat& parameters,
const size_t begin,
const size_t batchSize,
const bool deterministic);
Implementation:
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Evaluate(
const arma::mat& parameters)
{
double res = 0;
for (size_t i = 0; i < predictors.n_cols; ++i)
res += Evaluate(parameters, i, 1, true);
return res;
}
template<typename OutputLayerType, typename InitializationRuleType,
typename... CustomLayers>
double FFN<OutputLayerType, InitializationRuleType, CustomLayers...>::Evaluate(
const arma::mat& /* parameters */,
const size_t begin,
const size_t batchSize,
const bool deterministic)
{
if (parameter.is_empty())
ResetParameters();
if (deterministic != this->deterministic)
{
this->deterministic = deterministic;
ResetDeterministic();
}
Forward(predictors.cols(begin, begin + batchSize - 1));
double res = outputLayer.Forward(
boost::apply_visitor(outputParameterVisitor, network.back()),
responses.cols(begin, begin + batchSize - 1));
for (size_t i = 0; i < network.size(); ++i)
{
res += boost::apply_visitor(lossVisitor, network[i]);
}
return res;
}
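Schematically, a batch optimizer consumes the separable overload in strides of batchSize, much like this hypothetical helper (a sketch of the calling pattern only; real ensmallen optimizers add shuffling, gradient steps, and callbacks):
#include <algorithm>
#include <cstddef>
#include <armadillo>

// FunctionType is any type exposing the separable Evaluate() shown above,
// e.g. FFN; numFunctions is the number of training columns.
template<typename FunctionType>
double FullObjective(FunctionType& f,
                     const arma::mat& parameters,
                     const size_t numFunctions,
                     const size_t batchSize)
{
  double total = 0.0;
  for (size_t begin = 0; begin < numFunctions; begin += batchSize)
  {
    // Clamp the last batch so we never read past the final column.
    const size_t effectiveBatch = std::min(batchSize, numFunctions - begin);
    total += f.Evaluate(parameters, begin, effectiveBatch, false);
  }
  return total;
}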
First, let's look at the two Reset methods:
Reset
ResetDeterministic
template<typename