【贝叶斯神经网络训练】（torch实现）

改名字了我
已于 2022-10-29 09:48:01 修改
阅读量2.7k
点赞数 9
文章标签：深度学习神经网络 python
于 2022-05-26 16:09:19 首次发布
本文链接：https://blog.csdn.net/qq_41085087/article/details/124986637
版权
从这里https://blog.csdn.net/dhaiuda/article/details/106383465学习到的，只是在其中加了批注而已，便于自己理解，有些地方理解可能不对，一起学习！
# -*- coding: utf-8 -*-
# @Time : 2022/5/10 9:54
# @Author : panY
# @File : learning.py
# @Software: PyCharm
# 本节实现的BNN为一个单隐藏层神经网络，其输入大小为1，输出大小为1。用于拟合函数-x^4 + 3x^2 + 1−x+1进行回归预测
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
import numpy as np
import matplotlib.pyplot as plt


#  Tensorflow  Keras

# BNN层，类似于BP网络的Linear层，与BP网络类似，一层BNN层由weight和bias组成，weight和bias都具有均值和方差
class Linear_BBB(nn.Module):
    """
        Layer of our BNN.
    """
    """
        作为隐藏层的时候，input_features是输入特征个数，output_features是输出的神经元个数
        作为输出层的时候，input_features是神经元个数，output_features是输出的最后结果个数
    """

    def __init__(self, input_features, output_features, prior_var=1.):
        """
            Initialization of our layer : our prior is a normal distribution
            centered in 0 and of variance 20.
        """
        # initialize layers
        super().__init__()
        # set input and output dimensions
        self.input_features = input_features
        self.output_features = output_features

        """
            nn.Parameter是继承自torch.Tensor的子类，其主要作用是作为nn.Module中的可训练参数使用。
            它与torch.Tensor的区别就是nn.Parameter会自动被认为是module的可训练参数，即加入到parameter()这个迭代器中去；
            而module中非nn.Parameter()的普通tensor是不在parameter中的。
            nn.Parameter()添加的参数会被添加到Parameters列表中，会被送入优化器中随训练一起学习更新
            这些值会随着训练一起更新，得到最终的值
        """
        """
            torch.zeros 返回一个形状为为size,类型为torch.dtype，里面的每一个值都是0的tensor
            torch.zeros(3,4) 生成一个3行4列的tensor [[0., 0., 0., 0.],[0., 0., 0., 0.],[0., 0., 0., 0.]]
        """

        # initialize mu(μ) and rho（ρ） parameters for the weights of the layer
        self.w_mu = nn.Parameter(torch.zeros(output_features, input_features))
        self.w_rho = nn.Parameter(torch.zeros(output_features, input_features))

        """
            bias 偏差（偏置）的主要功能是为每个节点提供一个可训练的常量值(除了节点接收的正常输入之外)
        """
        # initialize mu and rho parameters for the layer's bias
        self.b_mu = nn.Parameter(torch.zeros(output_features))
        self.b_rho = nn.Parameter(torch.zeros(output_features))

        # initialize weight samples (these will be calculated whenever the layer makes a prediction)
        self.w = None
        self.b = None

        """
            贝叶斯的先验分布，这里是一个正态分布
        """
        # initialize prior distribution for all of the weights and biases
        self.prior = torch.distributions.Normal(0, prior_var)

    def forward(self, input):
        """
            Optimization process
        """
        """
            self.w_mu.shape shape返回的是torch.Size(row,col)
        """
        # sample weights
        w_epsilon = Normal(0, 1).sample(self.w_mu.shape)
        self.w = self.w_mu + torch.log(1 + torch.exp(self.w_rho)) * w_epsilon

        # sample bias
        b_epsilon = Normal(0, 1).sample(self.b_mu.shape)
        self.b = self.b_mu + torch.log(1 + torch.exp(self.b_rho)) * b_epsilon

        """
            log_prob(value)是计算value在定义的正态分布（mean,1）中对应的概率的对数
            这里就是在先验分布中将样本权重self.w对应的概率的对数计算出来
        """
        # record log prior by evaluating log pdf of prior at sampled weight and bias
        # 计算log p(w)，用于后续计算loss
        w_log_prior = self.prior.log_prob(self.w)
        b_log_prior = self.prior.log_prob(self.b)
        """
            这里才是最终的先验分布？将样本的概率分布的对数加上偏差的概率分布的对数 
        """
        self.log_prior = torch.sum(w_log_prior) + torch.sum(b_log_prior)

        # record log variational posterior by evaluating log pdf of normal distribution defined by parameters
        # with respect at the sampled values
        # 计算 log p(w|\theta)，用于后续计算loss
        self.w_post = Normal(self.w_mu.data, torch.log(1 + torch.exp(self.w_rho)))
        self.b_post = Normal(self.b_mu.data, torch.log(1 + torch.exp(self.b_rho)))
        self.log_post = self.w_post.log_prob(self.w).sum() + self.b_post.log_prob(self.b).sum()
        # 对传入数据应用线性变换 返回:math:`y = xA^T + b`.
        return F.linear(input, self.w, self.b)


class MLP_BBB(nn.Module):
    """
    这是一个多层网络
    """
    """
    hidden_units 隐藏单元，可以输入不同的值来得到较优的结果 32、34、36等
    """

    def __init__(self, hidden_units, noise_tol=.1, prior_var=1.):
        # initialize the network like you would with a standard multilayer perceptron, but using the BBB layer
        super().__init__()
        # Linear_BBB构造参数 input_features, output_features, prior_var=1.
        self.hidden = Linear_BBB(1, hidden_units, prior_var=prior_var)  # 隐藏层？贝叶斯
        self.out = Linear_BBB(hidden_units, 1, prior_var=prior_var)  # 输出层？也是贝叶斯
        self.noise_tol = noise_tol  # we will use the noise tolerance to calculate our likelihood

    def forward(self, x):
        """
        Sigmoid函数常被用作神经网络的阈值函数(激活函数)，将变量映射到0,1之间，该函数单调递增且以（0，0.5）对称，在两端变化速度较慢。作用：引入非线性
        引入非线性相当于将输入的值在结合权重和偏差之后经过这个函数后得到另一个值（要不然输入的值和输出的值就一样，不会变，那一直迭代都没什么区别？）
        用这个得到的值来进行拟合，这个值是最终得到的值
        所以贝叶斯在这里面就是改变了节点权重而已？
        """
        # again, this is equivalent to a standard multilayer perceptron
        """ self.hidden(x)和self.out(x) 的时候都会执行Linear_BBB类里面的forward函数，可以加断点看到"""
        x = torch.sigmoid(self.hidden(x))
        x = self.out(x)
        return x

    def log_prior(self):
        #
        # calculate the log prior over all the layers
        return self.hidden.log_prior + self.out.log_prior

    def log_post(self):
        # calculate the log posterior over all the layers
        return self.hidden.log_post + self.out.log_post

    # samples 表示的是样本的数量

    def sample_elbo(self, input, target, samples):
        """
        这里的损失函数就是散度KL变换得到的那三个
            @:param input 表示的是输入，所有输入的特征？？
            @:param target 对应x输出的真实结果
            @:param samples
        """
        # we calculate the negative elbo, which will be our loss function
        # initialize tensors
        outputs = torch.zeros(samples, target.shape[0])
        log_priors = torch.zeros(samples)
        log_posts = torch.zeros(samples)
        log_likes = torch.zeros(samples)
        # make predictions and calculate prior, posterior, and likelihood for a given number of samples

        # 蒙特卡罗近似
        for i in range(samples):
            outputs[i] = self(input).reshape(-1)  # make predictions
            log_priors[i] = self.log_prior()  # get log prior
            log_posts[i] = self.log_post()  # get log variational posterior
            log_likes[i] = Normal(outputs[i], self.noise_tol).log_prob(
                target.reshape(-1)).sum()  # calculate the log likelihood
        # calculate monte carlo estimate of prior posterior and likelihood
        log_prior = log_priors.mean()
        log_post = log_posts.mean()
        log_like = log_likes.mean()
        # calculate the negative elbo (which is our loss function)
        loss = log_post - log_prior - log_like
        return loss


def toy_function(x):
    """
        这里只是用这个函数来提供了一个x,y的对应关系 提供训练的样本{x|y}
    """
    return -x ** 4 + 3 * x ** 2 + 1


# toy dataset we can start with
"""
    reshape(-1, 1) 把数组变成1列，行数自动计算 计算公式 数组长度/1
    reshape(1, -1)把数组变成1行，列数自动计算
    reshape(4, -1)把数组变成4行，列数自动计算 计算公式 数组长度/4
"""
x = torch.tensor([-2, -1.8, -1, 1, 1.8, 2]).reshape(-1, 1)
y = toy_function(x)

net = MLP_BBB(32, prior_var=10)
"""
    Adam优化函数
"""
optimizer = optim.Adam(net.parameters(), lr=.1)
epochs = 2000
for epoch in range(epochs):  # loop over the dataset multiple times
    """
    进行下一次batch梯度计算的时候，前一个batch的梯度计算结果，没有保留的必要了。
    所以在下一次梯度更新的时候，先使用optimizer.zero_grad把梯度信息设置为0。
    """
    optimizer.zero_grad()
    # forward + backward + optimize
    loss = net.sample_elbo(x, y, 1)
    """
    调用loss.backward()时 Pytorch的autograd就会自动沿着计算图反向传播，计算每一个叶子节点的梯度（如果某一个变量是由用户创建的，则它为叶子节点）。
    """
    # 所以这里是在反向传播嘛？
    """
    "这是大多数optimizer所支持的简化版本。一旦梯度被如backward()之类的函数计算好后，我们就可以调用这个函数。"
    https://blog.csdn.net/qq_40178291/article/details/99963586
    根据这句话的意思是，backward是在计算梯度吗？对啊！上面不是备注了嘛。。。调用loss.backward时...计算每一个叶子节点的梯度...
    """
    loss.backward()
    """
    （优化器.step()？优化器的step()方法会更新所有的参数）
    """
    # 更新参数
    optimizer.step()
    if epoch % 10 == 0:
        print('epoch: {}/{}'.format(epoch + 1, epochs))
        print('Loss:', loss.item())
print('Finished Training')

# samples is the number of "predictions" we make for 1 x-value.
samples = 100
x_tmp = torch.linspace(-5, 5, 100).reshape(-1, 1)
y_samp = np.zeros((samples, 100))
for s in range(samples):
    y_tmp = net(x_tmp).detach().numpy()
    y_samp[s] = y_tmp.reshape(-1)
plt.plot(x_tmp.numpy(), np.mean(y_samp, axis=0), label='Mean Posterior Predictive')
plt.fill_between(x_tmp.numpy().reshape(-1), np.percentile(y_samp, 2.5, axis=0), np.percentile(y_samp, 97.5, axis=0),
                 alpha=0.25, label='95% Confidence')
plt.legend()
plt.scatter(x, toy_function(x))
plt.title('Posterior Predictive')
plt.show()
if __name__ == '__main__':
    a = nn.Parameter(torch.zeros(3, 4))