本例设置了两层y=ax+b,前置条件包括x和目标y的值,以及初始的两组a和b的值,损失函数为mse loss。可以只跑一轮,看看反向传播时各层参数是如何更新的。建议在backward函数里打断点,然后将程序与图对照着看。
# -*- coding:utf-8 -*-
# reference: https://pytorch.org/docs/stable/notes/extending.html
import torch
from torch import nn
from torch.autograd import Function, Variable
import numpy as np
from collections import OrderedDict
class LinearFunction2(Function):
# Note that both forward and backward are @staticmethods
@staticmethod
# bias is an optional argument
def forward(ctx, input, weight, bias=None):
    """Compute the linear map ``input @ weight.T`` with an optional bias.

    ``input``, ``weight`` and ``bias`` are stored on ``ctx`` via
    ``save_for_backward`` before the product is computed.

    Args:
        ctx: Autograd context object; only ``save_for_backward`` is used.
        input: Matrix multiplied on the left (``mm`` needs 2-D operands).
        weight: Matrix whose transpose is multiplied on the right.
        bias: Optional tensor; when given it gets a leading dimension and
            is expanded to the shape of the product before being added.

    Returns:
        The matrix product, with the bias added in place when provided.
    """
    ctx.save_for_backward(input, weight, bias)
    product = input.mm(weight.t())
    # No bias supplied: the plain matrix product is the result.
    if bias is None:
        return product
    # Replicate the bias across every row of the product, then add in place.
    product += bias.unsqueeze(0).expand_as(product)
    return product
# This function has only a single output, so it gets only one gradient
@staticmethod
def backward(ctx, grad_output):
# This is a pattern that is very convenient - at the top of backward
# unpack saved_tensors and initialize all gradients w.r.t. inputs to
# None. Thanks to the fact that additional trailing Nones are
# ignored, the return statement is simple even when the function has
# optional inputs.
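# The first grad_output is the gradient of the loss with respect to y - y'.
# For example, the gradient of an MSE loss is 2/n * (y - y'), where n is the
# number of samples in a batch; this gradient (not the loss) is an n x 1 matrix.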