import torch
from torch.nn import functional
x = torch.ones(1, requires_grad=True)
print(x)
w = torch.full([1], 6.0, requires_grad=True)  # fill value must be a float: only floating-point tensors can require grad
mse = functional.mse_loss(x, x * w)
print(mse)  # (1 - 6) ** 2 = 25
print(torch.autograd.grad(mse, [w]))  # 2 * (1 - 6) * (-1) = 10
mse = functional.mse_loss(x, x * w)  # rebuild the graph: autograd.grad freed it, so backward() needs a fresh one
mse.backward()
print(w.grad)  # backward() writes the same gradient into w.grad
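
# Hand check: mse(w) = (x - w*x)^2 = (1 - 6)^2 = 25, so
# d(mse)/dw = 2 * (x - w*x) * (-x) = 2 * (1 - 6) * (-1) = 10,
# matching both the autograd.grad result and w.grad above.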
a = torch.rand(3, requires_grad=True)
print(a)
p = functional.softmax(a, dim=0)
print(p)
print(torch.autograd.grad(p[1], [a], retain_graph=True))  # -0.2751*0.4222, 0.2751*(1-0.2751), -0.2751*0.3027
Output:
tensor([1.], requires_grad=True)
tensor(25., grad_fn=<MeanBackward0>)
(tensor([10.]),)
tensor([10.])
tensor([0.9570, 0.5285, 0.6240], requires_grad=True)
tensor([0.4222, 0.2751, 0.3027], grad_fn=<SoftmaxBackward>)
(tensor([-0.1162, 0.1994, -0.0833]),)
Attached figure: derivative of softmax (the formula it showed is reproduced below).
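The figure itself is not reproduced here; what it illustrated is the standard softmax Jacobian, which the hand-computed values in the comment above follow:

$$
\frac{\partial p_i}{\partial a_j} = p_i\,(\delta_{ij} - p_j)
= \begin{cases}
p_i\,(1 - p_i), & i = j,\\
-\,p_i\,p_j, & i \neq j.
\end{cases}
$$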
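As a quick check of that formula, here is a minimal sketch that builds the full Jacobian row by row the same way as above, then compares it to the closed form diag(p) - p p^T. It assumes torch.outer, available in PyTorch 1.8+ (use torch.ger on older versions):

# Build the softmax Jacobian one output at a time; retain_graph=True
# keeps the graph alive so each row can reuse it.
import torch
from torch.nn import functional

a = torch.rand(3, requires_grad=True)
p = functional.softmax(a, dim=0)

rows = [torch.autograd.grad(p[i], [a], retain_graph=True)[0] for i in range(3)]
jacobian = torch.stack(rows)

# Closed form: dp_i/da_j = p_i * (delta_ij - p_j), i.e. diag(p) - p p^T
closed_form = torch.diag(p) - torch.outer(p, p)
print(torch.allclose(jacobian, closed_form))  # True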