LayerNorm深入分析
均值和方差是每个样本各自计算的, 但仿射映射 (weight/bias) 是所有样本共享的
# Demo: reproduce nn.LayerNorm by hand to show the statistics are computed
# per-sample over the normalized (last) dimension, while weight/bias are shared.
ln = nn.LayerNorm(6)
x = torch.randn(1, 1, 6)
print(ln.state_dict())  # default affine params: weight = ones, bias = zeros
print(x)
print(ln(x))
# NOTE: LayerNorm adds eps to the *variance inside the sqrt*:
#   (x - mean) / sqrt(var + eps)
# not (x - mean) / (std + eps) — the latter differs in the last digits.
var, mean = torch.var_mean(x, dim=2, keepdim=True, unbiased=False)
x = (x - mean) / torch.sqrt(var + 1e-5)
print(x)
输出 (注意最后一行与 ln(x) 的结果在末位上略有差异: LayerNorm 实际是 (x-mean)/sqrt(var+eps), 而不是 (x-mean)/(std+eps))
OrderedDict([('weight', tensor([1., 1., 1., 1., 1., 1.])), ('bias', tensor([0., 0., 0., 0., 0., 0.]))])
tensor([[[ 6.0541e-01, 5.5038e-04, -3.6288e-01, -7.0546e-01, 2.9306e-01,
-6.5057e-01]]])
tensor([[[ 1.5400, 0.2847, -0.4695, -1.1805, 0.8918, -1.0666]]],
grad_fn=<NativeLayerNormBackward0>)
tensor([[[ 1.5401, 0.2847, -0.4695, -1.1805, 0.8918, -1.0666]]])