Introduction to the Transformer
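Layer normalization rescales each position's feature vector to zero mean and unit standard deviation, then applies a learned per-feature scale a and shift b:

    y = a * (x - mean) / (std + eps) + b

where mean and std are taken over the last (feature) dimension and eps guards against division by zero. The implementation below follows this formula directly.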
import torch
import torch.nn as nn

class LayerNorm(nn.Module):
    "Layer normalization with a learned scale a and shift b."
    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a = nn.Parameter(torch.ones(features))   # learned scale, one per feature
        self.b = nn.Parameter(torch.zeros(features))  # learned shift, one per feature
        self.eps = eps

    def forward(self, x):
        # Normalize over the last (feature) dimension.
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a * (x - mean) / (std + self.eps) + self.b
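As a quick sanity check (not part of the original walkthrough), the sketch below compares this class against PyTorch's built-in nn.LayerNorm. The two agree only approximately: nn.LayerNorm uses the biased variance and adds eps inside the square root, while the version above uses the unbiased std and adds eps to it.

# Illustrative sanity check: our LayerNorm vs. torch.nn.LayerNorm.
# The outputs are close but not bit-identical (biased vs. unbiased std,
# and eps placement differ), so we compare with a loose tolerance.
x_check = torch.randn(2, 4, 512)
print(torch.allclose(LayerNorm(512)(x_check),
                     nn.LayerNorm(512, eps=1e-6)(x_check),
                     atol=1e-2))  # True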
features = d_model = 512
eps = 1e-6
x = out_ff  # output of the feed-forward sublayer from the previous step
ln = LayerNorm(features, eps)
out_ln = ln(x)
print(out_ln)
print(out_ln.shape)
tensor([[[ 1.0770, -0.6208, -1.2395,  ...,  1.1024,  0.3222,  0.5800],
         [ 2.2096, -1.9746, -1.5469,  ...,  1.2817, -0.2125, -0.6497],
         [ 0.9288, -0.8752, -0.3243,  ...,  1.5745, -1.0322, -0.6040],
         [ 1.8319, -1.5223, -1.3482,  ...,  1.2740,  0.2436, -0.1593]],

        [[ 0.7412,  0.9555,  0.9595,  ...,  1.9401,  0.2870,  0.2000],
         [ 1.0884,  0.0552,  1.5058,  ...,  1.9323, -0.6559,  0.4910],
         [ 1.1891, -0.1268,  1.1255,  ...,  1.3577, -0.4844,  0.3856],
         [ 1.3812,  0.1356,  1.3354,  ...,  1.8673, -0.9188,  0.1349]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])
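In the Transformer itself, LayerNorm appears in the "Add & Norm" step: a residual connection around each sublayer. Below is a minimal sketch of that pattern, following the naming used in The Annotated Transformer and reusing the LayerNorm class defined above; the dropout rate is an illustrative default.

class SublayerConnection(nn.Module):
    "Residual connection around a sublayer, with norm applied first: x + dropout(sublayer(norm(x)))."
    def __init__(self, size, dropout=0.1):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

# Example usage (hypothetical names):
# sublayer = SublayerConnection(d_model)
# out = sublayer(x, ff)  # ff is an nn.Module such as the feed-forward block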