Continuing our walkthrough of the T5 source code, let's return to the overall structure of the T5 model.
(0): T5LayerSelfAttention(
  (SelfAttention): T5Attention(
    (q): Linear(in_features=512, out_features=512, bias=False)
    (k): Linear(in_features=512, out_features=512, bias=False)
    (v): Linear(in_features=512, out_features=512, bias=False)
    (o): Linear(in_features=512, out_features=512, bias=False)
    (relative_attention_bias): Embedding(32, 8)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
(1): T5LayerFF(
  (DenseReluDense): T5DenseReluDense(
    (wi): Linear(in_features=512, out_features=2048, bias=False)
    (wo): Linear(in_features=2048, out_features=512, bias=False)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (layer_norm): T5LayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)
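This printout can be reproduced by loading a pretrained checkpoint and printing the layers of the first encoder block. A minimal sketch, assuming the HuggingFace transformers library and the t5-small checkpoint (whose d_model=512, d_ff=2048 and 8 attention heads match the shapes above):

from transformers import T5Model

# Sketch: load the small T5 checkpoint and print the layers inside the first
# encoder block; this reproduces the T5LayerSelfAttention / T5LayerFF structure above.
model = T5Model.from_pretrained("t5-small")
print(model.encoder.block[0].layer)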
Next, let's take a closer look at the structure of the T5DenseReluDense module.
import torch
from torch import nn

class T5DenseReluDense(nn.Module):
    def __init__(self, config):
        super().__init__()
        # Up-projection d_model -> d_ff, ReLU, dropout, down-projection d_ff -> d_model.
        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)

    def forward(self, hidden_states):
        hidden_states = self.wi(hidden_states)
        hidden_states = nn.functional.relu(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states
The relevant configuration parameters here are:
config.d_model = 512
config.d_ff = 2048
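To see this feed-forward sub-layer in action, here is a small sketch that instantiates T5DenseReluDense with these dimensions and pushes a dummy batch through it. The SimpleNamespace config is only a stand-in for the real T5Config, exposing just the fields the class reads:

import torch
from types import SimpleNamespace

# Hypothetical minimal config: only the attributes T5DenseReluDense actually uses.
config = SimpleNamespace(d_model=512, d_ff=2048, dropout_rate=0.1)

ffn = T5DenseReluDense(config)            # the class defined above
x = torch.randn(2, 10, config.d_model)    # (batch, seq_len, d_model)
y = ffn(x)                                # 512 -> 2048 -> ReLU -> dropout -> 512
print(y.shape)                            # torch.Size([2, 10, 512])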
We also need to pay attention to how the T5LayerNorm layer is implemented.
Here is the full T5LayerNorm source code:
class T5LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        # layer norm should always be calculated in float32
        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)

        # convert into float16 if necessary
        if self.weight.dtype == torch.float16:
            hidden_states = hidden_states.to(torch.float16)

        return self.weight * hidden_states
The standard layer normalization formula is

$y = \frac{x-\mu}{\sqrt{\mathrm{Var}(x)+\epsilon}}$

where $\mu$ is the mean of $x$, $\mathrm{Var}(x)$ is its variance, and $\epsilon$ is a small constant added for numerical stability.
The layer normalization used in T5 is instead computed as

$y = \mathrm{weight} \cdot \frac{x}{\sqrt{\mathrm{Var}(x)+\epsilon}}$

i.e. the mean is not subtracted and there is no bias term; as the forward pass above shows, the "variance" here is simply the mean of $x^2$ (an RMS statistic).
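To confirm that T5LayerNorm really computes this RMS-style normalization, here is a quick numerical check against a manual computation. This is just a sketch that assumes the T5LayerNorm class copied above:

import torch

ln = T5LayerNorm(hidden_size=512)
x = torch.randn(2, 10, 512)

# Manual RMS normalization: divide by sqrt(mean(x^2) + eps), then apply the learned scale.
mean_square = x.pow(2).mean(-1, keepdim=True)
manual = ln.weight * (x * torch.rsqrt(mean_square + ln.variance_epsilon))

print(torch.allclose(ln(x), manual))      # True: matches the module's forward pass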