1. Embedding
# Embedding is applied first, followed by Positional Encoding.
import math
import copy

import torch
import torch.nn as nn
import torch.nn.functional as F

class Embedding(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embedding, self).__init__()
        # vocab: size of the vocabulary to embed
        # d_model: hidden size (embedding dimension)
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model
    def forward(self, x):
        # Scale the embeddings by sqrt(d_model), as in the paper
        return self.lut(x) * math.sqrt(self.d_model)
2. PositionalEncoding
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # Start from an all-zero matrix of shape (max_len, d_model)
        pe = torch.zeros(max_len, d_model)
        # Position indices; unsqueeze(1) so they broadcast against pe
        # shape: (max_len,)
        position = torch.arange(0, max_len)
        # shape: (max_len, 1)
        position = position.unsqueeze(1)
        # Frequency term from the paper: 1 / 10000^(2i / d_model)
        cs_data = torch.exp(torch.arange(0, d_model, 2) *
                            -(math.log(10000.0) / d_model))
        # Fill along the d_model dimension: even columns get sin, odd columns get cos
        pe[:, 0::2] = torch.sin(position * cs_data)
        pe[:, 1::2] = torch.cos(position * cs_data)
        # Leave room for the batch dimension
        pe = pe.unsqueeze(0)
        # register_buffer stores pe with the module without making it a trainable parameter
        self.register_buffer('pe', pe)
    def forward(self, x):
        # Add the same (fixed) positional encoding to the input, then apply dropout
        x = x + self.pe[:, :x.size(1)].requires_grad_(False)
        x = self.dropout(x)
        return x
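A quick usage sketch (the sizes d_model = 512, vocab = 1000 and the batch shape are illustrative, not from the post): the token embedding and the positional encoding are usually chained together.
src_embed = nn.Sequential(Embedding(512, 1000), PositionalEncoding(512, 0.1))
tokens = torch.randint(0, 1000, (30, 10))    # (batch_size, seq_len) of token ids
out = src_embed(tokens)                      # (30, 10, 512)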
3. Attention & MultiHeadAttention
# Attention implements a single head; Multi-Head Attention runs it for all heads at once.
def Attention(Q, K, V, mask=None, dropout=None):
    # Q comes from the target sequence, e.g. (30, 8, 10, 64);
    # K and V come from the source sequence, e.g. (30, 8, 11, 64)
    # 30      batch_size
    # 8       head_nums
    # 10 / 11 sequence length
    # 64      hidden size per head
    # The last (-1) dimension of Q/K/V is the per-head hidden size
    d_k = Q.size(-1)
    # transpose swaps the last two dimensions (the ones holding the actual data),
    # so the matmul contracts over the hidden dimension.
    # Dividing by sqrt(d_k) = 8 keeps the dot products from growing too large.
    # (30, 8, 10, 64) * (30, 8, 64, 11) --> (30, 8, 10, 11)
    score = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        score = score.masked_fill(mask == 0, -1e9)
    # softmax over the last dimension of score; the shape is unchanged
    # (softmax outputs a probability distribution; argmax would output the index of the max)
    p_attn = F.softmax(score, dim=-1)
    # dropout is the nn.Dropout(p) module passed in from MultiHeadAttention, so it is applied directly
    if dropout is not None:
        p_attn = dropout(p_attn)
    # The first return value is the attention output, the second the attention weights
    return torch.matmul(p_attn, V), p_attn
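The clones helper used throughout the rest of the code is never defined in the post; a minimal sketch consistent with how it is called (N deep copies of a module, collected in an nn.ModuleList):
def clones(module, N):
    # N independent deep copies of the same module, registered as submodules
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])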
class MultiHeadAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadAttention, self).__init__()
        # per-head hidden size
        self.d_k = d_model // h
        # Four linear layers: the first three project Q, K and V,
        # the last one is applied after attention
        self.linear = clones(nn.Linear(d_model, d_model), 4)
        self.h = h
        self.attn = None
        self.drop = nn.Dropout(dropout)
    def forward(self, Q, K, V, mask=None):
        # Q is (30, 10, 512)    -> (batch, len, hidden)
        # K, V are (30, 11, 512) -> (batch, len, hidden)
        if mask is not None:
            # Q/K/V gain a head dimension below, so the mask needs one as well
            mask = mask.unsqueeze(1)
        # batch size
        n_batches = Q.size(0)
        # Project Q/K/V and reshape to (batch, len, head, d_k), then transpose to the
        # layout Attention expects, (batch, head, len, d_k):
        # Q (30, 10, 512) --view--> (30, 10, 8, 64) --transpose--> (30, 8, 10, 64)
        # This splits the computation into h single heads; although it is multi-head,
        # the time/space complexity stays close to that of a single head of size d_model.
        Q, K, V = [l(x).view(n_batches, -1, self.h, self.d_k).transpose(1, 2)
                   for l, x in zip(self.linear, (Q, K, V))]
        # l --> linear layer, x --> Q/K/V
        # Run Attention to get the output and the attention weights
        x, self.attn = Attention(Q, K, V, mask=mask, dropout=self.drop)
        # Restore the input layout: undo the transpose, then merge the heads back together
        x = x.transpose(1, 2).contiguous().view(n_batches, -1, self.h * self.d_k)
        del Q
        del K
        del V
        # The fourth linear layer is applied to the attention output
        return self.linear[-1](x)
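A usage sketch matching the shapes in the comments above (8 heads, d_model = 512; all concrete sizes here are illustrative):
mha = MultiHeadAttention(h=8, d_model=512, dropout=0.1)
q = torch.randn(30, 10, 512)             # target sequence (batch, len, hidden)
kv = torch.randn(30, 11, 512)            # source sequence
src_mask = torch.ones(30, 1, 11)         # 1 = may attend, 0 = masked out
out = mha(q, kv, kv, mask=src_mask)      # (30, 10, 512)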
4. PositionWiseFeedForward
class PositionWiseFeedForward(nn.Module):
    def __init__(self, input, output, dropout):
        super(PositionWiseFeedForward, self).__init__()
        # input: d_model, output: inner feed-forward size (d_ff)
        self.input = input
        self.output = output
        self.w_1 = nn.Linear(input, output)
        self.w_2 = nn.Linear(output, input)
        self.drop = nn.Dropout(dropout)
    def forward(self, x):
        # Linear -> ReLU -> Dropout -> Linear, applied to each position independently
        return self.w_2(self.drop(F.relu(self.w_1(x))))
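In the paper the inner size is four times the model size (d_ff = 2048 for d_model = 512); a typical instantiation with those values:
ffn = PositionWiseFeedForward(512, 2048, dropout=0.1)
out = ffn(torch.randn(30, 10, 512))      # shape is preserved: (30, 10, 512)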
5. LayerNorm & Residual & Dropout
class LayerNorm(nn.Module):
    # Note the difference from BatchNorm: LayerNorm normalizes each position over
    # the feature dimension, independently of the batch.
    def __init__(self, size, eps=1e-6):
        super(LayerNorm, self).__init__()
        # learnable scale (initialized to 1) and bias (initialized to 0)
        self.a_2 = nn.Parameter(torch.ones(size))
        self.b_2 = nn.Parameter(torch.zeros(size))
        self.size = size
        self.eps = eps
    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
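A small shape check (sizes are illustrative). Note that x.std() uses the unbiased estimator, so the result differs slightly from nn.LayerNorm, which uses the biased variance:
ln = LayerNorm(512)
x = torch.randn(30, 10, 512)
out = ln(x)                              # (30, 10, 512), normalized over the last dimension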
6. SublayerConnection
# Wraps each of the two Encoder sub-layers: Multi-Head Attention and PositionWiseFeedForward
class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = LayerNorm(size)
        self.drop = nn.Dropout(dropout)
    # x is the input; sublayer is the Multi-Head Attention or the position-wise feed-forward network
    def forward(self, x, sublayer):
        # Residual connection around the sub-layer. The norm is applied before the
        # sub-layer (pre-norm), a simplification of the paper's LayerNorm(x + Sublayer(x)).
        return x + self.drop(sublayer(self.norm(x)))
7. Encoder & EncoderLayer
EncoderLayer is a single Encoder layer: it creates two SublayerConnections and, in forward, uses them to run the attention and feed-forward computations. Encoder then stacks N such layers.
# Build one Encoder layer
class EncoderLayer(nn.Module):
    # Arguments:
    # size         --> hidden size (d_model)
    # self_attn    --> a MultiHeadAttention module
    # feed_forward --> a PositionWiseFeedForward module
    def __init__(self, size, self_attn, feed_forward, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = self_attn
        self.feed_forward = feed_forward
        self.drop_out = dropout
        # This only runs SublayerConnection.__init__, i.e. sets up the Dropout and LayerNorm
        self.sublayer = clones(SublayerConnection(size, dropout), 2)
        self.size = size
    def forward(self, x, mask):
        # The two SublayerConnections built above wrap the MultiHeadAttention
        # and the PositionWiseFeedForward respectively.
        # The lambda takes x and returns self_attn(x, x, x, mask).
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, mask))
        return self.sublayer[1](x, self.feed_forward)
# Stack the Encoder layers
class Encoder(nn.Module):
    # layer is an EncoderLayer; the Encoder stacks N copies of it
    def __init__(self, layer, N):
        super(Encoder, self).__init__()
        self.layer = clones(layer, N)
        self.norm = LayerNorm(layer.size)
    def forward(self, x, mask):
        for layer in self.layer:
            x = layer(x, mask)
        return self.norm(x)
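A sketch of assembling the encoder stack from the pieces above (the sizes and the use of copy.deepcopy are choices made here; deepcopy lets one attention/FFN template be reused without weight sharing):
c = copy.deepcopy
attn = MultiHeadAttention(h=8, d_model=512)
ff = PositionWiseFeedForward(512, 2048, dropout=0.1)
encoder = Encoder(EncoderLayer(512, c(attn), c(ff), dropout=0.1), N=6)
x = torch.randn(30, 10, 512)             # already embedded + positionally encoded
src_mask = torch.ones(30, 1, 10)
out = encoder(x, src_mask)               # (30, 10, 512)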
8. Decoder & DecoderLayer
Same pattern as the Encoder, with a third sub-layer for attention over the Encoder output.
class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.self_attn = self_attn       # masked self-attention over the target sequence
        self.src_attn = src_attn         # attention over the Encoder output
        self.feed_forward = feed_forward
        self.size = size
        self.sublayer = clones(SublayerConnection(size, dropout), 3)
    def forward(self, x, memory, src_mask, tgt_mask):
        # memory is the Encoder output
        m = memory
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))
        x = self.sublayer[1](x, lambda x: self.src_attn(x, m, m, src_mask))
        x = self.sublayer[2](x, self.feed_forward)
        return x
class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layer = clones(layer, N)
        self.norm = LayerNorm(layer.size)
    def forward(self, x, memory, src_mask, tgt_mask):
        for layer in self.layer:
            x = layer(x, memory, src_mask, tgt_mask)
        return self.norm(x)
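tgt_mask has to stop each target position from attending to later positions; the post never shows how it is built, so here is one common sketch (the helper name subsequent_mask is an assumption):
def subsequent_mask(size):
    # 1s strictly above the diagonal mark future positions; comparing with 0
    # turns this into a mask where True means "may attend"
    future = torch.triu(torch.ones(1, size, size), diagonal=1)
    return future == 0

tgt_mask = subsequent_mask(10)           # (1, 10, 10), lower-triangular True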
9. EncoderDecoder
This part assembles the complete model: it connects the Encoder and the Decoder into one module.
class EncoderDecoder(nn.Module):
    def __init__(self, encoder, decoder, src_emb, tgt_emb, generator):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.src_emb = src_emb
        self.tgt_emb = tgt_emb
        self.generator = generator
    def forward(self, src, tgt, src_mask, tgt_mask):
        return self.decode(self.encode(src, src_mask), src_mask, tgt, tgt_mask)
    # Run the Encoder side
    def encode(self, src, src_mask):
        return self.encoder(self.src_emb(src), src_mask)
    # Run the Decoder side
    def decode(self, memory, src_mask, tgt, tgt_mask):
        return self.decoder(self.tgt_emb(tgt), memory, src_mask, tgt_mask)
10. Generator
The final output head: a Linear layer followed by a Softmax.
# Last module: produces the output distribution
class Generator(nn.Module):
    def __init__(self, d_model, vocab):
        # d_model: the model's hidden size
        # vocab: size of the target vocabulary
        super(Generator, self).__init__()
        self.linear = nn.Linear(d_model, vocab)
    def forward(self, x):
        # Linear projection first, then softmax over the last dimension
        # (F.log_softmax is commonly used here instead when training with NLLLoss)
        return F.softmax(self.linear(x), dim=-1)
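Finally, a sketch of wiring all ten pieces into one model (the function name make_model, the hyperparameter values, and the Xavier initialization are choices made here, not taken from the post):
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    c = copy.deepcopy
    attn = MultiHeadAttention(h, d_model, dropout)
    ff = PositionWiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embedding(d_model, src_vocab), c(position)),
        nn.Sequential(Embedding(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    # Xavier-uniform initialization for all weight matrices (a common choice)
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model

model = make_model(src_vocab=1000, tgt_vocab=1000, N=2)
src = torch.randint(1, 1000, (2, 10))
tgt = torch.randint(1, 1000, (2, 9))
out = model(src, tgt, src_mask=torch.ones(2, 1, 10), tgt_mask=subsequent_mask(9))
probs = model.generator(out)             # (2, 9, 1000) distribution over the target vocab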