Code Implementation of MultiHeadAttention

Reposted from the article MultiHeadAttention实现详解 (a detailed walkthrough of the MultiHeadAttention implementation).

import torch
import torch.nn as nn
import torch.nn.functional as F
import math, copy

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn
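
# Shape sanity check (an illustrative sketch, not in the original post): attention()
# expects query/key/value already split into heads, i.e. (batch_size, h, seq_length, d_k),
# and returns the weighted values plus attention weights of shape (batch_size, h, seq_length, seq_length).
_q = torch.randn(2, 8, 10, 64)
_out, _p = attention(_q, _q, _q)
print ('attention() output: ' + str(_out.size()) + ', weights: ' + str(_p.size()))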

class MultiHeadAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4) # 4 linear layers: the Q, K, V projections plus the final output projection
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)
        
    def forward(self, query, key, value, mask=None):
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        batch_size = query.size(0)
        
        print ('Before transform query: ' + str(query.size())) # (batch_size, seq_length, d_model)        

        # 1) Do all the linear projections in batch: d_model => h x d_k.
        query, key, value = [l(x) for l, x in zip(self.linears, (query, key, value))] # (batch_size, seq_length, d_model); uses the first 3 of self.linears
        query, key, value = [x.view(batch_size, -1, self.h, self.d_k).transpose(1, 2)
                             for x in (query, key, value)] # (batch_size, h, seq_length, d_k)
                
        print ('After transform query: ' + str(query.size()))
        
        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        
        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.h * self.d_k)
        return self.linears[-1](x)

    
h = 8
d_model = 512
batch_size = 1
seq_length = 10
model = MultiHeadAttention(h, d_model)

query = torch.randn([batch_size, seq_length, d_model])
key = query
value = query

print ('Input size: ' + str(query.size()))

m = model(query, key, value)

print ('Output size: ' + str(m.size()))
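
# A hedged usage sketch (this masked call is not in the original post): build a
# causal mask of shape (batch_size, seq_length, seq_length). Inside forward() it
# is unsqueezed to (batch_size, 1, seq_length, seq_length) so the same mask
# broadcasts across all h heads; positions where mask == 0 are blocked.
causal_mask = torch.tril(torch.ones(batch_size, seq_length, seq_length))
m_masked = model(query, key, value, mask=causal_mask)
print ('Masked output size: ' + str(m_masked.size())) # (batch_size, seq_length, d_model)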

You can also use PyTorch's built-in nn.MultiheadAttention class; see the official documentation for details.
Its basic usage is as follows:

multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
attn_output, attn_output_weights = multihead_attn(query, key, value)
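
A slightly fuller sketch of the same call (the embed_dim/num_heads values and the batch_first flag below are assumptions, not part of the original snippet). Note that nn.MultiheadAttention expects inputs of shape (seq_length, batch_size, embed_dim) by default; pass batch_first=True (available in recent PyTorch versions) to work with (batch_size, seq_length, embed_dim) tensors like the custom module above:

```python
import torch
import torch.nn as nn

embed_dim, num_heads = 512, 8
multihead_attn = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

x = torch.randn(1, 10, embed_dim)  # (batch_size, seq_length, embed_dim)
attn_output, attn_output_weights = multihead_attn(x, x, x)

print(attn_output.size())          # torch.Size([1, 10, 512])
print(attn_output_weights.size())  # torch.Size([1, 10, 10]); weights are averaged over heads by default
```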

Below is another PyTorch implementation of a Multi-Head Attention module (a simplified version of the one above, without dropout):

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, d_model):
        super(MultiHeadAttention, self).__init__()
        self.n_heads = n_heads
        self.d_model = d_model
        # Linear layers for query, key and value projections
        self.wq = nn.Linear(d_model, d_model)
        self.wk = nn.Linear(d_model, d_model)
        self.wv = nn.Linear(d_model, d_model)
        # Linear layer for final output
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)

        # Project query, key and value using linear layers
        Q = self.wq(query).view(batch_size, -1, self.n_heads, self.d_model // self.n_heads).transpose(1, 2)  # (batch_size, n_heads, seq_len, d_model // n_heads)
        K = self.wk(key).view(batch_size, -1, self.n_heads, self.d_model // self.n_heads).transpose(1, 2)    # (batch_size, n_heads, seq_len, d_model // n_heads)
        V = self.wv(value).view(batch_size, -1, self.n_heads, self.d_model // self.n_heads).transpose(1, 2)  # (batch_size, n_heads, seq_len, d_model // n_heads)

        # Compute scaled dot product attention for each head
        scores = torch.matmul(Q, K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.d_model // self.n_heads, dtype=torch.float32))  # (batch_size, n_heads, seq_len, seq_len)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        attention = F.softmax(scores, dim=-1)
        output = torch.matmul(attention, V)  # (batch_size, n_heads, seq_len, d_model // n_heads)

        # Concatenate heads and apply final linear layer
        output = output.transpose(1, 2).contiguous().view(batch_size, -1, self.n_heads * (self.d_model // self.n_heads))  # (batch_size, seq_len, d_model)
        output = self.fc(output)
        return output
```

The module takes three inputs, query, key, and value, all tensors of shape (batch_size, seq_len, d_model). It first projects query, key, and value into d_model-dimensional space with three linear layers and reshapes the results to (batch_size, n_heads, seq_len, d_model // n_heads). It then computes scaled dot-product attention for each head and multiplies the attention weights with the values to obtain each head's output. Finally, the outputs of all heads are concatenated and passed through a final linear layer to produce the result.

In the forward method, we first compute the query, key, and value projections. We then compute the attention weights for each head, multiply them with the values, and concatenate the per-head outputs. Finally, a linear layer projects the concatenated output back into d_model-dimensional space. If a mask is provided, it is applied to the attention scores before the softmax.
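
For completeness, a short usage sketch for this second module (the shapes and argument values here are illustrative assumptions, mirroring the earlier driver code):

```python
# Hypothetical usage; assumes the imports and MultiHeadAttention class defined above are in scope.
mha = MultiHeadAttention(n_heads=8, d_model=512)
x = torch.randn(2, 10, 512)   # (batch_size, seq_len, d_model)
y = mha(x, x, x)              # self-attention: query = key = value
print(y.size())               # torch.Size([2, 10, 512])
```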
