Implementing a Transformer encoder in PyTorch

import torch
from torch import nn
import torch.nn.functional as F
import math

class AttentionHead(nn.Module):
    """A single attention head: projects the inputs down to head_dim and
    applies scaled dot-product attention."""

    def __init__(self, embed_dim, head_dim):
        super().__init__()
        self.q = nn.Linear(embed_dim, head_dim)
        self.k = nn.Linear(embed_dim, head_dim)
        self.v = nn.Linear(embed_dim, head_dim)
        self.dropout = nn.Dropout(0.2)

    def forward(self, query, key, value, mask=None):
        # query/key/value: [batch_size, seq_len, embed_dim]
        query, key, value = self.q(query), self.k(key), self.v(value)
        # scores: [batch_size, q_len, k_len], scaled by sqrt(head_dim)
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(query.size(-1))
        if mask is not None:
            if mask.dim() == 2:
                # [batch_size, k_len] padding mask -> broadcast over the query positions
                mask = mask.unsqueeze(1).expand(-1, scores.size(1), -1)
            assert scores.size() == mask.size()  # expects [batch_size, q_len, k_len]
            scores = scores.masked_fill(mask == 0, -float("inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return torch.bmm(weights, value)  # [batch_size, q_len, head_dim]
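
Each head computes the standard scaled dot-product attention, Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V, where d_k is head_dim; positions whose mask entry is 0 get a score of -inf and therefore zero weight after the softmax.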

class SequentialMultiHeadAttention(nn.Module):
    """Multi-head attention built from a ModuleList of AttentionHead modules;
    the heads run one after another and their outputs are concatenated."""

    def __init__(self, hidden_size, num_heads):
        super().__init__()
        head_dim = hidden_size // num_heads
        self.heads = nn.ModuleList(
            [AttentionHead(hidden_size, head_dim) for _ in range(num_heads)]
        )
        self.output_linear = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.2)

    def forward(self, query, key, value, attn_mask=None, query_mask=None, key_mask=None):
        # When separate padding masks are given, combine them into a
        # [batch_size, q_len, k_len] attention mask via an outer product.
        if query_mask is not None and key_mask is not None:
            attn_mask = torch.bmm(query_mask.unsqueeze(-1), key_mask.unsqueeze(1))
        # Run every head and concatenate the results back to hidden_size.
        x = torch.cat([h(query, key, value, attn_mask) for h in self.heads], dim=-1)
        x = self.dropout(self.output_linear(x))
        return x
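
A minimal usage sketch for the two classes above; the hidden size, number of heads, and tensor shapes below are illustrative assumptions rather than values from the original post:

batch_size, seq_len, hidden_size, num_heads = 2, 10, 768, 12
x = torch.randn(batch_size, seq_len, hidden_size)
query_pad = torch.ones(batch_size, seq_len)   # 1 = real token, 0 = padding
key_pad = torch.ones(batch_size, seq_len)
key_pad[:, -2:] = 0                           # pretend the last two key positions are padding
mha = SequentialMultiHeadAttention(hidden_size, num_heads)
out = mha(x, x, x, query_mask=query_pad, key_mask=key_pad)   # self-attention
print(out.shape)                              # torch.Size([2, 10, 768])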



class ScaledDotProductAttention(nn.Module):
    # Note: the original code is cut off here; the body below is a standard
    # scaled dot-product attention completion (the dropout rate of 0.2 is an
    # assumption chosen to match the classes above).
    def __init__(self, dropout=0.2):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

    def forward(self, query, key, value, mask=None):
        # query/key/value: [batch_size, seq_len, dim], already projected
        scores = torch.bmm(query, key.transpose(1, 2)) / math.sqrt(query.size(-1))
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -float("inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))
        return torch.bmm(weights, value)
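
A quick shape check for the module above (the sizes are illustrative assumptions):

sdpa = ScaledDotProductAttention()
q = k = v = torch.randn(2, 10, 64)
print(sdpa(q, k, v).shape)   # torch.Size([2, 10, 64])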