Text Similarity (PyTorch): Bi-LSTM + Attention

I have recently been studying text similarity and put together an experiment along the way; it took quite a bit of trial and error.

The network architecture is Bi-LSTM (hidden size = 128) + Attention + a fully connected layer.

Import the required packages
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from word2vec_util import Vocab_size,Embedding_matrix,get_dataloader
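word2vec_util is the author's own helper module and is not included in the post. As a rough sketch of the interface the rest of the code assumes (a vocabulary size, a pre-trained embedding matrix of shape [Vocab_size, 300], and a DataLoader yielding (left, right, label) batches), something like the following would do; the padding length, random vectors, and dummy data here are placeholders, not the author's actual preprocessing:

# Hypothetical sketch of word2vec_util -- NOT the author's file, just the assumed interface.
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader

MAX_LEN = 12            # sentences padded/truncated to 12 tokens (matches max_length in the log below)
EMBEDDING_DIM = 300

vocab = {"<pad>": 0, "<unk>": 1}   # in the real module this is built from the corpus
Vocab_size = len(vocab)
# in the real module these rows come from pre-trained word2vec vectors
Embedding_matrix = np.random.uniform(-0.05, 0.05, size=(Vocab_size, EMBEDDING_DIM))

def get_dataloader(batch_size=500):
    # dummy index tensors; the real version encodes the left/right sentence pairs
    left = torch.randint(0, Vocab_size, (1000, MAX_LEN))
    right = torch.randint(0, Vocab_size, (1000, MAX_LEN))
    labels = torch.randint(0, 2, (1000,))
    dataset = TensorDataset(left, right, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)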
Define the network
class test_model(nn.Module):
    def __init__(self):
        super(test_model, self).__init__()
        self.Vocab_size = Vocab_size # vocabulary size
        self.batch_size = 500 # batch size
        self.input_size = 300 # same as the embedding size (not used elsewhere)
        self.n_hidden1 = 128 # hidden size of the Bi-LSTM
        self.Embedding_dim = 300 # embedding dimension; pre-trained word2vec vectors are used here
        self.n_class = 2 # two classes: similar / not similar
        self.dropout = nn.Dropout(0.5) # dropout set to 0.5 (not sure how much it ends up helping)
        self.Embedding_matrix = Embedding_matrix # embedding matrix, size = [Vocab_size, embedding_size]; you can also train your own word vectors
        self.word_embeds = nn.Embedding(self.Vocab_size, self.Embedding_dim) # embedding layer
        pretrained_weight = np.array(self.Embedding_matrix) # convert to a numpy array
        self.word_embeds.weight.data.copy_(torch.from_numpy(pretrained_weight)) # load the pre-trained vectors into the embedding layer
        self.Bi_Lstm1 = nn.LSTM(self.Embedding_dim, hidden_size=self.n_hidden1, bidirectional=True) # Bi-LSTM, hidden_size = 128

        self.fc = nn.Linear(self.n_hidden1*2, self.n_class, bias=False) # the attention output has size 256, so the FC layer is (256, 2)
        self.b = nn.Parameter(torch.rand([self.n_class])) # bias b
        
    def attention_weight1(self, outputs1, final_state1): # step through this in a debugger to watch how the shapes change
        outputs1 = outputs1.permute(1, 0, 2) # [seq_len, batch, 256] -> [batch, seq_len, 256]
        # concatenate the forward and backward final hidden states per example: [2, batch, 128] -> [batch, 256, 1]
        hidden = final_state1.permute(1, 0, 2).reshape(-1, self.n_hidden1*2, 1)
        attention_weights = torch.bmm(outputs1, hidden).squeeze(2) # torch.bmm multiplies batches of matrices: x of shape [a,b,c] times y of shape [a,c,d] gives z of shape [a,b,d], where a,b,c,d are sizes
        soft_attention_weights1 = F.softmax(attention_weights, 1)
        context1 = torch.bmm(outputs1.transpose(1,2), soft_attention_weights1.unsqueeze(2)).squeeze(2)
        return context1, soft_attention_weights1

    def forward(self,train_left,train_right):
        train_left = self.word_embeds(train_left).to(device) # train_left holds word indices; the lookup yields a [batch_size, 12, 300] tensor
        train_right = self.word_embeds(train_right).to(device) # same for the right sentence

        train_left = train_left.transpose(0,1) # swap dimensions to [seq_len, batch_size, embedding_dim]
        train_right = train_right.transpose(0,1)

        batch_size = train_left.size(1) # take the batch size from the input instead of hard-coding 500
        hidden_state1 = torch.rand(2, batch_size, self.n_hidden1).to(device) # initial hidden state (random init; zeros are also common)
        cell_state1 = torch.rand(2, batch_size, self.n_hidden1).to(device) # initial cell state

        outputs1_L, (final_state1_L, _) = self.Bi_Lstm1(train_left, (hidden_state1, cell_state1)) # the same LSTM weights are shared between the left and right sentences
        outputs1_L = self.dropout(outputs1_L) # left-side outputs
        attn_outputs1_L, attention1_L = self.attention_weight1(outputs1_L, final_state1_L) # the attention step is shared as well

        outputs1_R,(final_state1_R,_) =self.Bi_Lstm1(train_right,(hidden_state1,cell_state1))
        outputs1_R = self.dropout(outputs1_R)
        attn_outputs1_R, attention1_R = self.attention_weight1(outputs1_R, final_state1_R)

        outputs1 = attn_outputs1_L # left representation after attention
        outputs2 = attn_outputs1_R # right representation after attention

        output = torch.abs(outputs1 - outputs2) # element-wise absolute difference (Manhattan-style distance); other distance measures could be used
        output = self.fc(output) + self.b # fully connected layer producing the two-class scores
        output = F.softmax(output, dim=1) # softmax normalization (note: nn.CrossEntropyLoss applies log-softmax itself, so raw scores would also work)
        return output
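To make the shape bookkeeping inside attention_weight1 easier to follow, here is a small standalone walk-through with toy sizes (seq_len = 12, batch = 4, hidden = 128). It repeats the same permute/bmm/softmax steps outside the class; the tensors are random and purely illustrative:

# Toy shape check of the attention step (illustrative only, not part of the model).
import torch
import torch.nn.functional as F

seq_len, batch, hidden = 12, 4, 128
outputs = torch.rand(seq_len, batch, hidden * 2)   # Bi-LSTM outputs: [seq_len, batch, 256]
h_n = torch.rand(2, batch, hidden)                 # final hidden state: [num_directions, batch, 128]

outputs = outputs.permute(1, 0, 2)                           # [batch, seq_len, 256]
query = h_n.permute(1, 0, 2).reshape(batch, hidden * 2, 1)   # [batch, 256, 1]

scores = torch.bmm(outputs, query).squeeze(2)                # [batch, seq_len]
weights = F.softmax(scores, dim=1)                           # attention weights over time steps
context = torch.bmm(outputs.transpose(1, 2),
                    weights.unsqueeze(2)).squeeze(2)         # [batch, 256] sentence representation
print(scores.shape, weights.shape, context.shape)
# torch.Size([4, 12]) torch.Size([4, 12]) torch.Size([4, 256])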
Training function
def train(model, device, train_loader, optimizer,criterion,epoch):
    print('Training on {} samples...'.format(len(train_loader.dataset)))
    model.train()
    train_loss = 0
    num_correct = 0
    for batch_idx, (train_left, train_right, labels) in enumerate(train_loader): # the loader yields tuples, so unpack with parentheses
        train_left = train_left.to(device)
        train_right = train_right.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        output = model(train_left, train_right)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()
        train_loss += float(loss.item())
        true = labels.data.cpu()
        predict = torch.max(output, dim=1)[1].cpu()
        num_correct += torch.eq(predict, true).sum().float().item()
    train_acc = num_correct / len(train_loader.dataset)
    train_loss = train_loss/len(train_loader)
    msg = 'Epoch: {0:>5},  Train Loss: {1:>5.5},  Train Acc: {2:>6.4%}'
    print(msg.format(epoch,train_loss,train_acc))
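The post only shows the training loop. A matching evaluation function is not in the original, but assuming a validation DataLoader with the same (left, right, labels) batch format, a minimal sketch could look like this:

# Hypothetical evaluation loop (not in the original post); same batch format as train().
def evaluate(model, device, data_loader, criterion):
    model.eval()
    total_loss, num_correct = 0.0, 0
    with torch.no_grad():
        for left, right, labels in data_loader:
            left, right, labels = left.to(device), right.to(device), labels.to(device)
            output = model(left, right)
            total_loss += criterion(output, labels).item()
            predict = torch.max(output, dim=1)[1]
            num_correct += torch.eq(predict, labels).sum().item()
    return total_loss / len(data_loader), num_correct / len(data_loader.dataset)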
Run the training
if __name__ == '__main__':
    Epochs = 60
    Learn_rate = 0.001
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    dataloader = get_dataloader()
    Bi_LstmModel= test_model().to(device)
    criterion = nn.CrossEntropyLoss().to(device)
    #criterion = ContrastiveLoss().to(device)
    optimizer = optim.Adam(Bi_LstmModel.parameters(),lr=Learn_rate)
    for epoch in range(Epochs):
        train(Bi_LstmModel, device, dataloader, optimizer, criterion, epoch+1)
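Once training finishes, scoring a new sentence pair just means passing two index tensors through the model and reading the probability of class 1 ("similar"). A minimal sketch, assuming the pair has already been converted to padded word-index tensors by the same preprocessing that built the training data:

# Hypothetical inference helper (not in the original post).
def predict_pair(model, left_ids, right_ids):
    # left_ids / right_ids: LongTensor of shape [batch, 12] holding word indices
    model.eval()
    with torch.no_grad():
        probs = model(left_ids.to(device), right_ids.to(device))  # [batch, 2], already softmax-normalized
    return probs[:, 1]  # probability that each pair is "similar"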
Results
length of vocabs:5956
5956
Found 5956 word vectors.
average_length: 6.520725
max_length: 12
Training on 100000 samples...
Epoch:     1,  Train Loss: 0.64095,  Train Acc: 62.9720%
Training on 100000 samples...
Epoch:     2,  Train Loss: 0.58864,  Train Acc: 70.8260%
Training on 100000 samples...
Epoch:     3,  Train Loss: 0.55469,  Train Acc: 74.9770%
Training on 100000 samples...
Epoch:     4,  Train Loss: 0.52636,  Train Acc: 78.2610%
Training on 100000 samples...
Epoch:     5,  Train Loss: 0.50418,  Train Acc: 80.7080%
Training on 100000 samples...
Epoch:     6,  Train Loss: 0.48491,  Train Acc: 82.8620%
Training on 100000 samples...
Epoch:     7,  Train Loss: 0.46984,  Train Acc: 84.5690%
Training on 100000 samples...
Epoch:     8,  Train Loss: 0.456,  Train Acc: 85.9410%
Training on 100000 samples...
Epoch:     9,  Train Loss: 0.44567,  Train Acc: 87.0060%
Training on 100000 samples...
Epoch:    10,  Train Loss: 0.43607,  Train Acc: 88.0130%
Training on 100000 samples...
Epoch:    11,  Train Loss: 0.42797,  Train Acc: 88.8480%
Training on 100000 samples...
Epoch:    12,  Train Loss: 0.42026,  Train Acc: 89.5970%
Training on 100000 samples...
Epoch:    13,  Train Loss: 0.41379,  Train Acc: 90.2630%
Training on 100000 samples...
Epoch:    14,  Train Loss: 0.40766,  Train Acc: 90.8920%
Training on 100000 samples...
Epoch:    15,  Train Loss: 0.40276,  Train Acc: 91.3650%
Training on 100000 samples...
Epoch:    16,  Train Loss: 0.39678,  Train Acc: 91.9920%
Training on 100000 samples...
Epoch:    17,  Train Loss: 0.39264,  Train Acc: 92.3740%
Training on 100000 samples...
Epoch:    18,  Train Loss: 0.38861,  Train Acc: 92.7450%
Training on 100000 samples...
Epoch:    19,  Train Loss: 0.38506,  Train Acc: 93.0790%
Training on 100000 samples...
Epoch:    20,  Train Loss: 0.3821,  Train Acc: 93.4100%
Training on 100000 samples...
Epoch:    21,  Train Loss: 0.37889,  Train Acc: 93.7120%
Training on 100000 samples...
Epoch:    22,  Train Loss: 0.37757,  Train Acc: 93.8200%
Training on 100000 samples...
Epoch:    23,  Train Loss: 0.37417,  Train Acc: 94.0930%
Training on 100000 samples...
Epoch:    24,  Train Loss: 0.3715,  Train Acc: 94.4140%
Training on 100000 samples...
Epoch:    25,  Train Loss: 0.36956,  Train Acc: 94.5810%
Training on 100000 samples...
Epoch:    26,  Train Loss: 0.36763,  Train Acc: 94.7730%
Training on 100000 samples...
Epoch:    27,  Train Loss: 0.36562,  Train Acc: 94.9750%
Training on 100000 samples...
Epoch:    28,  Train Loss: 0.36378,  Train Acc: 95.1200%
Training on 100000 samples...
Epoch:    29,  Train Loss: 0.36189,  Train Acc: 95.3260%
Training on 100000 samples...
Epoch:    30,  Train Loss: 0.36093,  Train Acc: 95.4330%
Training on 100000 samples...
Epoch:    31,  Train Loss: 0.35879,  Train Acc: 95.6250%
Training on 100000 samples...
Epoch:    32,  Train Loss: 0.35819,  Train Acc: 95.6460%
Training on 100000 samples...
Epoch:    33,  Train Loss: 0.35632,  Train Acc: 95.8660%
Training on 100000 samples...
Epoch:    34,  Train Loss: 0.35541,  Train Acc: 95.9820%
Training on 100000 samples...
Epoch:    35,  Train Loss: 0.35451,  Train Acc: 96.0200%
Training on 100000 samples...
Epoch:    36,  Train Loss: 0.35287,  Train Acc: 96.1810%
Training on 100000 samples...
Epoch:    37,  Train Loss: 0.35191,  Train Acc: 96.2920%
Training on 100000 samples...
Epoch:    38,  Train Loss: 0.35084,  Train Acc: 96.4330%
Training on 100000 samples...
Epoch:    39,  Train Loss: 0.35073,  Train Acc: 96.3850%
Training on 100000 samples...
Epoch:    40,  Train Loss: 0.34976,  Train Acc: 96.5080%
Training on 100000 samples...
Epoch:    41,  Train Loss: 0.34839,  Train Acc: 96.6140%
Training on 100000 samples...
Epoch:    42,  Train Loss: 0.34836,  Train Acc: 96.6310%
Training on 100000 samples...
Epoch:    43,  Train Loss: 0.34741,  Train Acc: 96.6780%
Training on 100000 samples...
Epoch:    44,  Train Loss: 0.34707,  Train Acc: 96.6900%
Training on 100000 samples...
Epoch:    45,  Train Loss: 0.34599,  Train Acc: 96.8830%
Training on 100000 samples...
Epoch:    46,  Train Loss: 0.34547,  Train Acc: 96.9310%
Training on 100000 samples...
Epoch:    47,  Train Loss: 0.34464,  Train Acc: 96.9570%
Training on 100000 samples...
Epoch:    48,  Train Loss: 0.34371,  Train Acc: 97.0530%
Training on 100000 samples...
Epoch:    49,  Train Loss: 0.3439,  Train Acc: 97.0470%
Training on 100000 samples...
Epoch:    50,  Train Loss: 0.34344,  Train Acc: 97.0800%
Training on 100000 samples...
Epoch:    51,  Train Loss: 0.34239,  Train Acc: 97.1990%
Training on 100000 samples...
Epoch:    52,  Train Loss: 0.34175,  Train Acc: 97.2850%
Training on 100000 samples...
Epoch:    53,  Train Loss: 0.34119,  Train Acc: 97.3100%
Training on 100000 samples...
Epoch:    54,  Train Loss: 0.34126,  Train Acc: 97.2960%
Training on 100000 samples...
Epoch:    55,  Train Loss: 0.34068,  Train Acc: 97.3470%
Training on 100000 samples...
Epoch:    56,  Train Loss: 0.34012,  Train Acc: 97.3980%
Training on 100000 samples...
Epoch:    57,  Train Loss: 0.33972,  Train Acc: 97.4420%
Training on 100000 samples...
Epoch:    58,  Train Loss: 0.3399,  Train Acc: 97.4150%
Training on 100000 samples...
Epoch:    59,  Train Loss: 0.33941,  Train Acc: 97.4970%
Training on 100000 samples...
Epoch:    60,  Train Loss: 0.33927,  Train Acc: 97.4890%