卷积神经网络利用卷积核的方式来共享参数,使得参数量大大降低的同时还可以利用空间信息,但是对有先后顺序有关的数据就没多大优势
当改变位置信息后还是原来的数据,不会有变化。但对有先后顺序的数据,位置很重要:比如"我喜欢你"和"你喜欢我",虽然词数相同,表达的意思却不一样,所以产生了 RNN
时间序列
是指将同一统计指标的数值按其发生的时间先后顺序排列而成的数列。时间序列分析的主要目的是根据已有的历史数据对未来进行预测。
在时间序列问题上,观察值具有时间先后的特征,历史数据可以影响未来数据的表达,因此需要网络具有记忆能力
最开始有自回归模型,将所有数据的前面数据全部拿到,但是参数过多且运算麻烦,
之后产生 N-gram 语言模型:使用固定宽度的窗口
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Yni789FS-1619622800219)(attachment:image.png)]
缺点:受限于窗口的宽度,N并不能取很大的值,会出现长期依赖缺失的问题。
一般我们可以通过"他在法国"这一上下文,推测出"他会说法语";但当两者之间的时间跨度太大时,早期信息容易丢失,这就是长期依赖问题
RNN参数
torch.nn.RNN(input_size, hidden_size, num_layers)
必选参数 input_size,指定输入序列中单个样本的尺寸大小,例如可能用一个 1000 长度的向量表示一个单词,则 input_size=1000
必选参数 hidden_size,指的是隐藏层中输出特征的大小
必选参数 num_layers,指的是纵向的隐藏层个数,一般设置为 1~10,default=1
pytorch 实现词性判别
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torchvision import transforms
# Training data: (token list, gold POS-tag list) pairs; tags are DET/NN/V.
training_data = [
("The cat ate the fish".split(), ["DET", "NN", "V", "DET", "NN"]),
("They read that book".split(), ["NN", "V", "DET", "NN"])
]
# Test data: a token list with no gold tags (for prediction only).
testing_data=[("They ate the fish".split())]
testing_data
[['They', 'ate', 'the', 'fish']]
# Build the vocabulary: assign each distinct word the next free integer id,
# in first-seen order over the training sentences.
word_to_ix = {}
for sentence, _tags in training_data:
    for token in sentence:
        word_to_ix.setdefault(token, len(word_to_ix))
print(word_to_ix)
{'The': 0, 'cat': 1, 'ate': 2, 'the': 3, 'fish': 4, 'They': 5, 'read': 6, 'that': 7, 'book': 8}
tag_to_ix = {"DET": 0, "NN": 1,'V':2} # 手工设定词性标签数据字典
构建网络
class LSTMTagger(nn.Module):
    """POS tagger: word embedding -> single-layer LSTM -> linear -> log-softmax.

    Processes one sentence at a time (batch size 1); ``self.hidden`` holds the
    recurrent (h, c) state and is re-initialized by the caller per sentence.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Fresh zero (h_0, c_0) for a single-layer LSTM with batch size 1.
        state_shape = (1, 1, self.hidden_dim)
        return torch.zeros(*state_shape), torch.zeros(*state_shape)

    def forward(self, sentence):
        # Look up embeddings for the whole token-index tensor at once.
        embedded = self.word_embeddings(sentence)
        seq_len = len(sentence)
        # nn.LSTM expects (seq_len, batch, input_size); batch dimension is 1.
        lstm_out, self.hidden = self.lstm(embedded.view(seq_len, 1, -1), self.hidden)
        # Collapse the batch dimension and project onto the tag space.
        tag_space = self.hidden2tag(lstm_out.view(seq_len, -1))
        # Per-token log-probabilities over the tags.
        return F.log_softmax(tag_space, dim=1)
def prepare_sequence(seq, to_ix):
    """Map each token of *seq* to its integer id via *to_ix*, as a LongTensor.

    Raises KeyError if a token is missing from the index dictionary.
    """
    indices = list(map(to_ix.__getitem__, seq))
    return torch.LongTensor(indices)
len(tag_to_ix)
3
EMBEDDING_DIM=10
HIDDEN_DIM=3 # equal to the number of POS tags here
# Model, negative-log-likelihood loss, and plain SGD optimizer.
model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))
loss_function = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Sanity check: one forward pass with the untrained model; scores should be
# near-uniform and predictions essentially random.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(training_data[0][0])
print(inputs)
print(tag_scores)
print(torch.max(tag_scores,1))
['The', 'cat', 'ate', 'the', 'fish']
tensor([0, 1, 2, 3, 4])
tensor([[-1.2455, -0.7574, -1.4134],
[-1.2678, -0.7575, -1.3875],
[-1.0882, -0.9860, -1.2375],
[-1.1371, -0.9226, -1.2667],
[-1.2417, -0.7801, -1.3754]], grad_fn=<LogSoftmaxBackward>)
torch.return_types.max(
values=tensor([-0.7574, -0.7575, -0.9860, -0.9226, -0.7801], grad_fn=<MaxBackward0>),
indices=tensor([1, 1, 1, 1, 1]))
# Train for 400 epochs over the tiny two-sentence corpus.
# BUG FIX: the original looped range(4) while the comment stated 400 epochs;
# 4 epochs leaves the model untrained (it predicts the same tag everywhere).
for epoch in range(400):
    for sentence, tags in training_data:
        # Clear gradients accumulated by the previous step.
        model.zero_grad()
        # Reset the recurrent (h, c) state so sentences are independent.
        model.hidden = model.init_hidden()
        # Encode tokens and gold tags as index tensors.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = prepare_sequence(tags, tag_to_ix)
        # Forward pass: per-token log-probabilities over tags.
        tag_scores = model(sentence_in)
        # NLL loss on log-probabilities, then backprop and SGD update.
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()
# Inspect predictions on the first training sentence after training.
inputs = prepare_sequence(training_data[0][0], word_to_ix)
tag_scores = model(inputs)
print(training_data[0][0])
print(tag_scores)
print(torch.max(tag_scores,1))
['The', 'cat', 'ate', 'the', 'fish']
tensor([[-1.2365, -0.7653, -1.4090],
[-1.2602, -0.7555, -1.3997],
[-1.0703, -0.9892, -1.2545],
[-1.1131, -0.9252, -1.2910],
[-1.2313, -0.7583, -1.4287]], grad_fn=<LogSoftmaxBackward>)
torch.return_types.max(
values=tensor([-0.7653, -0.7555, -0.9892, -0.9252, -0.7583], grad_fn=<MaxBackward0>),
indices=tensor([1, 1, 1, 1, 1]))
# Tag the unseen test sentence with the trained model; argmax over dim=1
# gives the predicted tag index per token.
test_inputs = prepare_sequence(testing_data[0], word_to_ix)
tag_scores01 = model(test_inputs)
print(testing_data[0])
print(test_inputs)
print(tag_scores01)
print(torch.max(tag_scores01,1))
['They', 'ate', 'the', 'fish']
tensor([5, 2, 3, 4])
tensor([[-1.3469, -0.5845, -1.7006],
[-1.0754, -0.9459, -1.3075],
[-1.1031, -0.9052, -1.3329],
[-1.2269, -0.7529, -1.4447]], grad_fn=<LogSoftmaxBackward>)
torch.return_types.max(
values=tensor([-0.5845, -0.9459, -0.9052, -0.7529], grad_fn=<MaxBackward0>),
indices=tensor([1, 1, 1, 1]))
RNN 实现预测一句话的下一个词
'''
code by Tae Hwan Jung(Jeff Jung) @graykode, modify by wmathor
'''
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
dtype = torch.FloatTensor
# Toy corpus: each 3-word sentence yields a 2-word context and a target word.
sentences = [ "i like dog", "i love coffee", "i hate milk"]
word_list = " ".join(sentences).split()
vocab = list(set(word_list))
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for i, w in enumerate(vocab)}
n_class = len(vocab)
# TextRNN hyperparameters
batch_size = 2
n_step = 2 # number of time steps (context words per sample)
n_hidden = 5 # number of hidden units in one cell
def make_data(sentences):
    """Build (one-hot context, target-word index) training pairs.

    For every sentence, all words except the last become the one-hot encoded
    input sequence; the last word's vocabulary index is the target.
    """
    inputs, targets = [], []
    one_hot = np.eye(n_class)
    for sentence in sentences:
        tokens = sentence.split()
        context_ids = [word2idx[tok] for tok in tokens[:-1]]
        inputs.append(one_hot[context_ids])
        targets.append(word2idx[tokens[-1]])
    return inputs, targets
# One-hot encode the corpus and wrap it in a shuffling DataLoader.
input_batch, target_batch = make_data(sentences)
input_batch, target_batch = torch.Tensor(input_batch), torch.LongTensor(target_batch)
dataset = Data.TensorDataset(input_batch, target_batch)
loader = Data.DataLoader(dataset, batch_size, True)
class TextRNN(nn.Module):
    """Vanilla one-layer RNN over one-hot word vectors.

    The hidden output of the final time step is projected onto the
    vocabulary to score the next word.
    """

    def __init__(self):
        super(TextRNN, self).__init__()
        self.rnn = nn.RNN(input_size=n_class, hidden_size=n_hidden)
        # Final projection: hidden state -> vocabulary logits.
        self.fc = nn.Linear(n_hidden, n_class)

    def forward(self, hidden, X):
        # Incoming X is batch-first [batch, n_step, n_class];
        # nn.RNN expects [n_step, batch, n_class].
        seq_first = X.transpose(0, 1)
        outputs, hidden = self.rnn(seq_first, hidden)
        # outputs: [n_step, batch, n_hidden]; keep only the last time step.
        last_step = outputs[-1]
        return self.fc(last_step)
# Model, cross-entropy loss (takes raw logits), and Adam optimizer.
model = TextRNN()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training: 500 epochs over shuffled mini-batches of one-hot contexts.
for epoch in range(500):
    for x, y in loader:
        # Zero initial hidden state: [num_layers * num_directions, batch, hidden_size].
        hidden = torch.zeros(1, x.shape[0], n_hidden)
        # x: [batch_size, n_step, n_class] -> pred: [batch_size, n_class]
        pred = model(hidden, x)
        # y: [batch_size] of class indices (not one-hot).
        loss = criterion(pred, y)
        if (epoch + 1) % 100 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Predict: feed every training context back through the trained model and
# take the argmax word for each.
contexts = [sen.split()[:2] for sen in sentences]
hidden = torch.zeros(1, len(contexts), n_hidden)
predict = model(hidden, input_batch).data.max(1, keepdim=True)[1]
print([sen.split()[:2] for sen in sentences], '->', [idx2word[n.item()] for n in predict.squeeze()])
Epoch: 0100 cost = 1.591890
Epoch: 0100 cost = 1.145270
Epoch: 0200 cost = 1.079182
Epoch: 0200 cost = 0.912082
Epoch: 0300 cost = 0.791074
Epoch: 0300 cost = 0.866361
Epoch: 0400 cost = 0.604370
Epoch: 0400 cost = 0.703954
Epoch: 0500 cost = 0.458180
Epoch: 0500 cost = 0.529244
[['i', 'like'], ['i', 'love'], ['i', 'hate']] -> ['dog', 'coffee', 'milk']