Static Word Vectors: Training Word Vectors with an RNN

Introduction

Previous posts covered a number of methods for obtaining static word vectors. This post shows how to train word vectors with an LSTM-based language model.

Model

class RNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNLM, self).__init__()
        # Word embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Recurrent network: an LSTM here
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Output layer
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs, lengths):
        embeds = self.embeddings(inputs)
        # Compute the hidden representation at every time step;
        # packing lets the LSTM skip the padded positions
        x_pack = pack_padded_sequence(embeds, lengths, batch_first=True, enforce_sorted=False)
        hidden, (hn, cn) = self.rnn(x_pack)
        hidden, _ = pad_packed_sequence(hidden, batch_first=True)
        output = self.output(hidden)
        log_probs = F.log_softmax(output, dim=2)
        return log_probs
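
To make the tensor shapes concrete, here is a minimal sketch of a forward pass with made-up data, assuming the RNNLM class above and its imports (torch, nn, F, and the pack/pad helpers from torch.nn.utils.rnn) are in scope; the vocabulary size, token ids, and lengths are invented for illustration only.

import torch

# Toy setup: vocabulary of 100 tokens, embeddings and hidden states of size 16
lm = RNNLM(vocab_size=100, embedding_dim=16, hidden_dim=16)

# A batch of 2 padded sequences (pad id 0) together with their true lengths
inputs = torch.tensor([[5, 8, 9, 3],
                       [7, 2, 0, 0]])
lengths = [4, 2]

log_probs = lm(inputs, lengths)
print(log_probs.shape)  # torch.Size([2, 4, 100]): per-position log-probabilities over the vocabulary

Each position of the output is a log-probability distribution over the next word, which is exactly what the NLLLoss in the training code below consumes.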

Data Processing

# Load the text data and build the RNNLM training dataset
corpus, vocab = load_reuters()
dataset = RnnlmDataset(corpus, vocab)
data_loader = get_loader(dataset, batch_size)

The data comes from the Reuters corpus. In the dataset, the preceding words are used to predict the next word; see the RnnlmDataset class in the complete code below for the details, and the toy example sketched right after this paragraph.
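
For intuition, this is a minimal sketch of the (input, target) pair that RnnlmDataset builds for a single sentence; the token ids and the BOS/EOS ids are made up for illustration.

# Suppose a sentence has been mapped to the ids [12, 7, 25]
sentence = [12, 7, 25]
bos, eos = 1, 2              # hypothetical ids for BOS_TOKEN and EOS_TOKEN

inputs = [bos] + sentence    # [1, 12, 7, 25]  -> <bos> w1 w2 w3
target = sentence + [eos]    # [12, 7, 25, 2]  -> w1  w2 w3 <eos>
# At position t the model has read inputs[:t+1] and is trained to predict target[t],
# i.e. the word that comes next in the sentence.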

One thing to note: I could not get the original setup to run even on a 1080 Ti, the parameter count was simply too large, so I modified the load_reuters code to drop words that occur fewer than 5 times. The code is as follows.

def load_reuters():
    nltk.set_proxy('http://192.168.0.28:1080')
    nltk.download('reuters')
    nltk.download('punkt')
    from nltk.corpus import reuters
    text = reuters.sents()
    # lowercase (optional)
    text = [[word.lower() for word in sentence] for sentence in text]
    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN], min_freq=5)
    corpus = [vocab.convert_tokens_to_ids(sentence) for sentence in text]

    return corpus, vocab
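
Vocab comes from the accompanying utils module and its implementation is not reproduced here, so the following is only a rough sketch of what a min_freq filter usually does; the function name build_vocab and the '<unk>' handling are assumptions for illustration, not the actual utils code.

from collections import Counter

def build_vocab(sentences, reserved_tokens, min_freq=5):
    # Count how often each token appears across the whole corpus
    counts = Counter(token for sentence in sentences for token in sentence)
    # Keep the reserved tokens plus every token that reaches the frequency threshold;
    # anything rarer is later mapped to the unknown-word id
    tokens = ['<unk>'] + reserved_tokens + [t for t, c in counts.items() if c >= min_freq]
    return {token: idx for idx, token in enumerate(tokens)}

Dropping tokens that appear fewer than 5 times shrinks the vocabulary substantially, which in turn shrinks both the embedding matrix and the vocabulary-sized output layer, and that is what makes the model fit in GPU memory.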

Complete Code

# Defined in Section 5.1.3.3

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from tqdm.auto import tqdm
from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights


def cal_similar(w):
    # Rank tokens by (unnormalized) dot product with the embedding of w and return the top 3
    v = model.embeddings.weight[vocab[w]]
    values, indices = torch.mm(model.embeddings.weight, v.view(-1, 1)).topk(dim=0, k=3)
    similar_tokens = vocab.convert_ids_to_tokens(indices.view(-1).tolist())
    return similar_tokens


def demos():
    tokens = ['china', 'august', 'good', 'paris']
    for token in tokens:
        s = cal_similar(token)
        print(f'{token}: {s}')


class RnnlmDataset(Dataset):
    def __init__(self, corpus, vocab):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        self.pad = vocab[PAD_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # Model input: BOS_TOKEN, w_1, w_2, ..., w_n
            input = [self.bos] + sentence
            # Model output: w_1, w_2, ..., w_n, EOS_TOKEN
            target = sentence + [self.eos]
            self.data.append((input, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        # Build the batch inputs and outputs from individual samples
        inputs = [torch.tensor(ex[0]) for ex in examples]
        targets = [torch.tensor(ex[1]) for ex in examples]
        lengths = [i.size(0) for i in inputs]
        # Pad the samples in the batch so they all have the same length
        inputs = pad_sequence(inputs, batch_first=True, padding_value=self.pad)
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad)
        return inputs, targets, lengths


class RNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNLM, self).__init__()
        # Word embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Recurrent network: an LSTM here
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Output layer
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs, lengths):
        embeds = self.embeddings(inputs)
        # Compute the hidden representation at every time step
        x_pack = pack_padded_sequence(embeds, lengths, batch_first=True, enforce_sorted=False)
        hidden, (hn, cn) = self.rnn(x_pack)
        hidden, _ = pad_packed_sequence(hidden, batch_first=True)
        output = self.output(hidden)
        log_probs = F.log_softmax(output, dim=2)
        return log_probs


embedding_dim = 128
hidden_dim = 128
batch_size = 32
num_epoch = 10

# Load the text data and build the RNNLM training dataset
corpus, vocab = load_reuters()
dataset = RnnlmDataset(corpus, vocab)
data_loader = get_loader(dataset, batch_size)

# Negative log-likelihood loss, ignoring the loss at pad_token positions
nll_loss = nn.NLLLoss(ignore_index=dataset.pad)
# Build the RNNLM and move it to the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNLM(len(vocab), embedding_dim, hidden_dim)
para_model = nn.DataParallel(model)  # multi-GPU wrapper; not actually used in the loop below
model.to(device)
print(model)

# Use the Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    bar = tqdm(data_loader, desc=f"Training Epoch {epoch}")
    for batch in bar:
        inputs, targets = [x.to(device) for x in batch[:2]]
        lengths = batch[2]
        optimizer.zero_grad()
        log_probs = model(inputs, lengths)
        loss = nll_loss(log_probs.view(-1, log_probs.shape[-1]), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        bar.set_postfix_str(f'loss:{loss.item()}')
    demos()
    print(f"Loss: {total_loss:.2f}")

save_pretrained(vocab, model.embeddings.weight.data, "rnnlm.vec")
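
One small follow-up: cal_similar above ranks neighbours by raw dot product, which tends to favour tokens with large embedding norms. A common alternative is cosine similarity; a minimal sketch reusing the same model and vocab objects might look like this (the function name is mine, not part of the original code).

import torch
import torch.nn.functional as F

def cal_similar_cosine(w, k=3):
    # Normalize every embedding to unit length so a dot product equals cosine similarity
    emb = F.normalize(model.embeddings.weight, dim=1)   # (vocab_size, embedding_dim)
    v = emb[vocab[w]]                                   # query vector, already unit length
    scores = torch.mv(emb, v)                           # cosine similarity with every token
    values, indices = scores.topk(k)
    return vocab.convert_ids_to_tokens(indices.tolist())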