Step 1: Download the dataset
First, we need to download an open language-modeling dataset such as Penn Treebank (PTB) or WikiText-2. These datasets contain large amounts of text for training language models.
We can download the PTB dataset with the following commands:
```
!wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/penn/train.txt
!wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/penn/valid.txt
!wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/penn/test.txt
```
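As a quick sanity check (assuming the three files were saved into the current working directory by the commands above), we can peek at the raw training text:
```
# Rough look at the downloaded training split.
with open("train.txt") as f:
    train_text = f.read()
print(len(train_text.split()))    # approximate number of tokens
print(train_text.split()[:20])    # the first few tokens
```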
Step 2: Preprocess the data
Next, we need to preprocess the data. First we convert the text into numerical form so the model can work with it: using torchtext's Field, each word is mapped to a unique integer ID.
```
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# Field, PennTreebank and BPTTIterator come from the legacy torchtext API
# (torchtext <= 0.8.x, or torchtext.legacy in 0.9-0.11).
from torchtext import data
from torchtext import datasets

# Lower-case the text; the default sequence-first batches match the
# sequence-first nn.LSTM defined in Step 3.
TEXT = data.Field(lower=True)
train, val, test = datasets.PennTreebank.splits(TEXT)
TEXT.build_vocab(train, vectors="glove.6B.100d")
```
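To confirm that every word now has an integer ID, we can inspect the vocabulary built above (attribute names follow the legacy torchtext API; note that `build_vocab` with `vectors="glove.6B.100d"` downloads the GloVe files on first use):
```
print(len(TEXT.vocab))             # vocabulary size, including special tokens
print(TEXT.vocab.itos[:10])        # special tokens followed by the most frequent words
print(TEXT.vocab.stoi["the"])      # integer ID assigned to "the"
print(TEXT.vocab.vectors.shape)    # GloVe matrix: (vocab_size, 100)
```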
Here we also load a set of pretrained word vectors (glove.6B.100d); the sketch below shows how they can be copied into the model's embedding layer. Next, we wrap the splits in torchtext BPTTIterators, which play the role of PyTorch's Dataset and DataLoader for language-model training.
```
# BPTTIterator cuts each split into contiguous (bptt_len, batch_size) chunks,
# playing the role of a DataLoader for language modeling.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iter, val_iter, test_iter = data.BPTTIterator.splits(
    (train, val, test), batch_size=32, bptt_len=35, device=device, repeat=False
)
```
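Note that the code above only attaches the GloVe vectors to the vocabulary object; nothing uses them automatically. A minimal way to take advantage of them, assuming the model from Step 4 is created with emsize=100 so the dimensions match, is to copy them into the embedding layer before training:
```
# Hypothetical initialization step: overwrite the randomly initialized embedding
# weights with the pretrained GloVe vectors (requires ninp == 100).
model.encoder.weight.data.copy_(TEXT.vocab.vectors)
```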
Step 3: Define the model
We use a simple recurrent neural network for language modeling; specifically, a two-layer LSTM.
```
class RNNModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear decoder over the vocabulary."""

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5, tie_weights=False):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)            # word embeddings
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)                # hidden state -> vocabulary logits
        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers
        self.ntoken = ntoken
        if tie_weights:
            # Weight tying shares the embedding and output matrices; it requires nhid == ninp.
            self.decoder.weight = self.encoder.weight

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.zero_()
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, hidden):
        # input: (seq_len, batch) of token IDs
        emb = self.drop(self.encoder(input))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0) * output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        # Fresh (h, c) states with the same dtype/device as the model parameters.
        weight = next(self.parameters())
        return (
            weight.new_zeros(self.nlayers, bsz, self.nhid),
            weight.new_zeros(self.nlayers, bsz, self.nhid),
        )
```
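As a quick check that the model is wired up correctly, we can push a dummy batch through it (a sketch with a made-up vocabulary size; shapes follow the sequence-first layout of nn.LSTM):
```
# Toy forward pass: 35 time steps, batch of 4, vocabulary of 1000 tokens.
m = RNNModel(ntoken=1000, ninp=100, nhid=256, nlayers=2)
hidden = m.init_hidden(bsz=4)
x = torch.randint(0, 1000, (35, 4))   # (seq_len, batch) of token IDs
logits, hidden = m(x, hidden)
print(logits.shape)                   # torch.Size([35, 4, 1000])
```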
Step 4: Train the model
Next, we can train the model. After every epoch we evaluate on the validation set and keep the checkpoint with the lowest validation loss; training could also be cut short once the validation perplexity stops improving (see the early-stopping sketch after the code). The training code is as follows:
```
import copy  # for snapshotting the best model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
ntokens = len(TEXT.vocab.stoi)   # vocabulary size
emsize = 100                     # embedding size (matches glove.6B.100d)
nhid = 256                       # LSTM hidden size
nlayers = 2
dropout = 0.5
model = RNNModel(ntokens, emsize, nhid, nlayers, dropout).to(device)

criterion = nn.CrossEntropyLoss()
lr = 20.0
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.5)

best_val_loss = float("inf")
epochs = 50
best_model = None

for epoch in range(1, epochs + 1):
    train_loss = 0.0
    val_loss = 0.0

    # Training pass
    model.train()
    hidden = model.init_hidden(32)
    for batch in train_iter:
        inputs = batch.text.to(device)
        targets = batch.target.view(-1).to(device)
        # Detach the hidden state so gradients do not flow across BPTT windows.
        hidden = tuple(h.detach() for h in hidden)
        model.zero_grad()
        output, hidden = model(inputs, hidden)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25)
        optimizer.step()
        train_loss += loss.item()

    # Validation pass
    model.eval()
    hidden = model.init_hidden(32)
    with torch.no_grad():
        for batch in val_iter:
            inputs = batch.text.to(device)
            targets = batch.target.view(-1).to(device)
            hidden = tuple(h.detach() for h in hidden)
            output, hidden = model(inputs, hidden)
            loss = criterion(output.view(-1, ntokens), targets)
            val_loss += loss.item()

    train_loss /= len(train_iter)
    val_loss /= len(val_iter)
    print(f"Epoch: {epoch}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    # Keep a copy of the best checkpoint (a deepcopy, not a reference to the live model).
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)

    if scheduler is not None:
        scheduler.step()
```
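The loop above always runs the full 50 epochs and simply remembers the best checkpoint. If we instead want to stop as soon as the validation perplexity (the exponential of the validation loss) stops improving, a patience counter is one simple option. A minimal sketch with hypothetical loss values (the patience of 3 epochs is an arbitrary choice):
```
import math

# Stop once the best validation loss is `patience` or more epochs in the past.
def should_stop(val_losses, patience=3):
    best_epoch = min(range(len(val_losses)), key=lambda i: val_losses[i])
    return len(val_losses) - 1 - best_epoch >= patience

val_losses = [5.2, 5.0, 4.9, 4.91, 4.95, 4.93]       # hypothetical per-epoch values
print([round(math.exp(l), 1) for l in val_losses])   # corresponding perplexities
print(should_stop(val_losses))                       # True: no improvement for 3 epochs
```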
Step 5: Extract the word vectors
After training, we can extract a vector for every word from the trained RNN model simply by reading off the weights of the learned embedding layer.
```
# Each row of the embedding matrix is the learned vector for one word: shape (vocab_size, emsize).
embeddings = best_model.encoder.weight.detach().cpu().numpy()
word2idx = TEXT.vocab.stoi   # word -> integer ID
idx2word = TEXT.vocab.itos   # integer ID -> word
```
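If we want to reuse the vectors outside this script, one option (a sketch; the file name is arbitrary) is to dump them in the plain-text word2vec format, one word per line:
```
# First line: "<vocab_size> <dim>"; then one "<word> <v1> <v2> ..." line per word.
with open("rnn_embeddings.txt", "w") as f:
    f.write(f"{embeddings.shape[0]} {embeddings.shape[1]}\n")
    for idx, word in enumerate(idx2word):
        vec = " ".join(f"{x:.5f}" for x in embeddings[idx])
        f.write(f"{word} {vec}\n")
```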
Step 6: Analyze the word vectors
Finally, we can analyze the extracted word vectors, for example by running a k-nearest-neighbor search to find the words closest to a given word.
```
from sklearn.neighbors import NearestNeighbors

# Fit a cosine-distance nearest-neighbor index over the embedding matrix.
# We ask for 6 neighbors because the closest point is always the query word itself.
knn = NearestNeighbors(n_neighbors=6, metric="cosine")
knn.fit(embeddings)

def get_nearest_neighbors(word):
    word_idx = word2idx[word]
    word_embedding = embeddings[word_idx].reshape(1, -1)
    distances, indices = knn.kneighbors(word_embedding)
    # Drop the query word itself and keep the 5 closest other words.
    neighbors = [idx2word[idx] for idx in indices[0] if idx != word_idx][:5]
    return neighbors

get_nearest_neighbors("cat")
```
This returns the 5 words closest to "cat" in the embedding space.
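Another common analysis is to project a handful of vectors to two dimensions and plot them; below is a sketch using PCA (the word list is arbitrary and assumed to be in the PTB vocabulary):
```
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

words = ["bank", "money", "market", "company", "government", "president"]
vecs = np.stack([embeddings[word2idx[w]] for w in words])
coords = PCA(n_components=2).fit_transform(vecs)   # (6, 100) -> (6, 2)

plt.figure(figsize=(5, 5))
plt.scatter(coords[:, 0], coords[:, 1])
for (x, y), w in zip(coords, words):
    plt.annotate(w, (x, y))
plt.title("PCA projection of selected word vectors")
plt.show()
```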