import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from tqdm.auto import tqdm

from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights


def cal_similar(w):
    # Return the 3 tokens whose embeddings have the largest (unnormalized)
    # dot product with the embedding of w.
    v = model.embeddings.weight[vocab[w]]
    values, indices = torch.mm(model.embeddings.weight, v.view(-1, 1)).topk(dim=0, k=3)
    similar_tokens = vocab.convert_ids_to_tokens(indices.view(-1).tolist())
    return similar_tokens

def demos():
    # Print the nearest-neighbor tokens for a few probe words.
    tokens = ['china', 'august', 'good', 'paris']
    for token in tokens:
        s = cal_similar(token)
        print(f'{token}: {s}')

class RnnlmDataset(Dataset):
    def __init__(self, corpus, vocab):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        self.pad = vocab[PAD_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # The input prepends BOS; the target appends EOS, so the model
            # predicts each token from its left context.
            input = [self.bos] + sentence
            target = sentence + [self.eos]
            self.data.append((input, target))
    def __len__(self):
        return len(self.data)
    def __getitem__(self, i):
        return self.data[i]
    def collate_fn(self, examples):
        # Pad inputs and targets to the longest sequence in the batch and
        # keep the original lengths for packing inside the model.
        inputs = [torch.tensor(ex[0]) for ex in examples]
        targets = [torch.tensor(ex[1]) for ex in examples]
        lengths = [i.size(0) for i in inputs]
        inputs = pad_sequence(inputs, batch_first=True, padding_value=self.pad)
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad)
        return inputs, targets, lengths

class RNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNLM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.output = nn.Linear(hidden_dim, vocab_size)
    def forward(self, inputs, lengths):
        embeds = self.embeddings(inputs)
        # Pack the padded batch so the LSTM skips pad positions.
        x_pack = pack_padded_sequence(embeds, lengths, batch_first=True, enforce_sorted=False)
        hidden, (hn, cn) = self.rnn(x_pack)
        hidden, _ = pad_packed_sequence(hidden, batch_first=True)
        output = self.output(hidden)
        log_probs = F.log_softmax(output, dim=2)
        return log_probs
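
# --- Illustrative shape check (not part of the original script) ---
# A minimal sketch: run a tiny RNNLM on dummy token ids to confirm that
# forward() maps (batch, seq_len) ids to (batch, seq_len, vocab_size)
# log-probabilities. All sizes, ids, and lengths below are made up.
_toy_model = RNNLM(vocab_size=100, embedding_dim=8, hidden_dim=8)
_toy_inputs = torch.randint(0, 100, (2, 5))    # batch of 2 sequences of length 5
_toy_log_probs = _toy_model(_toy_inputs, lengths=[5, 3])
assert _toy_log_probs.shape == (2, 5, 100)
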
embedding_dim = 128
hidden_dim = 128
batch_size = 32
num_epoch = 10
corpus, vocab = load_reuters()
dataset = RnnlmDataset(corpus, vocab)
data_loader = get_loader(dataset, batch_size)
# Ignore pad positions when computing the loss.
nll_loss = nn.NLLLoss(ignore_index=dataset.pad)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNLM(len(vocab), embedding_dim, hidden_dim)
para_model = nn.DataParallel(model)  # multi-GPU wrapper; not used in the loop below
model.to(device)
print(model)
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()
for epoch in range(num_epoch):
    total_loss = 0
    bar = tqdm(data_loader, desc=f"Training Epoch {epoch}")
    for batch in bar:
        inputs, targets = [x.to(device) for x in batch[:2]]
        lengths = batch[2]
        optimizer.zero_grad()
        log_probs = model(inputs, lengths)
        # Flatten to (batch * seq_len, vocab_size) so the loss is computed per
        # token; pad positions are skipped via ignore_index.
        loss = nll_loss(log_probs.view(-1, log_probs.shape[-1]), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        bar.set_postfix_str(f'loss:{loss.item()}')
    demos()
    print(f"Loss: {total_loss:.2f}")
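
# --- Rough perplexity estimate (not part of the original script) ---
# Treats the mean of per-batch mean losses from the last epoch as the
# per-token negative log-likelihood; batches contain different numbers of
# non-pad tokens, so this is only a rough proxy. Assumes get_loader returns
# a standard DataLoader, so len() gives the number of batches.
import math
print(f"Approx. perplexity (last epoch): {math.exp(total_loss / len(data_loader)):.2f}")
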
# Save the trained input-side word embeddings.
save_pretrained(vocab, model.embeddings.weight.data, "rnnlm.vec")
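
# --- Illustrative use of the trained model (not part of the original script) ---
# A minimal sketch of scoring a sentence with the trained language model:
# prepend BOS to build the input, append EOS to build the target, and sum the
# log-probabilities the model assigns to each target token. The probe tokens
# are arbitrary; out-of-vocabulary handling is whatever the Vocab class does.
def sentence_logprob(tokens):
    ids = [vocab[t] for t in tokens]
    inputs = torch.tensor([[vocab[BOS_TOKEN]] + ids], device=device)
    targets = torch.tensor([ids + [vocab[EOS_TOKEN]]], device=device)
    with torch.no_grad():
        log_probs = model(inputs, lengths=[inputs.size(1)])
    # Pick out the log-probability of the target token at each position.
    return log_probs.gather(2, targets.unsqueeze(2)).sum().item()

print(f"log P('the market rose') = {sentence_logprob(['the', 'market', 'rose']):.2f}")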