import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import Dataset
from tqdm.auto import tqdm

from utils import BOS_TOKEN, EOS_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights


def cal_similar(w):
    # Look up the embedding of `w` and rank all vocabulary embeddings by
    # (unnormalized) dot-product similarity, keeping the top 3.
    v = model.embeddings.weight[vocab[w]]
    values, indices = torch.mm(model.embeddings.weight, v.view(-1, 1)).topk(dim=0, k=3)
    similar_tokens = vocab.convert_ids_to_tokens(indices.view(-1).tolist())
    return similar_tokens
def demos():
    # Print the nearest neighbours of a few probe words to get a
    # qualitative feel for the embeddings as training progresses.
    tokens = ['china', 'august', 'good', 'paris']
    for token in tokens:
        s = cal_similar(token)
        print(f'{token}: {s}')
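
# cal_similar ranks by raw dot product, which favours high-norm embedding rows.
# The variant below is a sketch (not part of the original script) that ranks by
# cosine similarity instead; it assumes the same global `model` and `vocab`
# objects and is otherwise interchangeable with cal_similar.
def cal_similar_cosine(w, k=3):
    weight = F.normalize(model.embeddings.weight, dim=1)  # unit-length rows
    v = weight[vocab[w]]
    values, indices = torch.mv(weight, v).topk(k=k)
    return vocab.convert_ids_to_tokens(indices.tolist())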
class NGramDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # Pad each sentence with <bos>/<eos> markers.
            sentence = [self.bos] + sentence + [self.eos]
            # Skip sentences too short to supply a full window of
            # `context_size` words on each side of a target word.
            if len(sentence) < 2 * context_size + 1:
                continue
            for i in range(context_size, len(sentence) - context_size):
                # Input: the `context_size` words to the left and to the right
                # of position i. Target: the word at position i.
                left_context = sentence[i-context_size:i]
                right_context = sentence[i+1:i+context_size+1]
                context = [*left_context, *right_context]
                target = sentence[i]
                self.data.append((context, target))
    def __len__(self):
        return len(self.data)
    def __getitem__(self, i):
        return self.data[i]
    def collate_fn(self, examples):
        # Stack the (context, target) pairs of a mini-batch into LongTensors.
        inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
        targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
        return (inputs, targets)
class FeedForwardNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(FeedForwardNNLM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # The first layer sees 2 * context_size concatenated word embeddings
        # (context_size words on each side of the target word).
        self.linear1 = nn.Linear(context_size * embedding_dim * 2, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.activate = F.relu
        self.dp = nn.Dropout(0.1)
    def forward(self, inputs):
        # (batch, 2 * context_size) -> (batch, 2 * context_size * embedding_dim)
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        hidden = self.activate(self.linear1(embeds))
        hidden = self.dp(hidden)  # apply the dropout layer defined in __init__
        output = self.linear2(hidden)
        log_probs = F.log_softmax(output, dim=1)
        return log_probs
# Hyperparameters
embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 10240
num_epoch = 10
# Load the Reuters corpus and build the (context, target) training set.
corpus, vocab = load_reuters()
dataset = NGramDataset(corpus, vocab, context_size)
data_loader = get_loader(dataset, batch_size)
# Negative log-likelihood loss; the model already outputs log-probabilities.
nll_loss = nn.NLLLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FeedForwardNNLM(len(vocab), embedding_dim, context_size, hidden_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
# Decay the learning rate by a factor of 0.9 after every epoch.
scheduler = ExponentialLR(optimizer, gamma=0.9)

model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}, LR: {scheduler.get_last_lr()[0]}")
    scheduler.step()
    demos()
    total_losses.append(total_loss)
# Save the trained word embeddings (one vector per vocabulary entry).
save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")
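
# A minimal sketch (not part of the original script) of reading the saved
# vectors back. It assumes save_pretrained writes a word2vec-style text file:
# optionally a "<vocab_size> <dim>" header line, then one token followed by its
# vector components per line. Adjust if utils.save_pretrained uses another layout.
def load_vectors(path="ffnnlm.vec"):
    vectors = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip().split(' ')
            if len(fields) == 2:  # likely the "<vocab_size> <dim>" header
                continue
            token, values = fields[0], fields[1:]
            vectors[token] = torch.tensor([float(x) for x in values])
    return vectors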