Implementing a Feed-Forward Neural Network Language Model (FFNNLM) to Obtain Word Vectors
Building the vocabulary
'''
@Filename    : vocab.py
@Description : vocabulary utilities (token <-> id mapping)
@Datetime    : 2021/08/24 17:02:51
@Author      : qtxu
@Version     : v1.0
'''
from collections import defaultdict


class Vocab:
    def __init__(self, tokens=None):
        self.idx_to_token = list()
        self.token_to_idx = dict()
        if tokens is not None:
            # Make sure the unknown-word token is always in the vocabulary
            if "<unk>" not in tokens:
                tokens = tokens + ["<unk>"]
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
            self.unk = self.token_to_idx["<unk>"]

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        # Count token frequencies over the tokenized corpus
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        # Keep <unk>, any reserved tokens, and all tokens above the frequency threshold
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items()
                        if freq >= min_freq and token != "<unk>"]
        return cls(uniq_tokens)

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, token):
        # Out-of-vocabulary tokens map to the <unk> id
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        return [self.idx_to_token[index] for index in indices]


def save_vocab(vocab, path):
    with open(path, 'w') as writer:
        writer.write("\n".join(vocab.idx_to_token))


def read_vocab(path):
    with open(path, 'r') as f:
        tokens = f.read().split('\n')
    return Vocab(tokens)
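As a quick sanity check, the class can be exercised on a tiny, made-up corpus (the two sentences below are invented purely for illustration):

sentences = [["hello", "world"], ["hello", "pytorch"]]
vocab = Vocab.build(sentences)
print(len(vocab))                                      # 4: <unk>, hello, world, pytorch
print(vocab.convert_tokens_to_ids(["hello", "nlp"]))   # "nlp" falls back to the <unk> id
print(vocab.convert_ids_to_tokens([1]))                # ['hello']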
Data preparation
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from vocab import Vocab
from nltk.corpus import reuters
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"
BOW_TOKEN = "<bow>"
EOW_TOKEN = "<eow>"
WEIGHT_INIT_RANGE = 0.1
def load_reuters():
    # Reuters corpus from NLTK (requires a prior nltk.download('reuters'))
    text = reuters.sents()
    # Lowercase every token
    text = [[word.lower() for word in sentence] for sentence in text]
    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    # Convert each sentence to a list of token ids
    corpus = [vocab.convert_tokens_to_ids(sentence) for sentence in text]
    return corpus, vocab
def save_pretrained(vocab, embeds, save_path):
    # Save embeddings in the word2vec text format: a header line with the
    # matrix shape, then one "token v1 v2 ... vd" line per word
    with open(save_path, "w") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for idx, token in enumerate(vocab.idx_to_token):
            vec = " ".join(["{:.4f}".format(x) for x in embeds[idx]])
            writer.write(f"{token} {vec}\n")
    print(f"Pretrained embeddings saved to: {save_path}")
def load_pretrained(load_path):
    with open(load_path, "r") as fin:
        # First line holds the vocabulary size and the embedding dimension
        n, d = map(int, fin.readline().split())
        tokens = []
        embeds = []
        for line in fin:
            line = line.rstrip().split(' ')
            token, embed = line[0], list(map(float, line[1:]))
            tokens.append(token)
            embeds.append(embed)
        vocab = Vocab(tokens)
        embeds = torch.tensor(embeds, dtype=torch.float)
    return vocab, embeds
def get_loader(dataset, batch_size, shuffle=True):
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=dataset.collate_fn,
        shuffle=shuffle
    )
    return data_loader
def init_weights(model):
    # Uniformly initialize all non-embedding parameters;
    # embedding weights keep their default initialization
    for name, param in model.named_parameters():
        if "embedding" not in name:
            torch.nn.init.uniform_(
                param, a=-WEIGHT_INIT_RANGE, b=WEIGHT_INIT_RANGE)
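The save/load helpers above write and read the plain-text word2vec format. Here is a minimal round-trip sketch with a randomly initialized matrix; the file name tmp.vec and the sizes are arbitrary:

dummy_vocab = Vocab(["<unk>", "hello", "world"])
dummy_embeds = torch.randn(len(dummy_vocab), 8)        # 3 tokens x 8 dimensions
save_pretrained(dummy_vocab, dummy_embeds, "tmp.vec")
vocab2, embeds2 = load_pretrained("tmp.vec")
print(len(vocab2), embeds2.shape)                      # 3 torch.Size([3, 8])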
Model implementation
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from tqdm.auto import tqdm

from vocab import Vocab
from utils import (BOS_TOKEN, EOS_TOKEN, load_reuters, load_pretrained,
                   save_pretrained, get_loader, init_weights)
class NGramDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # Pad each sentence with sentence-boundary markers
            sentence = [self.bos] + sentence + [self.eos]
            if len(sentence) < context_size:
                continue
            for i in range(context_size, len(sentence)):
                # The context_size preceding tokens predict the current token
                context = sentence[i - context_size:i]
                target = sentence[i]
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        # Stack contexts and targets into batch tensors
        inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
        targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
        return (inputs, targets)
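To see what the dataset produces, here is an illustrative check with a tiny hand-built vocabulary and a one-sentence corpus of token ids (the words are made up for the example):

toy_vocab = Vocab(["<unk>", "<pad>", "<bos>", "<eos>", "the", "cat"])
toy_corpus = [[4, 5]]                  # one sentence: "the cat"
toy_dataset = NGramDataset(toy_corpus, toy_vocab, context_size=2)
print(toy_dataset.data)
# [([2, 4], 5), ([4, 5], 3)]  i.e. (<bos>, the) -> cat and (the, cat) -> <eos>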
class FeedForwardNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(FeedForwardNNLM, self).__init__()
        # Word embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Hidden layer over the concatenated context embeddings
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        # Output layer mapping to vocabulary-sized scores
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        self.activate = F.relu
        init_weights(self)

    def forward(self, inputs):
        # (batch, context_size, embedding_dim) -> (batch, context_size * embedding_dim)
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        hidden = self.activate(self.linear1(embeds))
        output = self.linear2(hidden)
        # Log-probabilities over the next token
        log_probs = F.log_softmax(output, dim=1)
        return log_probs
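Before training on the full corpus, the model's output shape can be sanity-checked with random inputs (all sizes below are arbitrary):

toy_model = FeedForwardNNLM(vocab_size=100, embedding_dim=64, context_size=2, hidden_dim=128)
toy_inputs = torch.randint(0, 100, (8, 2))     # a batch of 8 contexts, 2 token ids each
print(toy_model(toy_inputs).shape)             # torch.Size([8, 100]): log-probabilities over the vocabulary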
embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1064
num_epoch = 10
corpus, vocab = load_reuters()
dataset = NGramDataset(corpus, vocab, context_size)
data_loader = get_loader(dataset, batch_size)
nll_loss = nn.NLLLoss()
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = FeedForwardNNLM(len(vocab), embedding_dim, context_size, hidden_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")
    total_losses.append(total_loss)
save_pretrained(vocab, model.embeddings.weight.data, "5.1 ffnnlm.vec")
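After training, the saved vectors can be read back with load_pretrained and compared, for example via cosine similarity. The two words below are only illustrative and are assumed to appear in the Reuters vocabulary:

loaded_vocab, loaded_embeds = load_pretrained("5.1 ffnnlm.vec")
v1 = loaded_embeds[loaded_vocab["oil"]]
v2 = loaded_embeds[loaded_vocab["crude"]]
print(torch.cosine_similarity(v1, v2, dim=0).item())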