# 本篇文章参考自该教程
# 我在Google的Colaboratory上面使用了Google云端的GPU运行了该程序
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
def flatten(l):
    """Flatten a list of lists by one level, e.g. [[1, 2], [3]] -> [1, 2, 3]."""
    # Other flattening approaches are discussed in this tutorial:
    # (https://blog.csdn.net/weixin_40539892/article/details/79103290)
    return [item for sublist in l for item in sublist]

random.seed(1024)  # fix the random seed for reproducibility

# FIX: USE_CUDA / LongTensor / FloatTensor are referenced further down
# (prepare_word, prepare_sequence, the training setup) but were never defined
# in this transcription of the tutorial; define them here as the original did.
USE_CUDA = torch.cuda.is_available()
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor
if USE_CUDA:
    torch.cuda.get_device_name(0)  # GPU model, e.g. 'Tesla P100-PCIE-16GB'
# Load the data set: the first 100 sentences of Moby Dick, lower-cased.
nltk.download('gutenberg')
nltk.corpus.gutenberg.fileids()
nltk.download('punkt')
raw_sentences = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]
corpus = [[token.lower() for token in sentence] for sentence in raw_sentences]
corpus[0:2]  # inspect the data, e.g.:
# [['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'],
#  ['etymology', '.']]
# The original tutorial built its own stopword list from corpus frequencies,
# roughly: word_count = Counter(flatten(corpus)); border = int(len(word_count) * 0.01);
# stopwords = most_common()[:border] + least-common border words.
# NLTK's standard English stopword list is used here instead.
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Vocabulary = unique corpus tokens minus stopwords, plus an <UNK> fallback.
vocab = list(set(flatten(corpus)) - set(stop_words))
vocab.append('<UNK>')
print(len(set(flatten(corpus))), len(vocab))  # 592 514

# Token -> index map, e.g. {'<UNK>': 0, 'history': 1, 'patient': 2, ...}
word2index = {'<UNK>': 0}
for token in vocab:
    word2index.setdefault(token, len(word2index))
# Inverse map, e.g. {0: '<UNK>', 1: 'history', 2: 'patient', ...}
index2word = {idx: tok for tok, idx in word2index.items()}
# Context window radius: each center word sees up to 3 neighbours per side.
WINDOW_SIZE = 3
# Pad every sentence with dummies so edge words still produce full windows,
# then slide a (2*WINDOW_SIZE + 1)-gram over each sentence; every corpus word
# becomes the center of exactly one window.
padding = ['<DUMMY>'] * WINDOW_SIZE
windows = flatten([list(nltk.ngrams(padding + sent + padding, 2 * WINDOW_SIZE + 1)) for sent in corpus])

# Build (center, context) training pairs, skipping the center position itself
# and any padding tokens.
train_data = []
for window in windows:
    center = window[WINDOW_SIZE]
    for position, neighbour in enumerate(window):
        if position == WINDOW_SIZE or neighbour == '<DUMMY>':
            continue
        train_data.append((center, neighbour))
print(train_data[:WINDOW_SIZE * 2])  # [('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]
# train_data is a list of (center word, context word) tuples.
def prepare_word(word, word2index):
    """Wrap one word's vocabulary index in a Variable(LongTensor), falling back
    to the <UNK> index for out-of-vocabulary words."""
    idx = word2index[word] if word in word2index else word2index["<UNK>"]
    return Variable(LongTensor([idx]))
# Turn each (center, context) word pair into a pair of 1 x 1 index tensors so
# whole batches can later be torch.cat'ed along dim 0.
X_p = [prepare_word(center, word2index).view(1, -1) for center, _ in train_data]
y_p = [prepare_word(context, word2index).view(1, -1) for _, context in train_data]
train_data = list(zip(X_p, y_p))  # repack: the data set now holds index tensors
len(train_data)  # 7606
# Model definition.
class Skipgram(nn.Module):
    """Skip-gram word2vec model trained with a full softmax over the vocabulary.

    forward() returns the mean negative log-likelihood of the target (context)
    words given the center words; prediction() exposes the learned input-side
    embeddings as the final word vectors.
    """

    def __init__(self, vocab_size, projection_dim):
        super(Skipgram, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)  # center ("input") vectors
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)  # context ("output") vectors
        self.embedding_v.weight.data.uniform_(-1, 1)  # random init for input vectors
        # Zero-initializing the output vectors makes every initial score 0, i.e.
        # a uniform softmax over the vocabulary, so early gradients are driven
        # entirely by the input embeddings (this answers the original comment's
        # question about why the output side starts at 0).
        self.embedding_u.weight.data.uniform_(0, 0)

    def forward(self, center_words, target_words, outer_words):
        """Mean NLL of targets given centers.

        center_words, target_words: (B, 1) index tensors; outer_words: (B, V)
        index tensor listing the whole vocabulary per row.
        """
        center_embeds = self.embedding_v(center_words)  # B x 1 x D
        target_embeds = self.embedding_u(target_words)  # B x 1 x D
        outer_embeds = self.embedding_u(outer_words)    # B x V x D
        # bmm is batched matrix multiplication: (B,1,D) @ (B,D,1) -> (B,1).
        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)      # B x 1
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)  # B x V
        # FIX: compute log-softmax via logsumexp instead of
        # log(exp(scores) / sum(exp(norm_scores))) — mathematically identical,
        # but numerically stable (no overflow of exp for large scores).
        nll = -torch.mean(scores - torch.logsumexp(norm_scores, dim=1, keepdim=True))
        return nll  # negative log likelihood

    def prediction(self, inputs):
        """Return the input-side embeddings for `inputs` (the word vectors)."""
        return self.embedding_v(inputs)
# Training hyperparameters.
EMBEDDING_SIZE = 30
BATCH_SIZE = 256
EPOCH = 100

model = Skipgram(len(word2index), EMBEDDING_SIZE)
# FIX: the original tested `if USE_CUDA:`, a name this transcription never
# defined (NameError); ask torch directly whether a GPU is available.
if torch.cuda.is_available():
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)
# Mini-batch generator (hand-rolled replacement for a DataLoader).
def getBatch(batch_size, train_data):
    """Shuffle `train_data` and yield it in mini-batches of `batch_size`.

    The final batch may be smaller than `batch_size`.
    NOTE: shuffles `train_data` in place.
    FIX: the original's trailing `if eindex >= len(train_data)` was always true
    at that point and yielded a spurious empty batch when `train_data` was
    empty; a simple stride loop avoids both issues.
    """
    random.shuffle(train_data)
    for start in range(0, len(train_data), batch_size):
        yield train_data[start:start + batch_size]
# NOTE: I wanted to batch with torch's own DataLoader, but after a series of
# dtype conversions and dimension reshuffles it still failed, so the tutorial
# author's hand-written batching function above is kept.
def prepare_sequence(seq, word2index):
    """Convert a word sequence into a Variable(LongTensor) of vocabulary
    indices, mapping out-of-vocabulary words to <UNK>."""
    unk = word2index["<UNK>"]
    idxs = [word2index.get(token, unk) for token in seq]
    return Variable(LongTensor(idxs))
for epoch in range(EPOCH):
    # FIX: collect every batch's loss so the printed "mean_loss" really is the
    # epoch mean — the original printed only the final batch's loss.
    losses = []
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        inputs, targets = zip(*batch)  # split centers and labels
        inputs = torch.cat(inputs)    # batch_size x 1
        targets = torch.cat(targets)  # batch_size x 1
        # Every row shares the same full-vocabulary index list: batch_size x V.
        vocabs = prepare_sequence(list(vocab), word2index).expand(inputs.size(0), len(vocab))
        model.zero_grad()
        loss = model(inputs, targets, vocabs)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
# After training, evaluate by ranking words by cosine similarity to a target.
def word_similarity(target, vocab):
    """Return the 10 vocabulary words most cosine-similar to `target`,
    as [word, similarity] pairs sorted by similarity, descending.

    FIX: the original branched on USE_CUDA — a name this transcription never
    defined (NameError) — and both branches were byte-identical anyway; it
    also rebuilt list(vocab) on every loop iteration (accidental O(n^2)).
    """
    target_V = model.prediction(prepare_word(target, word2index))
    similarities = []
    for word in vocab:
        if word == target:
            continue  # skip the probe word itself
        vector = model.prediction(prepare_word(word, word2index))
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0]
        similarities.append([word, cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10]
# Pick a random vocabulary word and show its nearest neighbours.
test = random.choice(list(vocab))
word_similarity(test, vocab)