NLP Learning 2: Implementing the Word2Vec Skip-Gram Model in Python

This post follows this tutorial.
I ran the program on a cloud GPU in Google Colaboratory.

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import nltk
import random
import numpy as np
from collections import Counter
flatten = lambda l: [item for sublist in l for item in sublist]  # flatten a nested list into a single flat list
	# other ways of flattening a list are covered in this tutorial:
	# (https://blog.csdn.net/weixin_40539892/article/details/79103290)
random.seed(1024)  # fix the random seed
USE_CUDA = torch.cuda.is_available()  # used later when moving the model to the GPU
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor  # used by prepare_word / prepare_sequence below
torch.cuda.get_device_name(0)  # check the GPU model, 'Tesla P100-PCIE-16GB'
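# Quick sanity check of flatten on a toy nested list (my own illustration, not from the tutorial):
print(flatten([['a', 'b'], ['c'], ['d', 'e', 'f']]))  # ['a', 'b', 'c', 'd', 'e', 'f']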

# Load the dataset
nltk.download('gutenberg')
nltk.corpus.gutenberg.fileids()
nltk.download('punkt')
corpus = list(nltk.corpus.gutenberg.sents('melville-moby_dick.txt'))[:100]
corpus = [[word.lower() for word in sent] for sent in corpus]  # lower-case every word in the corpus
corpus[0:2]  # inspect the first two sentences
	# [['[', 'moby', 'dick', 'by', 'herman', 'melville', '1851', ']'],
	#  ['etymology', '.']]
# The original program builds its own stopword list; I did not keep that approach here,
# but its idea and the methods it uses are worth noting:
# word_count = Counter(flatten(corpus))  # count every word and punctuation mark, e.g. 'a': 21
# border = int(len(word_count) * 0.01)
# stopwords = word_count.most_common()[:border] + list(reversed(word_count.most_common()))[:border]  # most_common() sorts words by count, from most to least frequent
# stopwords = [s[0] for s in stopwords]
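# A tiny illustration of Counter / most_common() on toy data (not from the corpus):
# Counter(['a', 'b', 'a', 'c', 'a', 'b']).most_common()      -> [('a', 3), ('b', 2), ('c', 1)]
# Counter(['a', 'b', 'a', 'c', 'a', 'b']).most_common()[:1]  -> [('a', 3)]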
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
vocab = list(set(flatten(corpus)) - set(stop_words)) # remove stopwords and build the vocabulary
vocab.append('<UNK>')
print(len(set(flatten(corpus))), len(vocab))  # 592 514		

word2index = {'<UNK>' : 0} 
for vo in vocab:    
	if word2index.get(vo) is None:        
		word2index[vo] = len(word2index)  # build the word-to-index mapping, e.g. {'<UNK>': 0, 'history': 1, 'patient': 2, 'side': 3...}

index2word = {v:k for k, v in word2index.items()}  # the reverse mapping, e.g. {0: '<UNK>', 1: 'history', 2: 'patient', 3: 'side'...}
# Set the window size
WINDOW_SIZE = 3
# Each window has length 2 * WINDOW_SIZE + 1 = 7; every sentence is padded with '<DUMMY>' tokens so that each word of the corpus gets a turn as the center word
windows = flatten([list(nltk.ngrams(['<DUMMY>'] * WINDOW_SIZE + c + ['<DUMMY>'] * WINDOW_SIZE, WINDOW_SIZE * 2 + 1)) for c in corpus])
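# Toy illustration (mine, not from the tutorial) of what the padding + nltk.ngrams call produces,
# shown with WINDOW_SIZE = 1 and a 3-token sentence:
print(list(nltk.ngrams(['<DUMMY>'] * 1 + ['moby', 'dick', '.'] + ['<DUMMY>'] * 1, 1 * 2 + 1)))
# [('<DUMMY>', 'moby', 'dick'), ('moby', 'dick', '.'), ('dick', '.', '<DUMMY>')]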
# Build the training set
train_data = []
for window in windows:  # iterate over every window, pairing the center word with each of its neighbors
    for i in range(WINDOW_SIZE * 2 + 1):
        if i == WINDOW_SIZE or window[i] == '<DUMMY>': 
            continue
        train_data.append((window[WINDOW_SIZE], window[i]))
print(train_data[:WINDOW_SIZE * 2])	 # [('[', 'moby'), ('[', 'dick'), ('[', 'by'), ('moby', '['), ('moby', 'dick'), ('moby', 'by')]
# The training set is a list of (center word, context word) tuples

def prepare_word(word, word2index):  # wrap a word's vocabulary index in a tensor that can take part in backprop; unknown words fall back to '<UNK>'
	return Variable(LongTensor([word2index[word]]) if word2index.get(word) is not None else LongTensor([word2index["<UNK>"]]))    
X_p = []
y_p = []    
for tr in train_data:
    X_p.append(prepare_word(tr[0], word2index).view(1, -1))  # split center words and labels, reshaping each index to 2-D (1 x 1)
    y_p.append(prepare_word(tr[1], word2index).view(1, -1))
train_data = list(zip(X_p, y_p))  # re-pack the dataset; it now stores index tensors instead of strings
len(train_data)  # 7606
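# Quick check of prepare_word (illustrative): an in-vocabulary word maps to its own index,
# anything unseen falls back to '<UNK>'. The exact index value depends on the vocab order.
print(prepare_word('moby', word2index))      # a tensor holding the index of 'moby'
print(prepare_word('notaword', word2index))  # a tensor holding 0, i.e. '<UNK>'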

# Define the model
class Skipgram(nn.Module):
    
    def __init__(self, vocab_size, projection_dim):
        super(Skipgram,self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, projection_dim)
        self.embedding_u = nn.Embedding(vocab_size, projection_dim)

        self.embedding_v.weight.data.uniform_(-1, 1) # initialize the input (center-word) embedding weights
        self.embedding_u.weight.data.uniform_(0, 0) # initialize the output (context-word) embedding weights; I don't understand why these are initialized to 0
    def forward(self, center_words,target_words, outer_words):
        center_embeds = self.embedding_v(center_words) # batch_size x 1 x n, where n is the embedding dimension
        target_embeds = self.embedding_u(target_words) # batch_size x 1 x n
        outer_embeds = self.embedding_u(outer_words) # batch_size x V x n, where V is the vocabulary size
        
        scores = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # batch_size x 1 x n * batch_size x n x 1 => batch_size x 1; bmm() is batched matrix multiplication
        norm_scores = outer_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2) # batch_size x V
        # Define the loss
        nll = -torch.mean(torch.log(torch.exp(scores)/torch.sum(torch.exp(norm_scores), 1).unsqueeze(1))) # log-softmax
        
        return nll # negative log likelihood
    
    def prediction(self, inputs):
        embeds = self.embedding_v(inputs)
        
        return embeds 
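# Note: the nll above is the full-softmax negative log-likelihood written out by hand,
#   -log( exp(score_target) / sum_w exp(score_w) ).
# A mathematically equivalent but numerically more stable variant (my own rewrite, not the tutorial's code) would be:
#   nll = torch.mean(torch.logsumexp(norm_scores, 1, keepdim=True) - scores)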
# Start training
EMBEDDING_SIZE = 30
BATCH_SIZE = 256
EPOCH = 100
model = Skipgram(len(word2index), EMBEDDING_SIZE)
if USE_CUDA:
    model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr=0.01)
# Define the batching function
def getBatch(batch_size, train_data):
    random.shuffle(train_data)
    sindex = 0
    eindex = batch_size
    while eindex < len(train_data):
        batch = train_data[sindex: eindex]
        temp = eindex
        eindex = eindex + batch_size
        sindex = temp
        yield batch
    
    if eindex >= len(train_data):
        batch = train_data[sindex:]
        yield batch
# I wanted to batch the data with torch's built-in DataLoader,
# but after a series of dtype conversions and dimension changes it still didn't work out, so I kept the original author's hand-written batching function; a rough DataLoader sketch is included below for reference
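# A minimal sketch (my own attempt, untested against the rest of this script) of how the same
# batching might be done with TensorDataset + DataLoader:
# from torch.utils.data import TensorDataset, DataLoader
# dataset = TensorDataset(torch.cat(X_p), torch.cat(y_p))           # inputs and targets, each of shape (num_pairs, 1)
# loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
# for inputs, targets in loader:
#     ...  # each batch already comes out with shape (batch_size, 1)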
def prepare_sequence(seq, word2index):  # map a whole sequence of words to their vocabulary indices in one long tensor that can take part in backprop
    idxs = list(map(lambda w: word2index[w] if word2index.get(w) is not None else word2index["<UNK>"], seq))
    return Variable(LongTensor(idxs))
            
losses = []
for epoch in range(EPOCH):
    for i, batch in enumerate(getBatch(BATCH_SIZE, train_data)):
        
        inputs, targets = zip(*batch)
        
        inputs = torch.cat(inputs) # batch_size x 1
        targets = torch.cat(targets) # batch_size x 1
        vocabs = prepare_sequence(list(vocab), word2index).expand(inputs.size(0), len(vocab))  # batch_size x V
        model.zero_grad()
        loss = model(inputs, targets, vocabs)        
        loss.backward()
        optimizer.step()
        losses.append(loss.item())  # keep track of the per-batch loss
    if epoch % 10 == 0:
        print("Epoch : %d, mean_loss : %.02f" % (epoch, np.mean(losses)))
        losses = []

# After training, we test the model by measuring the cosine similarity between two words
def word_similarity(target, vocab):
    target_V = model.prediction(prepare_word(target, word2index))  # embedding of the query word
    similarities = []
    for i in range(len(vocab)):
        if vocab[i] == target: continue
        
        vector = model.prediction(prepare_word(vocab[i], word2index))  # embedding of every other vocabulary word
        cosine_sim = F.cosine_similarity(target_V, vector).data.tolist()[0] 
        similarities.append([vocab[i], cosine_sim])
    return sorted(similarities, key=lambda x: x[1], reverse=True)[:10] # sort by similarity, largest first, and keep the top 10

test = random.choice(list(vocab))
word_similarity(test, vocab)                        	
