文章目录
简介
2013年Tomas Mikolov等人提出word2vec之后,2014年Jeffrey Pennington、Richard Socher、Christopher D. Manning三人提出了GloVe算法。GloVe是Global Vectors(全局向量)的缩写。传统上,实现word embedding(词嵌入)主要有两类方法:Matrix Factorization Methods(矩阵分解方法,例如LSA)和Shallow Window-Based Methods(基于浅层窗口的方法,例如word2vec)。二者各有优缺点,而GloVe结合了两者的优点。从论文的实验结果可以看到,GloVe的效果优于word2vec等方法。
GloVe算法是一种基于全局词频(词与词的共现次数)统计的回归算法。它不基于神经网络,而是基于(加权)最小二乘原理的回归方法。
Glove论文:Jeffrey Pennington, Richard Socher, and Christopher Manning. 2014. Glove: Global Vectors for Word Representation. In Proceedings of the Conference on Empirical Methods in Natural Language Processing. ACL, 1532–1543.
代码:来自论文2018-MM-Cross-modal Moment Localization in Videos的代码https://acmmm18.wixsite.com/role
该代码最早出现在2018-CVPR-Grounding Referring Expressions in Images by Variational Context的代码 https://github.com/yuleiniu/vc/ 中
使用
若直接使用vocabulary_72700.txt和embed_matrix.npy,注意在转换前先将单词转为小写,否则可能转换失败
功能:将多个句子list(str)转为300d float32词嵌入,支持72700个英语单词(不含pad、go、eos、unk)
if T >= 0:
tensor shape[句子数,T, 300]
else:
list(tensor) shape[句子数,该句单词数, 300]
项目地址:
链接:https://pan.baidu.com/s/1bmcrRCeQy7vNbxW-f2E1sg?pwd=7dl9
提取码:7dl9
"""
vocabulary_72700.txt 72704个单词,开始4个为
<pad> padding
<go> start of sequence
<eos> end of sequence
<unk> unknown words
"""
import io
import numpy as np
import re
import torch
def glove_preprocess(sentence, vocab_dict, T=-1, padding_at_first=False):
    """
    Tokenize a sentence and encode it as a list of vocabulary indices.

    Steps: decode bytes input, split into word tokens and punctuation tokens,
    lowercase, drop one trailing '.' or '?', map each token to its index
    (tokens missing from ``vocab_dict`` map to ``vocab_dict['<unk>']``), then
    optionally truncate / pad to exactly ``T`` entries.

    :param sentence: sentence to encode, str or bytes
    :param vocab_dict: encoding dictionary str -> int; '<unk>' is looked up
        only when an unknown token occurs, '<pad>' only when padding is needed
    :param T: target length; a negative value (default -1) disables both
        truncation and padding
    :param padding_at_first: when T >= 0, False pads at the end and True pads
        at the front
    :return: list(int)
        if T < 0: [number of tokens in sentence] else: [T]
    :raises KeyError: if '<unk>' / '<pad>' is required but absent from
        ``vocab_dict``
    """
    # If bytes, convert to str first.
    if isinstance(sentence, bytes):
        sentence = sentence.decode()

    # A token is either a run of word characters or a run of punctuation that
    # contains no whitespace. Splitting on r'(\W+)' instead would keep
    # separators such as ", " with their whitespace attached, so they could
    # never match a vocabulary entry like ",".
    words = [w.lower() for w in re.findall(r'\w+|[^\w\s]+', sentence)]

    # Sentence-final '.' / '?' is not encoded.
    if words and words[-1] in ('.', '?'):
        words = words[:-1]

    # Encode (str -> int).
    vocab_indices = [
        vocab_dict[w] if w in vocab_dict else vocab_dict['<unk>']
        for w in words
    ]

    if T >= 0:
        missing = T - len(vocab_indices)
        if missing < 0:
            vocab_indices = vocab_indices[:T]
        elif missing > 0:
            padding = [vocab_dict['<pad>']] * missing
            if padding_at_first:
                vocab_indices = padding + vocab_indices
            else:
                vocab_indices = vocab_indices + padding
    return vocab_indices
def glove(sentences, T=-1, padding_at_first=False):
    """
    Convert one or more sentences into word-embedding tensors.

    The vocabulary is read from 'vocabulary_72700.txt' (one token per line,
    line number = index) and the embedding table from 'embed_matrix.npy';
    both paths are relative to the current working directory and are loaded
    on every call.

    :param sentences: str / bytes (a single sentence) or list of sentences
    :param T: target length per sentence; a negative value (default -1)
        disables truncation and padding
    :param padding_at_first: when T >= 0, False pads at the end and True pads
        at the front
    :return: dtype follows the stored embedding matrix
        (NOTE(review): presumably float32 with 300 columns -- confirm against
        embed_matrix.npy)
        if T >= 0:
            tensor shape [number of sentences, T, dim]
        else:
            list(tensor), each of shape [number of tokens in that sentence, dim]
    """
    # A single sentence is wrapped so the loop below handles both call styles.
    if isinstance(sentences, (str, bytes)):
        sentences = [sentences]

    # Load the encoding dictionary str -> int.
    with io.open('vocabulary_72700.txt', encoding='utf-8') as f:
        vocab_dict = {line.strip(): index for index, line in enumerate(f)}

    # Load the embedding table int -> vector.
    embedding_mat = np.load('embed_matrix.npy')

    result = []
    for sentence in sentences:
        vocab_indices = glove_preprocess(sentence, vocab_dict, T, padding_at_first)
        # One integer-array lookup instead of a per-word loop. The explicit
        # integer dtype keeps an empty sentence at shape [0, dim] rather than
        # collapsing to a shapeless empty array.
        rows = embedding_mat[np.asarray(vocab_indices, dtype=np.intp)]
        result.append(torch.tensor(rows))

    if T < 0:
        return result
    if not result:
        # torch.stack rejects an empty list; return an empty batch instead.
        return torch.tensor(embedding_mat[:0]).reshape(0, T, embedding_mat.shape[1])
    return torch.stack(result)
if __name__ == '__main__':
    # Demo: embed two sentences, first padded at the front to 10 tokens each,
    # then without any padding (one tensor per sentence).
    demo_sentences = [
        'Person sets large mug on counter.',
        'The person disposes of the egg shell into the wastebin.',
    ]
    padded_batch = glove(demo_sentences, 10, padding_at_first=True)
    unpadded_list = glove(demo_sentences)
    for embedded in (padded_batch, unpadded_list):
        print(embedded)
使用torchtext
import torch
import torchtext
from torch import nn
# The table holds 400000 words; an all-zero 300-d <unk> entry is appended at
# index 400000, giving 400001 entries in total.
# The `cache` argument is optional. Available pretrained vector files:
# charngram.100d fasttext.en.300d fasttext.simple.300d glove.42B.300d glove.840B.300d
# glove.twitter.27B.25d glove.twitter.27B.50d glove.twitter.27B.100d glove.twitter.27B.200d
# glove.6B.50d glove.6B.100d glove.6B.200d glove.6B.300d
vocab = torchtext.vocab.pretrained_aliases["glove.6B.300d"](cache='../.vector_cache')
# Register <unk> at the first free index (the current row count) before the
# table grows, then append the matching zero row.
vocab.stoi['<unk>'] = vocab.vectors.shape[0]
vocab.itos.append('<unk>')
vocab.vectors = torch.cat((vocab.vectors, torch.zeros(1, vocab.dim)), dim=0)
word_embedding = nn.Embedding.from_pretrained(vocab.vectors)
def fun(sentence):
    """
    Look up the embedding of every whitespace-separated word in a sentence.

    Words are lowercased before lookup; a word missing from ``vocab.stoi``
    maps to the '<unk>' entry. Punctuation is not split off, so a token such
    as 'student.' is looked up as-is.

    :param sentence: str, words separated by whitespace
    :return: (word_idxs, word_vectors) -- a long tensor with one index per
        word, and the result of ``word_embedding(word_idxs)``
    """
    # Read the <unk> index from the vocabulary instead of hard-coding 400000,
    # which is only correct for the glove.6B tables.
    unk_index = vocab.stoi['<unk>']
    word_idxs = torch.tensor(
        [vocab.stoi.get(w.lower(), unk_index) for w in sentence.split()],
        dtype=torch.long,
    )
    word_vectors = word_embedding(word_idxs)
    return word_idxs, word_vectors
# Demo: embed one sentence and print its indices followed by its vectors.
sentence = 'I am a student.'
word_idxs, word_vectors = fun(sentence)
for output in (word_idxs, word_vectors):
    print(output)