NLP tasks often need pretrained word vectors, and gensim provides a ready-made API for loading them. However, it does not cover every need, for example the handling of unknown (out-of-vocabulary) words. To address this, I reworked the loading code based on the gensim source. This post covers two parts: 1. the original gensim loading approach; 2. my modified approach. The demo uses Tencent's pretrained word vectors, Tencent_AILab_ChineseEmbedding.txt.
Loading pretrained word vectors with gensim
from gensim.models import KeyedVectors

emb_file = 'Tencent_AILab_ChineseEmbedding.txt'
# Load the pretrained word vectors
word_vectors = KeyedVectors.load_word2vec_format(emb_file, binary=False, encoding="utf-8", unicode_errors="ignore")
# Get the vocabulary, i.e. the word-to-id mapping
vocab = word_vectors.vocab
# Get the id-to-word mapping
index2word = word_vectors.index2word
# Get the corresponding Keras embedding layer
embedding_layer = word_vectors.get_keras_embedding(train_embeddings=False)
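For context, a minimal sketch of how these objects fit together (assuming gensim 3.x, where vocab maps each word to a Vocab object carrying its index; the tokenized sentence below is a made-up example):

# Map a tokenized sentence to the index sequence the embedding layer expects
sentence = ['我们', '喜欢', '自然语言处理']      # hypothetical tokenized input
ids = [vocab[w].index for w in sentence]         # word -> id via the Vocab objects
words = [index2word[i] for i in ids]             # ids map back to the original words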
The main problem here is that the pretrained vectors contain no entry for unknown words: any word outside the pretrained vocabulary has no index and no row in the embedding matrix, which is a real defect when the result is used from Keras.
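A quick illustration of the gap (the OOV word below is just an assumed example):

# Any word that is not in the pretrained file has no index at all
oov_word = '预训练词表之外的新词'   # assumed out-of-vocabulary example
print(oov_word in vocab)             # False
# vocab[oov_word] raises KeyError, and the embedding matrix has no
# reserved <unk> row to fall back to, so OOV tokens cannot be encoded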
Modified way of loading the pretrained word vectors
The main addition is handling of unknown words, so the result can be dropped directly into Keras to build the embedding layer (see the usage sketch after the class).
import logging
import codecs
from collections import defaultdict
from gensim.models import KeyedVectors
import numpy as np

logger = logging.getLogger(__name__)


class WordEmbeddingKeyedVectors:
    def __init__(self):
        self.vocab = defaultdict(int)
        self.index2word = []

    def load_word2vec_format(self, fname, limit=None, num_trainable_tokens=3):
        """
        :param fname: path to the word2vec text-format file
        :param limit: optional cap on the total vocabulary size
        :param num_trainable_tokens: the first num_trainable_tokens entries of the vocabulary
               are reserved as trainable tokens, by default "<unk>", "<s>", "</s>"
        :return:
        """
        logger.info("loading projection weights from %s", fname)
        with codecs.open(fname, 'r', 'utf8') as fr:
            header = fr.readline()
            # Read the word count and vector dimension from the header
            vocab_size, vector_size = (int(x) for x in header.split())
            # Enlarge the vocabulary by the reserved tokens
            vocab_size = vocab_size + num_trainable_tokens
            if limit:
                vocab_size = min(vocab_size, limit)
            self.vectors = np.zeros((vocab_size, vector_size), dtype=np.float32)

            def add_word(word, vec):
                word_id = len(self.vocab)
                if word in self.vocab:
                    logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname)
                    return
                self.vocab[word] = word_id
                self.vectors[word_id] = vec
                self.index2word.append(word)

            # Randomly initialise the reserved tokens "<unk>", "<s>", "</s>"
            trainable_tokens = ['<unk>', '<s>', '</s>']
            trainable_tokens_vec = np.random.uniform(-1, 1, (num_trainable_tokens, vector_size))
            for line_no in range(num_trainable_tokens):
                add_word(trainable_tokens[line_no], trainable_tokens_vec[line_no])

            # Read the pretrained word vectors
            for line_no in range(num_trainable_tokens, vocab_size):
                line = fr.readline()
                parts = line.rstrip().split(" ")
                if len(parts) != vector_size + 1:
                    logger.error("invalid vector on line %s (is this really the text format?)", line_no)
                    continue
                word = parts[0]
                vec = [np.float32(x) for x in parts[1:]]
                add_word(word, vec)

        # Sanity check: shrink the matrix if duplicates or bad lines were skipped
        if self.vectors.shape[0] != len(self.vocab):
            logger.info(
                "duplicate words detected, shrinking matrix size from %i to %i",
                self.vectors.shape[0], len(self.vocab)
            )
            self.vectors = np.ascontiguousarray(self.vectors[:len(self.vocab)])
        assert (len(self.vocab), vector_size) == self.vectors.shape
        logger.info("loaded %s matrix from %s", self.vectors.shape, fname)
        self.vocab_size = len(self.vocab)
        self.vector_size = vector_size

    def get_keras_embedding(self, train_embeddings=False):
        try:
            from keras.layers import Embedding
        except ImportError:
            raise ImportError("Please install Keras to use this function")
        weights = self.vectors
        layer = Embedding(
            input_dim=weights.shape[0], output_dim=weights.shape[1],
            weights=[weights], trainable=train_embeddings
        )
        return layer
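A rough usage sketch of the class above (the file path, limit and tokenized sentence are only placeholders):

logging.basicConfig(level=logging.INFO)

wv = WordEmbeddingKeyedVectors()
wv.load_word2vec_format('Tencent_AILab_ChineseEmbedding.txt', limit=500000)
embedding_layer = wv.get_keras_embedding(train_embeddings=False)

# Unknown words can now fall back to the reserved <unk> index (id 0)
unk_id = wv.vocab['<unk>']
sentence = ['我们', '喜欢', '自然语言处理']      # hypothetical tokenized input
ids = [wv.vocab.get(w, unk_id) for w in sentence]

Here wv.vocab maps each word directly to its integer id, so the index sequence can be padded and fed straight into a Keras model that starts with embedding_layer.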