TokenEmbedder: a custom Embedder
class GloVeEmbedding(TokenEmbedder):
function
- Used the same way as Embedding; the code is copied directly from Embedding's implementation.
- Each word is split into its individual chars; char vector = sum(vector) / counter(char), i.e. the sum of the vectors of all pretrained words containing that char, divided by the char's occurrence count.
- In the original Embedding implementation the weight matrix starts out random; only words that exist in GloVe get their rows replaced with GloVe vectors, and words missing from GloVe keep their random rows.
- GloVeEmbedding instead uses the sum of char vectors as the word vector for words missing from GloVe, avoiding random vectors and making the vectors of words absent from the pretrained file as reasonable as possible (see the toy example after this list).
- In practice, with a BiDAF model and num_epochs=1, the initial result is not necessarily better, but train_start_acc improves somewhat; this mostly just served as a test of the custom Embedding.
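A toy illustration of the two rules above, with hypothetical two-dimensional vectors (not real GloVe values):

import numpy

# Hypothetical pretrained vectors for two words.
embeddings = {"ab": numpy.array([1.0, 2.0]), "bc": numpy.array([3.0, 4.0])}

# char vector = sum of the vectors of words containing the char / occurrence count
char_sums, char_counts = {}, {}
for word, vec in embeddings.items():
    for ch in word:
        char_sums[ch] = char_sums.get(ch, 0) + vec
        char_counts[ch] = char_counts.get(ch, 0) + 1
char_vectors = {ch: char_sums[ch] / char_counts[ch] for ch in char_sums}
# 'a' -> [1, 2]; 'c' -> [3, 4]; 'b' appears twice: ([1,2] + [3,4]) / 2 = [2, 3]

# The OOV word "ca" is composed as the sum of its char vectors: [3,4] + [1,2] = [4, 6]
print(sum(char_vectors[ch] for ch in "ca"))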
code
import logging

import numpy
import torch

from allennlp.common.checks import ConfigurationError
from allennlp.common.tqdm import Tqdm
from allennlp.data import Vocabulary
from allennlp.modules.token_embedders import TokenEmbedder
from allennlp.modules.token_embedders.embedding import EmbeddingsTextFile

logger = logging.getLogger(__name__)


@TokenEmbedder.register("glove_embedding")  # matches the "type" field under "tokens" in the json config
class GloVeEmbedding(TokenEmbedder):
    # GloVeEmbedding is copied verbatim from Embedding; the only difference is that
    # it calls the pretrained-file reader below instead of the original one.
    # _read_embeddings_from_text_file is modified in three places, marked 1./2./3.
    ...


def _read_embeddings_from_text_file(
    file_uri: str, embedding_dim: int, vocab: Vocabulary, namespace: str = "tokens"
) -> torch.FloatTensor:
    """
    Read pre-trained word vectors from an eventually compressed text file, possibly contained
    inside an archive with multiple files. The text file is assumed to be utf-8 encoded with
    space-separated fields: [word] [dim 1] [dim 2] ...
    Lines whose number of value fields does not match `embedding_dim` raise a warning and are
    skipped. The remainder of the docstring is identical to `_read_pretrained_embeddings_file`.
    """
    tokens_to_keep = set(vocab.get_index_to_token_vocabulary(namespace).values())
    vocab_size = vocab.get_vocab_size(namespace)
    char_embeddings = {}  # 1. accumulate per-char statistics alongside the word embeddings
    embeddings = {}

    # First we read the embeddings from the file, only keeping vectors for the words we need.
    logger.info("Reading pretrained embeddings from file")
    with EmbeddingsTextFile(file_uri) as embeddings_file:
        for line in Tqdm.tqdm(embeddings_file):
            token = line.split(" ", 1)[0]
            if token in tokens_to_keep:
                fields = line.rstrip().split(" ")
                if len(fields) - 1 != embedding_dim:
                    # Sometimes there are funny unicode parsing problems that lead to different
                    # fields lengths (e.g., a word with a unicode space character that splits
                    # into more than one column). We skip those lines. Note that if you have
                    # some kind of long header, this could result in all of your lines getting
                    # skipped. It's hard to check for that here; you just have to look in the
                    # embedding_misses_file and at the model summary to make sure things look
                    # like they are supposed to.
                    logger.warning(
                        "Found line with wrong number of dimensions (expected: %d; actual: %d): %s",
                        embedding_dim,
                        len(fields) - 1,
                        line,
                    )
                    continue

                vector = numpy.asarray(fields[1:], dtype="float32")
                # 2. For every character of the token, accumulate (sum of word vectors, count).
                for char in token:
                    if char in char_embeddings:
                        char_embeddings[char] = (
                            char_embeddings[char][0] + vector,
                            char_embeddings[char][1] + 1,
                        )
                    else:
                        char_embeddings[char] = (vector, 1)
                embeddings[token] = vector

    if not embeddings:
        raise ConfigurationError(
            "No embeddings of correct dimension found; you probably "
            "misspecified your embedding_dim parameter, or didn't "
            "pre-populate your Vocabulary"
        )

    # char vector = vector sum / occurrence count
    char_embeddings = {
        char: vector_sum / count for char, (vector_sum, count) in char_embeddings.items()
    }
    chars = set(char_embeddings.keys())

    all_embeddings = numpy.asarray(list(embeddings.values()))
    embeddings_mean = float(numpy.mean(all_embeddings))
    embeddings_std = float(numpy.std(all_embeddings))
    # Now we initialize the weight matrix for an embedding layer, starting with random vectors,
    # then filling in the word vectors we just read.
    logger.info("Initializing pre-trained embedding layer")
    embedding_matrix = torch.FloatTensor(vocab_size, embedding_dim).normal_(
        embeddings_mean, embeddings_std
    )
    num_tokens_found = 0
    index_to_token = vocab.get_index_to_token_vocabulary(namespace)
    for i in range(vocab_size):
        token = index_to_token[i]

        # If we don't have a pre-trained vector for this word, we'll just leave this row alone,
        # so the word has a random initialization.
        if token in embeddings:
            embedding_matrix[i] = torch.FloatTensor(embeddings[token])
            num_tokens_found += 1
        elif len(set(token) - chars) == 0:
            # 3. If every character of the token has a char vector, use the sum of those
            # char vectors as the word vector, and count the token as found.
            embedding_matrix[i] = torch.FloatTensor(
                numpy.stack([char_embeddings[char] for char in token])
            ).sum(dim=0)
            num_tokens_found += 1
        else:
            logger.debug(
                "Token %s was not found in the embedding file. Initialising randomly.", token
            )

    logger.info(
        "Pretrained embeddings were found for %d out of %d tokens", num_tokens_found, vocab_size
    )

    return embedding_matrix
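A minimal sanity check of the new elif branch, assuming the function above is importable and AllenNLP is installed (the two-token vocabulary and one-line toy file are hypothetical):

import os
import tempfile

from allennlp.data import Vocabulary

vocab = Vocabulary()
vocab.add_token_to_namespace("ab", namespace="tokens")  # present in the toy file
vocab.add_token_to_namespace("ba", namespace="tokens")  # absent, but 'a' and 'b' are covered

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("ab 1.0 2.0\n")
    path = f.name

matrix = _read_embeddings_from_text_file(path, embedding_dim=2, vocab=vocab)
# Only "ab" contributes char statistics, so 'a' and 'b' are both [1.0, 2.0],
# and the OOV word "ba" gets their sum: [2.0, 4.0].
print(matrix[vocab.get_token_index("ba", namespace="tokens")])
os.unlink(path)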
.json
{
    "token_embedders": {
        "tokens": {
            "type": "glove_embedding",
            "trainable": false,
            "embedding_dim": 300,
            "pretrained_file": "path/glove.6B.300d.txt"
        }
    }
}
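For context, in a full training config this fragment normally sits inside the model's text_field_embedder; a hedged sketch of the surrounding nesting for a BiDAF-style model (only token_embedders and the "type" value come from this note, the rest is standard AllenNLP structure):

"model": {
    "type": "bidaf",
    "text_field_embedder": {
        "token_embedders": {
            "tokens": {
                "type": "glove_embedding",
                ...
            }
        }
    }
}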
command line
allennlp train -s allennlp_model/model -f --include-package allennlp_model.embedder.glove_embedder allennlp_model/run_glove_embedder.json
--include-package path1.path2.py_file_name: the dotted module path (no .py extension) of the file that defines and registers the custom class.
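The dotted path must be importable from the directory where allennlp is launched; the command above implies roughly this layout (hypothetical file names inferred from the command):

allennlp_model/
    __init__.py
    run_glove_embedder.json
    embedder/
        __init__.py
        glove_embedder.py    # defines and registers GloVeEmbedding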