使用 GloVe 预训练词向量构建词嵌入矩阵

Chloris_

于 2020-12-24 00:27:57 发布

阅读量1k

点赞数

本文链接：https://blog.csdn.net/Chloris_/article/details/111602400

版权

import codecs

import numpy as np
def read_glove_vecs(glove_file):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line is split on whitespace: the first token is taken as
    the word and every remaining token is parsed as one vector component.

    Args:
        glove_file: Path of the vector file; it is opened as UTF-8 text.

    Returns:
        A ``(words, word_to_vec_map)`` tuple where ``words`` is the set of
        all words read and ``word_to_vec_map`` maps each word to a
        ``numpy.ndarray`` of dtype ``float64``. If a word occurs on several
        lines, the last occurrence wins.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    words = set()
    word_to_vec_map = {}

    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            # split() with no separator already ignores leading/trailing
            # whitespace, so no explicit strip() is needed.
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. a trailing empty line): there is no word
                # to index, and tokens[0] would raise IndexError.
                continue
            curr_word = tokens[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(tokens[1:], dtype=np.float64)

    return words, word_to_vec_map

def build_matrix(word_index, path):
    """Build an embedding matrix for ``word_index`` from a vector file.

    Args:
        word_index: Mapping from word to integer row index. Indices must be
            smaller than ``len(word_index) + 1``; row 0 is left as zeros
            unless some word maps to it.
        path: Path of the pretrained vector file, passed to
            ``read_glove_vecs``.

    Returns:
        A ``numpy.ndarray`` of shape ``(len(word_index) + 1, dim)``. Row
        ``i`` holds the vector of the word whose index is ``i``; words that
        are missing from the vector file keep an all-zero row.
    """
    _, embedding_index = read_glove_vecs(path)

    # Infer the vector width from the loaded data instead of hard-coding 50,
    # so 100d/200d/300d files work too. Taking the maximum length means a
    # short leading header row (as found in fastText .vec files) does not
    # determine the width. Fall back to 50 when no vectors were loaded.
    embedding_dim = max(
        (len(vector) for vector in embedding_index.values()),
        default=50,
    )

    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        vector = embedding_index.get(word)
        if vector is not None:
            embedding_matrix[i] = vector
    return embedding_matrix


# The second argument is the path to the pretrained GloVe/fastText vector
# file you downloaded.
# NOTE(review): `tokenizer` is not defined in this snippet; presumably a
# fitted tokenizer exposing a `word_index` dict -- confirm against the caller.
embedding_matrix = build_matrix(tokenizer.word_index, r'D:/Python/data/glove.6B.50d.txt')