以下是我训练word2vec中的代码,主要包括:
1、模型数据的生成,
2、模型训练,两种训练方式
第一种:处理成这样的格式:
二维列表:外层列表包含所有句子,内层每个列表是一个句子的token(已过滤掉停用词的词列表)
[[w1, w2], [w1,w2]]
#vocab就是上述的二维列表
model = word2vec.Word2Vec(vocab, size=200, window=5, min_count=3, workers=2)
第二种:和上边同样的处理,只是将所有句子的token用空格拼接之后写入txt文件中,一行是一个句子
sentences = word2vec.Text8Corpus(file_path)
print('数据加载完成, 开始训练模型')
model = word2vec.Word2Vec(sentences, size=200, window=5, min_count=3, workers=3)
3、模型的加载
此处我们一般都使用word2vec中提供的KeyedVectors来加载模型,实例代码如下:
if tencent_word2vec_path.endswith('txt'):
print('开始加载txt向量文件')
model = KeyedVectors.load_word2vec_format(tencent_word2vec_path, binary=False, unicode_errors='ignore', limit=500000)
else:
print("加载二进制模型")
model = KeyedVectors.load_word2vec_format(tencent_word2vec_path, binary=True)
from gensim.models import word2vec, KeyedVectors
import pickle
import pkuseg
import os
import gensim
def get_stopword_list(stopword_path):
    """Load stopwords from a file, one word per line.

    :param stopword_path: path to the stopword file (UTF-8, one word per line)
    :return: set of non-empty, stripped stopwords
    """
    words = set()
    with open(stopword_path, 'r', encoding='utf-8') as fh:
        for raw in fh:
            token = raw.strip()
            if token != '':
                words.add(token)
    return words
def create_vocab(path, stopword_list, seg):
    """Build the training corpus as a list of token lists ([[w1, w2], ...]).

    :param path: path to the sentence file (one sentence per line, UTF-8)
    :param stopword_list: collection of stopwords to filter out
    :param seg: segmenter object exposing a ``cut(text)`` method
    :return: the corpus; sentences with 5 or fewer tokens left after
        stopword filtering are dropped
    """
    corpus = []
    with open(path, 'r', encoding='utf-8') as fh:
        for sentence in fh:
            tokens = seg.cut(sentence)
            kept = [tok for tok in tokens if tok not in stopword_list]
            if len(kept) > 5:
                corpus.append(kept)
    print("共构建语料{}篇".format(len(corpus)))
    return corpus
def merge_all_stopword(dir_path, export_path):
    """Merge the per-source stopword files into a single file.

    :param dir_path: directory holding the original stopword ``.txt`` files
    :param export_path: where the merged stopword file is saved
    :return: set of all stopwords
    """
    stopwords = set()
    if os.path.exists(export_path):
        # Merged file already exists -- just load it back.
        with open(export_path, 'r', encoding='utf-8') as fh:
            for raw in fh:
                token = raw.strip()
                if token != '':
                    stopwords.add(token)
    else:
        # Collect the union of every *.txt stopword file in the directory.
        for name in os.listdir(dir_path):
            if not name.endswith('.txt'):
                continue
            with open(os.path.join(dir_path, name), 'r', encoding='utf-8') as fh:
                for raw in fh:
                    token = raw.strip()
                    if token != '':
                        stopwords.add(token)
        # One word per line, no trailing newline after the last word.
        with open(export_path, 'w', encoding='utf-8') as fh:
            fh.write('\n'.join(stopwords))
        print('共有{}个停用词'.format(len(stopwords)))
    return stopwords
def train_w2v(vocab, vecter_bin_file_path, vecter_txt_file_path):
    """Train a word2vec model on the corpus and save the vectors twice.

    :param vocab: corpus as a list of token lists ([[w1, w2], ...])
    :param vecter_bin_file_path: where the binary vector file is saved
    :param vecter_txt_file_path: where the plain-text vector file is saved
    :return: the trained Word2Vec model
    """
    # NOTE(review): `size=` is the gensim 3.x keyword; gensim 4+ renamed it
    # to `vector_size=` -- confirm which gensim version is installed.
    model = word2vec.Word2Vec(vocab, size=200, window=5, min_count=3, workers=2)
    model.wv.save_word2vec_format(vecter_bin_file_path, binary=True)   # binary format
    model.wv.save_word2vec_format(vecter_txt_file_path, binary=False)  # text format
    # Bug fixes: the original ended with `model.most_similar()` -- called
    # with no arguments, which raises TypeError -- and returned None even
    # though w2v_test() assigns and queries the return value.
    return model
def w2v_test(path, word, entity_dict, four_stopword_path, stopword_export_path, sentence_file_path):
    """Load the model if it exists, otherwise train one; then print the
    top-8 words most similar to ``word``.

    :param path: where the (binary) word-vector file is saved
    :param word: target word to find similar words for
    :param entity_dict: dict whose keys seed the segmenter's user dictionary
    :param four_stopword_path: directory holding the original stopword files
    :param stopword_export_path: merged stopword file location
    :param sentence_file_path: raw sentence file used to build the corpus
    :return: None
    """
    if os.path.exists(path):
        print('模型已存在,加载模型')
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
    else:
        seg = pkuseg.pkuseg(user_dict=list(entity_dict.keys()), model_name='medicine')
        stopword_list = merge_all_stopword(four_stopword_path, stopword_export_path)
        vocab = create_vocab(sentence_file_path, stopword_list, seg)
        print("======语料创建成功,训练词向量模型========")
        # NOTE(review): vecter_bin_file_path / vecter_txt_file_path are
        # module-level globals set in the __main__ block, not parameters --
        # this branch crashes if called before they are defined.
        train_w2v(vocab, vecter_bin_file_path, vecter_txt_file_path)
        # Bug fix: the original did `model = train_w2v(...)`, but train_w2v()
        # returned None, so most_similar() below crashed. Load the vectors
        # that train_w2v() just saved to disk instead.
        model = gensim.models.KeyedVectors.load_word2vec_format(
            vecter_bin_file_path, binary=True)
    print(model.most_similar(word, topn=8))
def train_w2v_use_file(file_path):
    """Train word2vec from a pre-tokenized corpus file.

    The file is expected to hold one sentence per line, with tokens
    separated by single spaces.

    :param file_path: path to the corpus text file
    :return: None
    """
    print("开始加载数据")
    # Bug fix: Text8Corpus ignores line boundaries and chops the whole file
    # into fixed-size word chunks (text8-style input); LineSentence is the
    # reader that matches the "one sentence per line" format described in
    # the notes at the top of this file.
    sentences = word2vec.LineSentence(file_path)
    print('数据加载完成, 开始训练模型')
    model = word2vec.Word2Vec(sentences, size=200, window=5, min_count=3, workers=3)
    # NOTE(review): these two output paths are module-level globals defined
    # in the __main__ block, not parameters of this function.
    model.wv.save_word2vec_format(vecter_bin_file_path, binary=True)   # binary format
    model.wv.save_word2vec_format(vecter_txt_file_path, binary=False)  # text format
if __name__ == '__main__':
    # Directory holding the original stopword files, and the main stopword file.
    four_stopword_path = r'D:\User\sdma\data\stopwords-master'
    stopword_path = r'D:\User\sdma\data\stopwords-master\cn_stopwords.txt'
    # Pickled entity dict; its keys seed the segmenter's user dictionary.
    entity_dict_path = './data/medical/entity_dict.pkl'
    sentence_file_path = './data/medical/sentence_file.txt'
    # NOTE: train_w2v_use_file() and w2v_test() read these two as globals.
    vecter_bin_file_path = './data/w2v/word2vec_talk.bin'
    vecter_txt_file_path = './data/w2v/word2vec_talk.txt'
    user_dict = './data/medical/user_dict.txt'
    stopword_export_path = './data/medical/stopword.txt'
    # Pre-tokenized corpus: one space-joined sentence per line.
    file_path = r'D:\User\sdma\data\Chinese-medical-dialogue-data-master\sentence_file.txt'
    # NOTE(review): pickle.load runs arbitrary code on untrusted data --
    # only load entity_dict.pkl from a trusted source.
    entity_dict = pickle.load(open(entity_dict_path, 'rb'))
    # # train_w2v(vocab, vecter_bin_file_path, vecter_txt_file_path)
    #
    # w2v_test(vecter_bin_file_path, '肚子疼', entity_dict, four_stopword_path, stopword_export_path, sentence_file_path)
    # Train on the first run; afterwards load the saved vectors and query them.
    if not os.path.exists(vecter_bin_file_path):
        train_w2v_use_file(file_path)
    else:
        # model = word2vec.Word2Vec.load_word2vec_format(vecter_bin_file_path, binary=True)
        model = KeyedVectors.load_word2vec_format(vecter_bin_file_path, binary=True)
        # model = KeyedVectors.load("./data/w2v/word2vec.bin")
        print(model.most_similar('睡觉流口水'))