以下是我训练word2vec中的代码,主要包括:
1、模型数据的生成,
2、模型训练,两种训练方式
第一种:处理成这样的格式:
二维列表:外层列表包含所有句子,内层每个列表是一个句子的token(已过滤掉停用词的词列表)
[[w1, w2], [w1,w2]]
#vocab就是上述的二维列表
model = word2vec.Word2Vec(vocab, size=200, window=5, min_count=3, workers=2)
第二种:和上边同样的处理,只是将所有句子的token用空格拼接之后写入txt文件中,一行是一个句子
sentences = word2vec.Text8Corpus(file_path)
print('数据加载完成, 开始训练模型')
model = word2vec.Word2Vec(sentences, size=200, window=5, min_count=3, workers=3)
3、模型的加载
此处我们一般都使用word2vec中提供的KeyedVectors来加载模型,实例代码如下:
if tencent_word2vec_path.endswith('txt'):
print('开始加载txt向量文件')
model = KeyedVectors.load_word2vec_format(tencent_word2vec_path, binary=False, unicode_errors='ignore', limit=500000)
else:
print("加载二进制模型")
model = KeyedVectors.load_word2vec_format(tencent_word2vec_path, binary=True)
from gensim.models import word2vec, KeyedVectors
import pickle
import pkuseg
import os
import gensim
def get_stopword_list(stopword_path):
    """Load stopwords from a file, one word per line.

    :param stopword_path: path to the stopword file (UTF-8, one word per line)
    :return: set of non-empty, stripped stopwords
    """
    words = set()
    with open(stopword_path, 'r', encoding='utf-8') as fh:
        for raw in fh:
            token = raw.strip()
            if token != '':
                words.add(token)
    return words
def create_vocab(path, stopword_list, seg):
    """Build the training corpus as a list of token lists ([[w1, w2], ...]).

    :param path: path to the sentence file (one sentence per line, UTF-8)
    :param stopword_list: collection of stopwords to filter out
    :param seg: segmenter object exposing a ``cut(text)`` method
    :return: the corpus; sentences with 5 or fewer tokens left after
        stopword filtering are dropped
    """
    corpus = []
    with open(path, 'r', encoding='utf-8') as fh:
        for sentence in fh:
            tokens = seg.cut(sentence)
            kept = [tok for tok in tokens if tok not in stopword_list]
            if len(kept) > 5:
                corpus.append(kept)
    print("共构建语料{}篇".format(len(corpus)))
    return corpus
def merge_all_stopword(dir_path, export_path):
    """Merge the per-source stopword files into a single file.

    :param dir_path: directory holding the original stopword ``.txt`` files
    :param export_path: where the merged stopword file is saved
    :return: set of all stopwords
    """
    stopwords = set()
    if os.path.exists(export_path):
        # Merged file already exists -- just load it back.
        with open(export_path, 'r', encoding='utf-8') as fh:
            for raw in fh:
                token = raw.strip()
                if token != '':
                    stopwords.add(token)
    else:
        # Collect the union of every *.txt stopword file in the directory.
        for name in os.listdir(dir_path):
            if not name.endswith('.txt'):
                continue
            with open(os.path.join(dir_path, name), 'r', encoding='utf-8') as fh:
                for raw in fh:
                    token = raw.strip()
                    if token != '':
                        stopwords.add(token)
        # One word per line, no trailing newline after the last word.
        with open(export_path, 'w', encoding='utf-8') as fh:
            fh.write('\n'.join(stopwords))
        print('共有{}个停用词'.format(len(stopwords)))
    return stopwords
def train_w2v(vocab, vecter_bin_file_path, vecter_txt_file_path):
    """Train a word2vec model on the corpus and save the vectors twice.

    :param vocab: corpus as a list of token lists ([[w1, w2], ...])
    :param vecter_bin_file_path: where the binary vector file is saved
    :param vecter_txt_file_path: where the plain-text vector file is saved
    :return: the trained Word2Vec model
    """
    # NOTE(review): `size=` is the gensim 3.x keyword; gensim 4+ renamed it
    # to `vector_size=` -- confirm which gensim version is installed.
    model = word2vec.Word2Vec(vocab, size=200, window=5, min_count=3, workers=2)
    model.wv.save_word2vec_format(vecter_bin_file_path, binary=True)   # binary format
    model.wv.save_word2vec_format(vecter_txt_file_path, binary=False)  # text format
    # Bug fixes: the original ended with `model.most_similar()` -- called
    # with no arguments, which raises TypeError -- and returned None even
    # though w2v_test() assigns and queries the return value.
    return model
def w2v_test(path, word, entity_dict, four_stopword_path, stopword_export_path, sentence_file_path):
    """Load the model if it exists, otherwise train one; then print the
    top-8 words most similar to ``word``.

    :param path: where the (binary) word-vector file is saved
    :param word: target word to find similar words for
    :param entity_dict: dict whose keys seed the segmenter's user dictionary
    :param four_stopword_path: directory holding the original stopword files
    :param stopword_export_path: merged stopword file location
    :param sentence_file_path: raw sentence file used to build the corpus
    :return: None
    """
    if os.path.exists(path):
        print('模型已存在,加载模型')
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
    else:
        seg = pkuseg.pkuseg(user_dict=list(entity_dict.keys()), model_name='medicine')
        stopword_list = merge_all_stopword(four_stopword_path, stopword_export_path)
        vocab = create_vocab(sentence_file_path, stopword_list, seg)
        print("======语料创建成功,训练词向量模型========")
        # NOTE(review): vecter_bin_file_path / vecter_txt_file_path are
        # module-level globals set in the __main__ block, not parameters --
        # this branch crashes if called before they are defined.
        train_w2v(vocab, vecter_bin_file_path, vecter_txt_file_path)
        # Bug fix: the original did `model = train_w2v(...)`, but train_w2v()
        # returned None, so most_similar() below crashed. Load the vectors
        # that train_w2v() just saved to disk instead.
        model = gensim.models.KeyedVectors.load_word2vec_format(
            vecter_bin_file_path, binary=True)
    print(model.most_similar(word, topn=8))
def train_w2v_use_file(file_path):
    """Train word2vec from a pre-tokenized corpus file.

    The file is expected to hold one sentence per line, with tokens
    separated by single spaces.

    :param file_path: path to the corpus text file
    :return: None
    """
    print("开始加载数据")
    # Bug fix: Text8Corpus ignores line boundaries and chops the whole file
    # into fixed-size word chunks (text8-style input); LineSentence is the
    # reader that matches the "one sentence per line" format described in
    # the notes at the top of this file.
    sentences = word2vec.LineSentence(file_path)
    print('数据加载完成, 开始训练模型')
    model = word2vec.Word2Vec(sentences, size=200, window=5, min_count=3, workers=3)
    # NOTE(review): these two output paths are module-level globals defined
    # in the __main__ block, not parameters of this function.
    model.wv.save_word2vec_format(vecter_bin_file_path, binary=True)   # binary format
    model.wv.save_word2vec_format(vecter_txt_file_path, binary=False)  # text format
if __name__ == '__main__':
    # Directory holding the original stopword files, and the main stopword file.
    four_stopword_path = r'D:\User\sdma\data\stopwords-master'
    stopword_path = r'D:\User\sdma\data\stopwords-master\cn_stopwords.txt'
    # Pickled entity dict; its keys seed the segmenter's user dictionary.
    entity_dict_path = './data/medical/entity_dict.pkl'
    sentence_file_path = './data/medical/sentence_file.txt'
    # NOTE: train_w2v_use_file() and w2v_test() read these two as globals.
    vecter_bin_file_path = './data/w2v/word2vec_talk.bin'
    vecter_txt_file_path = './data/w2v/word2vec_talk.txt'
    user_dict = './data/medical/user_dict.txt'
    stopword_export_path = './data/medical/stopword.txt'
    # Pre-tokenized corpus: one space-joined sentence per line.
    file_path = r'D:\User\sdma\data\Chinese-medical-dialogue-data-master\sentence_file.txt'
    # NOTE(review): pickle.load runs arbitrary code on untrusted data --
    # only load entity_dict.pkl from a trusted source.
    entity_dict = pickle.load(open(entity_dict_path, 'rb'))
    # # train_w2v(vocab, vecter_bin_file_path, vecter_txt_file_path)
    #
    # w2v_test(vecter_bin_file_path, '肚子疼', entity_dict, four_stopword_path, stopword_export_path, sentence_file_path)
    # Train on the first run; afterwards load the saved vectors and query them.
    if not os.path.exists(vecter_bin_file_path):
        train_w2v_use_file(file_path)
    else:
        # model = word2vec.Word2Vec.load_word2vec_format(vecter_bin_file_path, binary=True)
        model = KeyedVectors.load_word2vec_format(vecter_bin_file_path, binary=True)
        # model = KeyedVectors.load("./data/w2v/word2vec.bin")
        print(model.most_similar('睡觉流口水'))