Today I tried training word2vec on the Chinese Wikipedia dump. I consulted a lot of references and stepped on plenty of pitfalls along the way! Nothing here is original; it is pieced together from other people's write-ups.
1. Download the corpus
https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
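The dump is a single .bz2 file of a few gigabytes, so any downloader will do. A minimal sketch with the Python standard library (the local file name is my own choice):
import urllib.request
url = "https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2"
# The file is several GB, so a resumable downloader may be more practical in practice
urllib.request.urlretrieve(url, "zhwiki-latest-pages-articles.xml.bz2")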
2. Convert the xml.bz2 file to a .text file
https://blog.csdn.net/sinat_29957455/article/details/81432846
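The linked post extracts the article text with gensim's WikiCorpus, which strips the wiki markup while reading the .bz2 dump directly. A minimal sketch along those lines; the output file name wiki_zh.text is my own placeholder:
from gensim.corpora import WikiCorpus
# dictionary={} skips building a vocabulary, which we don't need here
wiki = WikiCorpus('zhwiki-latest-pages-articles.xml.bz2', dictionary={})
with open('wiki_zh.text', 'w', encoding='utf-8') as output:
    for i, text in enumerate(wiki.get_texts()):
        # get_texts() yields one article at a time as a list of tokens
        output.write(' '.join(text) + '\n')
        if (i + 1) % 10000 == 0:
            print('Saved %d articles' % (i + 1))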
3. Convert Traditional Chinese to Simplified
I strongly recommend the project below. I tried many approaches, and this is the one that actually solved the problem:
https://github.com/BYVoid/OpenCC
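Once OpenCC is installed, the conversion is a single command using the bundled t2s (Traditional-to-Simplified) config. A sketch invoking the command-line tool from Python; the file names are my own placeholders, chosen to match step 2's output and step 4's read_file_path:
import subprocess
# opencc -i <input> -o <output> -c t2s.json : Traditional -> Simplified
subprocess.run(
    ["opencc", "-i", "wiki_zh.text", "-o", "wiki_simple", "-c", "t2s.json"],
    check=True,
)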
4. Word segmentation and stopword removal
This part is straightforward.
Reference: https://blog.csdn.net/sinat_29957455/article/details/81432846
import logging
import re
import jieba

logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)

def get_stopwords():
    # Load the stopword list into a set
    stopword_set = set()
    with open("../stopwords.txt", 'r', encoding="utf-8") as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip("\n"))
    return stopword_set

def parse_zhwiki(read_file_path, save_file_path):
    """Parse the extracted text with a regular expression,
    segment it with jieba, and drop stopwords."""
    # Matches every line that does NOT start with a <doc ...> or </doc> marker
    regex_str = "^(?!</?doc)"
    file = open(read_file_path, "r", encoding="utf-8")
    output = open(save_file_path, "w+", encoding="utf-8")
    content_line = file.readline()
    # Load the stopword set
    stopwords = get_stopwords()
    # Accumulates the segmentation result of one article
    article_contents = ""
    while content_line:
        match_obj = re.match(regex_str, content_line)
        content_line = content_line.strip("\n")
        if len(content_line) > 0:
            if match_obj:
                # Segment the line with jieba
                words = jieba.cut(content_line, cut_all=False)
                for word in words:
                    if word not in stopwords:
                        article_contents += word + " "
            else:
                # Hit a <doc> boundary: flush the finished article
                if len(article_contents) > 0:
                    output.write(article_contents + "\n")
                    article_contents = ""
        content_line = file.readline()
    file.close()
    output.close()

read_file_path = 'wiki_simple'
save_file_path = 'seg_words'
parse_zhwiki(read_file_path, save_file_path)
5. Training
import logging
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence, Word2Vec

logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
# Load the segmented text from step 4, using the Text8Corpus class
sentences = word2vec.Text8Corpus(r'seg_words')
# Train the model; only some of the parameters are set explicitly here
# (gensim 3.x API: in gensim 4.x, `size` was renamed to `vector_size`)
model = word2vec.Word2Vec(sentences, size=300, hs=1, min_count=1, window=3)
model.save("./w2v.mod")
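One note on the corpus reader: Text8Corpus splits the whole file on whitespace into fixed-length chunks and ignores line breaks. Since step 4 writes one article per line, the already-imported LineSentence reader is arguably the more natural fit; assuming the same seg_words file, the swap is one line:
# Alternative reader that treats each line as one sentence/article
sentences = LineSentence('seg_words')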
6. Testing
model = "./w2v.mod" # word2vec模型
model = Word2Vec.load(model)
two_corpus = ["妈妈", "爸爸"]
res = model.similarity(two_corpus[0], two_corpus[1])
print("similarity:%.4f" % res)
print('--------------------分割线----------------------')
# 与某个词(李达康)最相近的3个字的词
print(u'与妈妈最相近的3个字的词')
req_count = 5 # 求出5个与李达康相近的3个字的词
res = model.similar_by_word('妈妈', topn=5)
print(res)
Words most similar to 妈妈:
[('爸爸', 0.7042794823646545), ('小孩', 0.601629376411438), ('母亲', 0.601071298122406), ('父母', 0.5831761360168457), ('朋友', 0.5770216584205627)]