Today I tried training word2vec on the Chinese Wikipedia dump. I consulted a lot of references and stepped on plenty of pitfalls along the way! Nothing here is original; it is pieced together from other people's write-ups.
1. Download the corpus
https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2
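The dump is a single .bz2 file of a few gigabytes, so any downloader will do. A minimal sketch with the Python standard library (the local file name is my own choice):
import urllib.request
url = "https://dumps.wikimedia.org/zhwiki/latest/zhwiki-latest-pages-articles.xml.bz2"
# The file is several GB, so a resumable downloader may be more practical in practice
urllib.request.urlretrieve(url, "zhwiki-latest-pages-articles.xml.bz2")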
2. Convert the xml.bz2 file to a .text file
https://blog.csdn.net/sinat_29957455/article/details/81432846
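The linked post extracts the article text with gensim's WikiCorpus, which strips the wiki markup while reading the .bz2 dump directly. A minimal sketch along those lines; the output file name wiki_zh.text is my own placeholder:
from gensim.corpora import WikiCorpus
# dictionary={} skips building a vocabulary, which we don't need here
wiki = WikiCorpus('zhwiki-latest-pages-articles.xml.bz2', dictionary={})
with open('wiki_zh.text', 'w', encoding='utf-8') as output:
    for i, text in enumerate(wiki.get_texts()):
        # get_texts() yields one article at a time as a list of tokens
        output.write(' '.join(text) + '\n')
        if (i + 1) % 10000 == 0:
            print('Saved %d articles' % (i + 1))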
3. Convert Traditional Chinese to Simplified
I strongly recommend the project below. I tried many approaches, and this is the one that actually solved the problem:
https://github.com/BYVoid/OpenCC
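Once OpenCC is installed, the conversion is a single command using the bundled t2s (Traditional-to-Simplified) config. A sketch invoking the command-line tool from Python; the file names are my own placeholders, chosen to match step 2's output and step 4's read_file_path:
import subprocess
# opencc -i <input> -o <output> -c t2s.json : Traditional -> Simplified
subprocess.run(
    ["opencc", "-i", "wiki_zh.text", "-o", "wiki_simple", "-c", "t2s.json"],
    check=True,
)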
4. Word segmentation and stopword removal
This part is straightforward.
Reference: https://blog.csdn.net/sinat_29957455/article/details/81432846
import logging
import re
import jieba

logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)

def get_stopwords():
    # Load the stopword list into a set
    stopword_set = set()
    with open("../stopwords.txt", 'r', encoding="utf-8") as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip("\n"))
    return stopword_set

def parse_zhwiki(read_file_path, save_file_path):
    """Parse the extracted text with a regular expression,
    segment it with jieba, and drop stopwords."""
    # Matches every line that does NOT start with a <doc ...> or </doc> marker
    regex_str = "^(?!</?doc)"
    file = open(read_file_path, "r", encoding="utf-8")
    output = open(save_file_path, "w+", encoding="utf-8")
    content_line = file.readline()
    # Load the stopword set
    stopwords = get_stopwords()
    # Accumulates the segmentation result of one article
    article_contents = ""
    while content_line:
        match_obj = re.match(regex_str, content_line)
        content_line = content_line.strip("\n")
        if len(content_line) > 0:
            if match_obj:
                # Segment the line with jieba
                words = jieba.cut(content_line, cut_all=False)
                for word in words:
                    if word not in stopwords:
                        article_contents += word + " "
            else:
                # Hit a <doc> boundary: flush the finished article
                if len(article_contents) > 0:
                    output.write(article_contents + "\n")
                    article_contents = ""
        content_line = file.readline()
    file.close()
    output.close()

read_file_path = 'wiki_simple'
save_file_path = 'seg_words'
parse_zhwiki(read_file_path, save_file_path)
5. Training
import logging
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence, Word2Vec

logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
# Load the segmented text from step 4, using the Text8Corpus class
sentences = word2vec.Text8Corpus(r'seg_words')
# Train the model; only some of the parameters are set explicitly here
# (gensim 3.x API: in gensim 4.x, `size` was renamed to `vector_size`)
model = word2vec.Word2Vec(sentences, size=300, hs=1, min_count=1, window=3)
model.save("./w2v.mod")
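One note on the corpus reader: Text8Corpus splits the whole file on whitespace into fixed-length chunks and ignores line breaks. Since step 4 writes one article per line, the already-imported LineSentence reader is arguably the more natural fit; assuming the same seg_words file, the swap is one line:
# Alternative reader that treats each line as one sentence/article
sentences = LineSentence('seg_words')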
6. Testing
model = "./w2v.mod" # word2vec模型
model = Word2Vec.load(model)
two_corpus = ["妈妈", "爸爸"]
res = model.similarity(two_corpus[0], two_corpus[1])
print("similarity:%.4f" % res)
print('--------------------分割线----------------------')
# 与某个词(李达康)最相近的3个字的词
print(u'与妈妈最相近的3个字的词')
req_count = 5 # 求出5个与李达康相近的3个字的词
res = model.similar_by_word('妈妈', topn=5)
print(res)
Words most similar to 妈妈:
[('爸爸', 0.7042794823646545), ('小孩', 0.601629376411438), ('母亲', 0.601071298122406), ('父母', 0.5831761360168457), ('朋友', 0.5770216584205627)]