I thought this would be so easy, but it ended up taking me a whole day. Writing it down so I can look back at it later, and hopefully it helps you too!
1. Prepare and preprocess the data
First you need a fairly large Chinese corpus; I used the Chinese Wikipedia dump.
The Chinese Wikipedia data is not that big; the compressed XML file is roughly 1 GB. First, process this compressed XML file with process_wiki_data.py by running:
python process_wiki_data.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
Here is the code for process_wiki_data.py:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# process_wiki_data.py: parse the XML dump and convert the wiki data to plain text
import logging
import os.path
import sys
from gensim.corpora import WikiCorpus

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 3:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp = sys.argv[1:3]
    space = " "
    i = 0
    output = open(outp, 'w', encoding='utf-8')
    # note: the lemmatize argument was removed in gensim 4.0; drop it if you are on a newer version
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    for text in wiki.get_texts():
        # one article per line, tokens separated by spaces
        output.write(space.join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logger.info("Saved " + str(i) + " articles")
    output.close()
    logger.info("Finished. Saved " + str(i) + " articles")
2. Use opencc to convert the traditional Chinese characters in the file above to simplified Chinese
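I did this step with the opencc command-line tool. A minimal sketch, assuming the step-1 output wiki.zh.text as input and a made-up output file name; note that the conversion config is called t2s.json in opencc 1.x, while older 0.4.x builds use zht2zhs.ini:
opencc -i wiki.zh.text -o wiki.zh.text.simplified -c t2s.json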
3. Extract the article text with a regular expression and segment it with jieba
Run the code below. A few notes first:
- The simplified-Chinese file produced by opencc goes under my local ./data/ directory
- Remember to change the file paths in the code to match your own setup
- The stop word list stopwords.txt sits in the same directory as the script
# -*- coding: UTF-8 -*-
# @IDE    : PyCharm
# @Author : dingjingjing
# @Date   : 2020/8/10
# @Desc   : train a word2vec model on the Chinese Wikipedia corpus
import logging, jieba, os, re

def get_stopwords():
    logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
    # load the stop word list (stopwords.txt, kept in the same directory as this script)
    stopword_set = set()
    with open("stopwords.txt", 'r', encoding="utf-8") as stopwords:
        for stopword in stopwords:
            stopword_set.add(stopword.strip("\n"))
    return stopword_set

def parse_zhwiki(read_file_path, save_file_path):
    # intended to match content lines rather than the <doc> / </doc> tag lines
    regex_str = "[^<doc.*>$]|[^</doc>$]"
    file = open(read_file_path, "r", encoding="utf-8")
    # output file for the segmented articles
    output = open(save_file_path, "w+", encoding="utf-8")
    content_line = file.readline()
    # load the stop word set
    stopwords = get_stopwords()
    # string holding the segmentation result of one article
    article_contents = ""
    while content_line:
        match_obj = re.match(regex_str, content_line)
        content_line = content_line.strip("\n")
        if len(content_line) > 0:
            if match_obj:
                # segment the line with jieba
                words = jieba.cut(content_line, cut_all=False)
                for word in words:
                    if word not in stopwords:
                        article_contents += word + " "
            else:
                if len(article_contents) > 0:
                    output.write(article_contents + "\n")
                    article_contents = ""
        content_line = file.readline()
    output.close()

def generate_corpus():
    # segment the Wikipedia corpus article by article
    zhwiki_path = "D:/addjj/DataProcess/chuanbingan/data/result.txt"  # path to the zhwiki text
    save_path = "D:/addjj/DataProcess/chuanbingan/data/wiki_corpus"   # path for the segmented output
    parse_zhwiki(zhwiki_path, save_path)

def merge_corpus():
    # merge the segmented files (only needed if the corpus was processed in several chunks)
    output = open("D:/addjj/DataProcess/chuanbingan/data/wiki_corpus", "w", encoding="utf-8")
    input_dir = "D:/addjj/DataProcess/chuanbingan/data/"
    for i in range(3):
        file_path = os.path.join(input_dir, "wiki_corpus0%s" % str(i))
        file = open(file_path, "r", encoding="utf-8")
        line = file.readline()
        while line:
            output.writelines(line)
            line = file.readline()
        file.close()
    output.close()

if __name__ == "__main__":
    generate_corpus()
    # merge_corpus()
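To get a feel for what parse_zhwiki writes out, here is the same jieba segmentation plus stop-word filtering applied to a single sentence (the sample sentence is just an example, and stopwords.txt is the same list used above):
# demo of the segmentation + stop-word filtering used in parse_zhwiki
import jieba

with open("stopwords.txt", encoding="utf-8") as f:
    stopword_set = {line.strip("\n") for line in f}

sentence = "数学是利用符号语言研究数量、结构、变化以及空间等概念的一门学科"
words = [w for w in jieba.cut(sentence, cut_all=False) if w not in stopword_set]
print(" ".join(words))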
4. Train word2vec on the segmented file
Here wiki.zh.text.seg is the segmented corpus produced in step 3 (point the command at your own output path). Run:
python train_word2vec_model.py wiki.zh.text.seg wiki.zh.text.model wiki.zh.text.vector
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# train_word2vec_model.py: train the word2vec model
import logging
import os.path
import sys
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

if __name__ == '__main__':
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))
    # check and process input arguments
    if len(sys.argv) < 4:
        print(globals()['__doc__'] % locals())
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]
    # note: in gensim >= 4.0 the `size` parameter is named `vector_size`
    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(outp1)
    # save the raw word vectors in the plain-text word2vec format
    model.wv.save_word2vec_format(outp2, binary=False)
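Once training is done you can load the model back and query it. A minimal sketch (the query words here are just examples and must exist in the vocabulary; recent gensim versions expose the vectors through model.wv):
# load the trained model and run a couple of quick queries
from gensim.models import Word2Vec

model = Word2Vec.load("wiki.zh.text.model")
# ten words most similar to "北京"
print(model.wv.most_similar("北京", topn=10))
# cosine similarity between two words
print(model.wv.similarity("男人", "女人"))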