文章目录
使用Gensim库构造中文维基百科数据词向量word2vec模型
1. 数据获取
2.xml格式转txt格式
process.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import logging
import os.path
import sys
from gensim.corpora import WikiCorpus
if __name__ == '__main__':
    # Convert a Wikipedia XML dump (.xml.bz2) into a UTF-8 text file with one
    # article per line.
    #
    # Run from the directory that contains this script:
    #   python process.py <dump>.xml.bz2 <output>.text
    #   python process.py zhwiki-latest-pages-articles.xml.bz2 wiki.zh.text
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Check and process input arguments.  This script has no module
    # docstring, so formatting __doc__ would fail; print an explicit usage
    # line instead.
    if len(sys.argv) < 3:
        print("Usage: python %s <input.xml.bz2> <output.text>" % program)
        sys.exit(1)
    inp, outp = sys.argv[1:3]

    i = 0
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    # The context manager closes the output file even if parsing fails midway.
    with open(outp, 'w', encoding='utf-8') as output:
        for i, text in enumerate(wiki.get_texts(), 1):
            # Join tokens with a single space so token boundaries are kept.
            output.write(" ".join(text) + "\n")
            if i % 10000 == 0:
                logger.info("Saved %d articles", i)
    logger.info("Finished Saved %d articles", i)
在cmd中当前目录执行命令:
python process.py 文件名.xml.bz2 输出名.text
3. 查看数据,转换为简体数据格式(工具)
- 下载地址:https://bintray.com/package/files/byvoid/opencc/OpenCC
- 使用:opencc.exe -i wiki.zh.text -o wiki.zh.jian.text -c t2s.json(输入文件为上一步 process.py 生成的文本文件)
4. 文档按行分词保存成文件
TestJieba.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import jieba
import codecs
# Segment the simplified-Chinese corpus with jieba, line by line, and write
# the space-separated tokens to the target file.
file = "wiki.zh.jian.text"
# NOTE(review): the training command elsewhere uses a "zh.jian.wiki.seg..."
# name -- confirm whether "zhi" in this file name is intended.
target = "./data/zhi.jian.wiki.seg-1.3.txt"

# Both files are closed automatically, even if segmentation raises.
with codecs.open(file, 'r', encoding="utf8") as f, \
        codecs.open(target, 'w', encoding="utf8") as seg_out:
    print('open files')
    for line_num, line in enumerate(f, 1):
        # Report progress every 10000 lines; one message per line floods the
        # console on a corpus of this size.
        if line_num % 10000 == 0:
            print('----processing', line_num, 'article--------------')
        # No newline is appended: the line's own trailing newline is expected
        # to pass through jieba.cut as the last token.
        seg_out.write(" ".join(jieba.cut(line)))
5. 模型构建
word2vec_model.py
import logging
import os.path
import sys
import multiprocessing
from gensim.corpora import WikiCorpus
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
if __name__ == '__main__':
    # Train a word2vec model on a pre-segmented corpus (one sentence per
    # line, tokens separated by whitespace) and save it in two formats.
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join(sys.argv))

    # Check and process input arguments.  This script has no module
    # docstring, so formatting __doc__ would fail; print an explicit usage
    # line instead.
    if len(sys.argv) < 4:
        print("Usage: python %s <segmented_corpus.txt> <model_out> <vector_out>"
              % program)
        sys.exit(1)
    inp, outp1, outp2 = sys.argv[1:4]

    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
    # `vector_size`.
    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    # outp1: full model; outp2: plain-text vectors in word2vec format.
    model.save(outp1)
    # The vectors live on `model.wv`; Word2Vec has no `model` attribute.
    model.wv.save_word2vec_format(outp2, binary=False)
# python word2vec_model.py zh.jian.wiki.seg.txt wiki.zh.text.model wiki.zh.text.vector
# opencc -i wiki_texts.txt -o test.txt -c t2s.json
6. 测试文本相似度
testModel.py
from gensim.models import Word2Vec
# Load the trained model and print the nearest neighbours of each probe word.
en_wiki_word2vec_model = Word2Vec.load('wiki.zh.text.model')
testwords = ['苹果','数学','学术','白痴','篮球']
for word in testwords:
    # Query through `.wv`: calling most_similar on the model itself is
    # deprecated and scheduled for removal in gensim 4.0.0.
    res = en_wiki_word2vec_model.wv.most_similar(word)
    print(word)
    print(res)
使用 Jupyter 运行代码
1. 数据获取
2.xml格式转txt格式
import logging
import os
import sys
from gensim.corpora import WikiCorpus
def process(inp, outp):
    """Convert a Wikipedia XML dump into a text file, one article per line.

    Args:
        inp: Path of the compressed dump (``*.xml.bz2``) to read.
        outp: Path of the UTF-8 text file to write.

    Returns:
        None. If ``inp`` does not exist, this is logged and nothing is
        written.
    """
    logger = logging.getLogger("process")
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", ' '.join([inp, outp]))

    if not os.path.exists(inp):
        logger.info("文件不存在: %s", inp)
        return

    i = 0
    wiki = WikiCorpus(inp, lemmatize=False, dictionary={})
    # The context manager closes the output file even if parsing fails midway.
    with open(outp, 'w', encoding='utf-8') as output:
        for i, text in enumerate(wiki.get_texts(), 1):
            # Join tokens with a single space so token boundaries are kept.
            output.write(" ".join(text) + "\n")
            if i % 10000 == 0:
                logger.info("Saved %d articles", i)
    logger.info("Finished Saved %d articles", i)
# Convert the XML dump to plain text.
inp_path = "./data/zhwiki-latest-pages-articles.xml.bz2"
outp_path = "./data/wiki.zh.text"
process(inp_path, outp_path)

# Preview the first 100 characters of the converted file.  The built-in
# open() is used because `codecs` has not been imported at this point, and
# the context manager closes the handle afterwards.
with open(outp_path, 'r', encoding="utf8") as f:
    line = f.readline()[:100]
print(line)
歐幾里得 西元前三世紀的希臘數學家 現在被認為是幾何之父 此畫為拉斐爾的作品 雅典學院 数学 是利用符号语言研究數量 结构 变化以及空间等概念的一門学科 从某种角度看屬於形式科學的一種 數學透過抽象化
3. 转换为简体数据格式
- 打开cmd,切换至opencc-1.0.1-win64目录
- 执行: opencc.exe -c t2s.json -i ../data/wiki.zh.text -o ../data/wiki.zh.jian.text
# Preview the first 100 characters of the simplified-Chinese file.  The
# built-in open() is used because `codecs` has not been imported at this
# point, and the context manager closes the handle afterwards.
with open('./data/wiki.zh.jian.text', 'r', encoding="utf8") as f:
    line = f.readline()[:100]
print(line)
欧几里得 西元前三世纪的希腊数学家 现在被认为是几何之父 此画为拉斐尔的作品 雅典学院 数学 是利用符号语言研究数量 结构 变化以及空间等概念的一门学科 从某种角度看属于形式科学的一种 数学透过抽象化
4. 文档按行分词保存成文件
import jieba
import codecs
def test_jieba(file, target):
    """Segment a UTF-8 text file with jieba and write the tokens out.

    Each input line is cut with ``jieba.cut`` and its tokens are written to
    ``target`` separated by single spaces.

    Args:
        file: Path of the UTF-8 text file to read.
        target: Path of the UTF-8 file to write; it is overwritten.
    """
    # Both files are closed automatically, even if segmentation raises.
    with codecs.open(file, 'r', encoding="utf8") as src, \
            codecs.open(target, 'w', encoding="utf8") as dst:
        print('open files')
        for line_num, line in enumerate(src, 1):
            # Report progress every 10000 lines; one message per line floods
            # the output on a corpus of this size.
            if line_num % 10000 == 0:
                print('----processing', line_num, 'article--------------')
            # No newline is appended: the line's own trailing newline is
            # expected to pass through jieba.cut as the last token.
            dst.write(" ".join(jieba.cut(line)))
file_path = "./data/wiki.zh.jian.text"
# Use the "zh.jian..." name that the preview below and the training step read;
# writing to "zhi.jian..." left them looking for a file that did not exist.
target_path = "./data/zh.jian.wiki.seg-1.3g.txt"
test_jieba(file_path, target_path)

# Preview the first 100 characters of the segmented file.
with open(target_path, 'r', encoding="utf8") as f:
    line = f.readline()[:100]
print(line)
欧几里得 西元前 三 世纪 的 希腊 数学家 现在 被 认为 是 几何 之 父 此画 为 拉斐尔 的 作品 雅典 学院 数学 是 利用 符号语言 研究 数量 结构 变化
5. 模型构建
# -*- coding: utf-8 -*-
import logging
import multiprocessing
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
def word2vec_model(inp, outp1, outp2):
    """Train a word2vec model on a segmented corpus and save it.

    Args:
        inp: Path of the corpus file, read through ``LineSentence``.
        outp1: Path the full model is saved to via ``model.save``.
        outp2: Path the vectors are written to in text word2vec format.
    """
    logger = logging.getLogger("word2vec_model")
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s", 'word2vec_model')

    # NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
    # `vector_size`.
    model = Word2Vec(LineSentence(inp), size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    # The vectors live on `model.wv`; Word2Vec has no `model` attribute.
    model.wv.save_word2vec_format(outp2, binary=False)
import os

inp_path = "./data/zh.jian.wiki.seg-1.3g.txt"
outp1_path = "./model/wiki.zh.text.model"
outp2_path = "./model/wiki.zh.text.vector"

# Create the output directory up front so a missing "./model" folder does not
# make the save fail after training has finished.
for out_path in (outp1_path, outp2_path):
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

word2vec_model(inp_path, outp1_path, outp2_path)
6. 加载模型
from gensim.models import Word2Vec
# Same path the training step passes as outp1_path to word2vec_model().
model_path = './model/wiki.zh.text.model'
# NOTE(review): despite the "en_" prefix this is loaded from the wiki.zh model
# path; the name is kept because the similarity test below refers to it.
en_wiki_word2vec_model = Word2Vec.load(model_path)
7. 测试文本相似度
# Print the nearest neighbours of each probe word.
testwords = ['苹果', '数学', '学术', '白痴', '篮球']
for word in testwords:
    # Query through `.wv`: calling most_similar on the model itself is
    # deprecated and scheduled for removal in gensim 4.0.0.
    res = en_wiki_word2vec_model.wv.most_similar(word)
    print(word)
    print(res)
d:\progra~2\python\virtua~1\py37_x64\lib\site-packages\ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `most_similar` (Method will be removed in 4.0.0, use self.wv.most_similar() instead).
This is separate from the ipykernel package so we can avoid doing imports until
苹果
[('apple', 0.5233932733535767), ('洋葱', 0.48527854681015015), ('冰淇淋', 0.4787825345993042), ('咬一口', 0.47303444147109985), ('西打', 0.46522432565689087), ('苹果公司', 0.46442875266075134), ('果冻', 0.45820751786231995), ('核战', 0.45506715774536133), ('士多啤梨', 0.4496016502380371), ('饼干', 0.44688838720321655)]
数学
[('算术', 0.7038295269012451), ('微积分', 0.6944979429244995), ('数学分析', 0.6483767032623291), ('逻辑学', 0.6402850151062012), ('概率论', 0.6359573602676392), ('高等数学', 0.6353551149368286), ('数论', 0.6197945475578308), ('拓扑学', 0.6099287271499634), ('统计学', 0.6068518161773682), ('几何学', 0.6068260669708252)]
学术
[('学术研究', 0.7260937690734863), ('汉学', 0.5959312915802002), ('学术界', 0.5891205072402954), ('学术思想', 0.5861917734146118), ('教研', 0.5694284439086914), ('史学', 0.5686976909637451), ('科研', 0.5647758841514587), ('学术交流', 0.5621337294578552), ('历史学', 0.5590660572052002), ('科学研究', 0.5573357343673706)]
白痴
[('书呆子', 0.6184146404266357), ('疯子', 0.6043859720230103), ('笨蛋', 0.5833420753479004), ('小聪明', 0.5805025696754456), ('爱哭鬼', 0.5668667554855347), ('骗子', 0.5608910322189331), ('傻子', 0.5529211163520813), ('天才', 0.5403788089752197), ('傻瓜', 0.5321439504623413), ('变态', 0.5319797992706299)]
篮球
[('美式足球', 0.6267460584640503), ('男子篮球', 0.5999912619590759), ('冰球', 0.5870977640151978), ('棒球', 0.5742351412773132), ('橄榄球', 0.5719608664512634), ('篮球队', 0.5555766820907593), ('排球', 0.5517430901527405), ('篮球运动', 0.5373520851135254), ('足球', 0.5356222987174988), ('曲棍球', 0.5111767053604126)]