实战:网页文本向量化
1、词向量的训练
1.1 中文语料预处理
将 xml 转换为 txt,繁体转换为简体,并利用结巴(jieba)进行分词
# -*- coding: utf-8 -*-
from gensim.corpora import WikiCorpus
import jieba
from langconv import *
def my_function():
    """Convert the zhwiki XML dump into a plain-text, space-segmented corpus.

    Each article is converted from traditional to simplified Chinese,
    segmented with jieba, and written as one space-joined line to
    ./data/reduce_zhiwiki.txt. Progress is printed every 200 articles.
    """
    space = ' '
    zhwiki_name = './data/zhwiki-latest-pages-articles.xml.bz2'
    # dictionary={} skips building a gensim Dictionary we do not need here.
    wiki = WikiCorpus(zhwiki_name, lemmatize=False, dictionary={})
    article_count = 0
    # 'with' guarantees the output file is closed even if iteration raises
    # (the original relied on reaching f.close() at the end).
    with open('./data/reduce_zhiwiki.txt', 'w') as f:
        for text in wiki.get_texts():
            words = []
            for temp_sentence in text:
                # Traditional -> simplified Chinese before segmentation.
                temp_sentence = Converter('zh-hans').convert(temp_sentence)
                words.extend(jieba.cut(temp_sentence))
            f.write(space.join(words) + '\n')
            article_count += 1
            if article_count % 200 == 0:
                print('Saved ' + str(article_count) + ' articles')


if __name__ == '__main__':
    my_function()
1.2 利用gensim模块训练词向量
# -*- coding: utf-8 -*-
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import logging
# Show gensim's training progress on stdout.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


def my_function():
    """Train a CBOW word2vec model on the pre-segmented wiki corpus."""
    # 'with' closes the corpus file even if training raises (the original
    # opened it and never closed it).
    with open('./data/reduce_zhiwiki.txt', 'r') as wiki_news:
        # sg=0 -> CBOW; 192-dim vectors; ignore words seen fewer than 5 times.
        model = Word2Vec(LineSentence(wiki_news), sg=0, size=192, window=5,
                         min_count=5, workers=9)
    model.save('zhiwiki_news.word2vec')


if __name__ == '__main__':
    my_function()
2、段落向量的训练
与训练词向量不同的是,无需再对文档进行分词,直接将简体文本保留。doc2vec 在训练时能够采用 Tag 信息更好地辅助训练(表明是同一类 doc),因此输入文档多了一个 tag 属性。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import gensim.models as g
from gensim.corpora import WikiCorpus
import logging
from langconv import *
#enable logging
# Show gensim's training progress on stdout.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Dimensionality of the paragraph (document) vectors to train.
docvec_size=192
class TaggedWikiDocument(object):
    """Iterable of tagged wiki articles for Doc2Vec training.

    Each article is converted to simplified Chinese, segmented with jieba,
    and tagged with its page title so Doc2Vec can associate all words of
    an article with the same document tag.
    """

    def __init__(self, wiki):
        self.wiki = wiki
        # metadata=True makes get_texts() also yield (page_id, title).
        self.wiki.metadata = True

    def __iter__(self):
        import jieba
        for content, (page_id, title) in self.wiki.get_texts():
            words = [w for c in content
                     for w in jieba.cut(Converter('zh-hans').convert(c))]
            # TaggedDocument replaces the deprecated LabeledSentence alias
            # (same (words, tags) interface; LabeledSentence was removed in
            # gensim 4.x).
            yield g.doc2vec.TaggedDocument(words=words, tags=[title])
def my_function():
    """Train a DBOW doc2vec model over tagged wiki articles."""
    dump_path = './data/zhwiki-latest-pages-articles.xml.bz2'
    corpus = WikiCorpus(dump_path, lemmatize=False, dictionary={})
    documents = TaggedWikiDocument(corpus)
    # dm=0 -> DBOW; dbow_words=1 trains word vectors alongside doc vectors.
    model = g.Doc2Vec(documents, dm=0, dbow_words=1, size=docvec_size,
                      window=8, min_count=19, iter=5, workers=8)
    model.save('data/zhiwiki_news.doc2vec')


if __name__ == '__main__':
    my_function()
3、计算网页相似度
3.1 word2vec计算网页相似度
基本方法:抽取文本中的关键词(结巴工具包里面的tfidf关键字提取),将关键词向量化,然后将得到的各个词向量相加,最后得到一个词向量总和代表文本的向量化表示,利用总的向量计算文本相似度。
# -*- coding: utf-8 -*-
import jieba.posseg as pseg
from jieba import analyse
def keyword_extract(data, file_name):
    """Return the TF-IDF keywords of *data* via jieba.

    NOTE(review): *file_name* is never used; it is kept only so existing
    callers keep working.
    """
    return analyse.extract_tags(data)
def getKeywords(docpath, savepath):
    """Extract keywords for each line of *docpath*, one document per line,
    and write them space-separated to *savepath*."""
    with open(docpath, 'r') as docf, open(savepath, 'w') as outf:
        for data in docf:
            # rstrip('\n') only removes a trailing newline; the original
            # data[:len(data)-1] also dropped the last real character of a
            # final line that had no newline.
            data = data.rstrip('\n')
            keywords = keyword_extract(data, savepath)
            for word in keywords:
                outf.write(word + ' ')
            outf.write('\n')
def word2vec(file_name, model):
    """Sum the word vectors of every in-vocabulary word in *file_name*.

    Returns a numpy vector of length ``wordvec_size`` — the unnormalized
    sum of the keyword vectors, used as the document representation.
    """
    with codecs.open(file_name, 'r') as f:
        word_vec_all = numpy.zeros(wordvec_size)
        for data in f:
            # str.split() yields clean tokens. The original get_char_pos
            # slicing (data[space_pos[i]:space_pos[i+1]]) left a leading
            # space glued to every word after the first, so those model
            # lookups silently failed and the vectors were never added.
            for word in data.split():
                if word in model:
                    word_vec_all = word_vec_all + model[word]
    return word_vec_all
def simlarityCalu(vector1, vector2):
    """Cosine similarity of two vectors; 0 when either has zero magnitude."""
    norm1 = np.sqrt(vector1.dot(vector1))
    norm2 = np.sqrt(vector2.dot(vector2))
    if norm1 == 0 or norm2 == 0:
        return 0
    return vector1.dot(vector2) / (norm1 * norm2)
if __name__ == '__main__':
    # Load the trained word2vec model, extract keywords for both pages,
    # embed them, and print the cosine similarity of the two documents.
    model = gensim.models.Word2Vec.load('data/zhiwiki_news.word2vec')
    page1, page2 = './data/P1.txt', './data/P2.txt'
    keywords1, keywords2 = './data/P1_keywords.txt', './data/P2_keywords.txt'
    getKeywords(page1, keywords1)
    getKeywords(page2, keywords2)
    vec1 = word2vec(keywords1, model)
    vec2 = word2vec(keywords2, model)
    print(simlarityCalu(vec1, vec2))
3.2 doc2vec计算网页相似度
三步走:预处理 -> 文档向量化 -> 计算文本相似度
import gensim.models as g
import codecs
import numpy
import numpy as np
# Path of the trained doc2vec model saved by the training script.
model_path = './data/zhiwiki_news.doc2vec'
# Initial learning rate for infer_vector().
start_alpha = 0.01
# Number of inference iterations for infer_vector().
infer_epoch = 1000
# Dimensionality of the document vectors (must match the trained model).
docvec_size = 192
def simlarityCalu(vector1, vector2):
    """Return the cosine similarity of *vector1* and *vector2*.

    Falls back to 0 when either vector has zero magnitude.
    """
    mod1 = np.sqrt(vector1.dot(vector1))
    mod2 = np.sqrt(vector2.dot(vector2))
    if mod1 != 0 and mod2 != 0:
        return vector1.dot(vector2) / (mod1 * mod2)
    return 0
def doc2vec(file_name, model):
    """Segment the document in *file_name* with jieba and infer its doc2vec vector."""
    import jieba
    tokens = [token
              for line in codecs.open(file_name, 'r', 'utf-8').readlines()
              for token in jieba.cut(line.strip())]
    return model.infer_vector(tokens, alpha=start_alpha, steps=infer_epoch)
if __name__ == '__main__':
    # Load the trained doc2vec model, embed both pages, and print their
    # cosine similarity.
    model = g.Doc2Vec.load(model_path)
    page1, page2 = './data/P1.txt', './data/P2.txt'
    vec1 = doc2vec(page1, model)
    vec2 = doc2vec(page2, model)
    print(simlarityCalu(vec1, vec2))