使用gensim包的Word2Vec构造中文词向量,并且计算词的相似度。
注:本文使用的语料是随便拷贝的几段中文语句,一段语料一行。
1、处理中文语料,使用结巴分词,构造列表
import jieba
import codecs

# Input corpus: one passage per line. Output: the same corpus with tokens
# separated by single spaces, ready for gensim's LineSentence.
INPUT_RAW = r"F:\BaiduNetdiskDownload\kkb NLP\语料.txt"
OUTPUT_FENCI = r"F:\BaiduNetdiskDownload\kkb NLP\语料-分词.txt"

# NOTE(review): the corpus is assumed to be GBK-encoded — confirm; UTF-8 is
# the more common encoding for Chinese text files today.
with codecs.open(INPUT_RAW, "r", encoding="gbk") as fin, \
        codecs.open(OUTPUT_FENCI, "w", encoding="gbk") as fout:
    for line in fin:
        # jieba.cut yields tokens lazily; join them with spaces and write.
        # `with` guarantees both files are closed even if jieba raises.
        fout.write(" ".join(jieba.cut(line)))
2、生成词向量,保存到本地
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

OUTPUT_VEC = r"F:\BaiduNetdiskDownload\kkb NLP\word2vec.bin"

# Train a skip-gram model (sg=1) on the pre-segmented corpus.
# min_count=1 keeps every word (the toy corpus is tiny).
# NOTE(review): `iter` and `size` are gensim 3.x parameter names; in
# gensim >= 4.0 they were renamed to `epochs` and `vector_size`.
with codecs.open(OUTPUT_FENCI, "r", encoding="gbk") as fin:
    w2v = Word2Vec(sg=1, sentences=LineSentence(fin), min_count=1,
                   iter=2, size=50, window=5)

# Persist only the word vectors (KeyedVectors) in binary word2vec format;
# this is enough for similarity queries and is smaller than the full model.
w2v.wv.save_word2vec_format(OUTPUT_VEC, binary=True)
3、加载词向量模型,测试是否可以正常加载并计算相似度
from gensim.models import KeyedVectors

# Reload the saved vectors from disk and sanity-check them with a
# similarity query between two words from the corpus.
model = KeyedVectors.load_word2vec_format(OUTPUT_VEC, binary=True)
print(model.similarity('都', '避免'))
4、使用装饰器,打印计算所需的时间
import time


def printTimeDecorator(f):
    """Decorator that prints the wall-clock run time of *f* after each call.

    Returns a wrapper that forwards all arguments to *f*, prints
    "<name>运行时间:<seconds>s", and returns *f*'s result unchanged.
    """
    from functools import wraps

    @wraps(f)  # bug fix: preserve f's __name__/__doc__ on the wrapper
    def decorator(*args, **kwargs):
        startTime = time.time()
        res = f(*args, **kwargs)
        endTime = time.time()
        print(f.__name__ + "运行时间:" + str((endTime-startTime)) + "s")
        return res
    return decorator
def run():
    """Timed lookup of the words most similar to '避免' in the trained model."""
    model.most_similar('避免')


# Manual decoration — equivalent to the @printTimeDecorator syntax.
run = printTimeDecorator(run)
run()
5、解决原生模型计算相似度速度慢的问题。使用annoy!
import json
import collections

WORD_INDEX = r"F:\BaiduNetdiskDownload\kkb NLP\wordindex.json"

# Annoy addresses items by integer id, so persist a word -> id mapping,
# enumerating the vocabulary in its iteration order.
# NOTE(review): `model.vocab` is the gensim 3.x attribute; in gensim >= 4.0
# use `model.key_to_index` instead.
wordindex = collections.OrderedDict(
    (key, index) for index, key in enumerate(model.vocab.keys())
)
with codecs.open(WORD_INDEX, 'w', encoding='utf-8') as fout:
    # ensure_ascii=False keeps the Chinese words human-readable in the file.
    json.dump(wordindex, fout, indent=4, ensure_ascii=False)
from annoy import AnnoyIndex

INDEX_SAVE = r"F:\BaiduNetdiskDownload\kkb NLP\annoy.index"

# Index the 50-dimensional word vectors using angular (cosine-like) distance.
annoyIndex = AnnoyIndex(50, 'angular')
# enumerate replaces the original hand-maintained `index` counter; the item
# ids follow the enumeration order of model.vocab, matching wordindex.json.
for index, key in enumerate(model.vocab.keys()):
    annoyIndex.add_item(index, model[key])
# 4 trees: more trees give better recall at the cost of memory/build time,
# so raise this when memory allows.
annoyIndex.build(4)
annoyIndex.save(INDEX_SAVE)
# Load the word -> index mapping and invert it, so that the integer ids
# returned by Annoy queries can be translated back into words.
with codecs.open(WORD_INDEX, 'r', encoding='utf-8') as fin:
    wordIndex = json.load(fin)
indexWord = {index: word for word, index in wordIndex.items()}
# Reload the Annoy index from disk; queries do not need the builder object.
annoyIndex2 = AnnoyIndex(50, 'angular')
annoyIndex2.load(INDEX_SAVE)


@printTimeDecorator
def runAnnoy():
    """Timed Annoy lookup: print the 10 nearest neighbours of '避免'."""
    neighbour_ids = annoyIndex2.get_nns_by_item(wordIndex['避免'], 10)
    for neighbour_id in neighbour_ids:
        print(indexWord[neighbour_id])


runAnnoy()