# TF-IDF text similarity computation (tf-idf 文本相似度计算)
import pickle
import pandas as pd
import jieba
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora,similarities,models
import time
import os
# Load the dataset: a pickled pandas DataFrame of job postings whose
# 'jieba_split' column holds the jieba-tokenised text of each document.
_data_path = os.path.join(os.path.dirname(__file__), '../files/job_info.pk')
with open(_data_path, 'rb') as f:
    job_info = pickle.load(f)
# Quick sanity check during development:
# print(job_info.head(3))
# ---- Build the TF-IDF model over the tokenised corpus ----
start_time = time.time()
# Each entry of 'jieba_split' is expected to be a list of tokens for one
# document (jieba output) — TODO confirm against the preprocessing step.
texts = list(job_info['jieba_split'])
# Dictionary maps every distinct token to an integer id.
dictionary = corpora.Dictionary(texts)
# Bag-of-words (term-frequency) vector for every document.
corpus = [dictionary.doc2bow(text) for text in texts]
# SMART scheme 'ntc': natural term frequency, standard idf, cosine norm.
tf_idf_model = TfidfModel(corpus, smartirs='ntc')
# Materialise the TF-IDF vectors; the model applies weights lazily.
# (The original 'word_tf_tdf' alias created here was a typo'd dead name,
# never read anywhere in the file — removed.)
corpus_tfidf = list(tf_idf_model[corpus])
end_time = time.time()
duration = end_time - start_time
print('用时:', duration, 's')
# ---- Build the sparse cosine-similarity index over the TF-IDF corpus ----
# Vector dimensionality equals the vocabulary size of the dictionary.
featurenum = len(dictionary.token2id)
t0 = time.time()
index = similarities.SparseMatrixSimilarity(corpus_tfidf, num_features=featurenum)
elapsed = time.time() - t0
print('用时:', elapsed, 's')
# ---- Example: query the index with document #1 from the corpus ----
test = job_info['jieba_split'][1]
# Turn the query tokens into a bag-of-words vector, then TF-IDF weight it.
test_vec = tf_idf_model[dictionary.doc2bow(test)]
# Cosine similarity of the query against every indexed document.
sim = index.get_similarities(test_vec)
print(sim.argsort())
# Slice [:-6:-1] walks the ascending argsort backwards: the indices of the
# five highest-scoring documents, most similar first.
related_doc_indices = sim.argsort()[:-6:-1]
print(related_doc_indices)
# ---- Persist the trained artefacts ----
models_dir = os.path.join(os.path.dirname(__file__), '../models')
# makedirs(exist_ok=True) replaces the racy exists()/mkdir() pair: it is
# atomic w.r.t. concurrent creation and also creates missing parent dirs.
os.makedirs(models_dir, exist_ok=True)
# Token <-> id mapping.
dictionary.save(os.path.join(models_dir, 'train_dictionary.dict'))
# Fitted TF-IDF weighting model.
tf_idf_model.save(os.path.join(models_dir, 'tf_idf_model.model'))
# Raw bag-of-words corpus, serialized in Matrix Market format.
corpora.MmCorpus.serialize(os.path.join(models_dir, 'corpus.mm'), corpus)
# Sparse similarity index for fast query-time lookups.
index.save(os.path.join(models_dir, 'train_index.index'))
# job_info.pk 见百度网盘 (the job_info.pk data file is available on Baidu Netdisk):
# 链接 (link): https://pan.baidu.com/s/1cWqBF6SuqtGXaDCXDG6JxQ
# 提取码 (extraction code): bkjl
# 参考文章 (reference article):
# "Python+gensim-文本相似度分析" (Python+gensim text similarity analysis)