tf-idf gensim 文本相似度

tf-idf文本相似度计算

import pickle
import pandas as pd
import jieba
from gensim.models.tfidfmodel import TfidfModel
from gensim import corpora,similarities,models
import time
import os

# Read the dataset from ../files/job_info.pk, resolved relative to this script.
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load a job_info.pk that comes from a trusted source.
data_path = os.path.join(os.path.dirname(__file__), '../files/job_info.pk')
with open(data_path, 'rb') as f:
    job_info = pickle.load(f)

# Uncomment to inspect the first rows:
# print(job_info.head(3))


# Build the TF-IDF model and time the whole step.
start_time = time.time()

# One entry per row of the 'jieba_split' column
# (presumably pre-tokenized documents -- confirm against the pickle).
texts = list(job_info['jieba_split'])

# Build the token dictionary from the document collection.
dictionary = corpora.Dictionary(texts)

# Bag-of-words (term frequency) vector for every document.
corpus = list(map(dictionary.doc2bow, texts))

tf_idf_model = TfidfModel(corpus, smartirs='ntc')

# Materialise the TF-IDF vectors; both names refer to the same list object.
corpus_tfidf = list(tf_idf_model[corpus])
word_tf_tdf = corpus_tfidf

end_time = time.time()
duration = end_time - start_time
print('用时:', duration, 's')


# Similarity index over the TF-IDF corpus.
# The feature count is the number of entries in the dictionary's token2id map.
featurenum = len(dictionary.token2id)

start = time.time()
index = similarities.SparseMatrixSimilarity(
    corpus_tfidf,
    num_features=featurenum,
)
end = time.time()

duration = end - start
print('用时:', duration, 's')


# Similarity query example: use the document at position 1 as the query.
# It is taken from `texts` (positional) rather than by row label, so the
# query is exactly the document the index stores at position 1, whatever
# row labels job_info carries.
test = texts[1]

# Bag-of-words vector of the query.
vec = dictionary.doc2bow(test)

# Re-weight the query with the trained TF-IDF model.
test_vec = tf_idf_model[vec]

# Query through index[...]; gensim documents get_similarities() as not
# meant to be called directly.
sim = index[test_vec]
# print(sim.size)
# print(sim[0:10])

# Document positions ordered by ascending similarity; computed once, reused.
ranking = sim.argsort()
print(ranking)

# Last five entries, reversed: the five highest-scoring documents, best first.
# The query itself is part of the indexed corpus, so it is included.
related_doc_indices = ranking[:-6:-1]
print(related_doc_indices)

# Persist the artefacts under ../models, resolved relative to this script.
models_dir = os.path.join(os.path.dirname(__file__), '../models')

# Create the directory if needed. Attempting the creation and handling
# FileExistsError avoids the check-then-create race of exists() + mkdir().
try:
    os.makedirs(models_dir)
except FileExistsError:
    print('exist')

# Save the token dictionary.
dictionary.save(os.path.join(models_dir, 'train_dictionary.dict'))
# Save the trained TF-IDF model.
tf_idf_model.save(os.path.join(models_dir, 'tf_idf_model.model'))
# Save the bag-of-words (term frequency) corpus in Matrix Market format.
corpora.MmCorpus.serialize(os.path.join(models_dir, 'corpus.mm'), corpus)
# Save the similarity index.
index.save(os.path.join(models_dir, 'train_index.index'))

数据文件 job_info.pk 可从百度网盘下载:
链接:https://pan.baidu.com/s/1cWqBF6SuqtGXaDCXDG6JxQ
提取码:bkjl

参考文章:
Python+gensim-文本相似度分析

评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值