import json,jieba,os
from gensim import corpora
from gensim import models
def make_corpus(data_path, token):
    """Read a JSON-lines file and build a whitespace-tokenised corpus.

    Each line of *data_path* is a JSON object mapping keys to lists of
    strings.  Every string has leading characters from *token* stripped,
    is segmented with jieba, and its tokens are joined by single spaces.

    NOTE(review): ``str.lstrip`` removes any leading characters that
    appear in the *token* set, not the literal prefix string — if the
    intent was prefix removal, ``str.removeprefix`` would be the fix;
    kept as-is to preserve behaviour.

    :param data_path: path to the JSON-lines input file
    :param token: characters to strip from the start of each value
    :return: list of space-separated token strings, one per value
    """
    document = []
    # Iterate the file lazily instead of a manual readline() loop; the
    # explicit encoding matches the rest of this module and keeps the
    # behaviour stable across platforms.
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would make json.loads('') raise; skip it.
                continue
            record = json.loads(line)
            # Empty value lists simply contribute nothing, so no
            # explicit `!= []` guard is needed.
            for values in record.values():
                for value in values:
                    document.append(' '.join(jieba.lcut(value.lstrip(token))))
    return document
def make_tf_idf_model(corpus, model_path):
    """Train a gensim TF-IDF model from a tokenised corpus and persist it.

    Writes two artefacts into *model_path* (created if missing):
    ``word_list`` — the JSON-serialised token lists, needed later by
    :func:`load_tf_idf_model` to rebuild the same Dictionary — and
    ``my_model.tfidf`` — the saved gensim model.

    :param corpus: list of space-separated token strings
        (the output of :func:`make_corpus`)
    :param model_path: directory the model artefacts are written into
    """
    # exist_ok avoids the racy exists()/makedirs() pair.
    os.makedirs(model_path, exist_ok=True)
    # Re-split each document back into its token list.
    word_list = [doc.split(' ') for doc in corpus]
    # Use a context manager so the file handle is actually closed
    # (the original json.dump(fp=open(...)) leaked it).
    with open(os.path.join(model_path, 'word_list'), 'w', encoding='utf-8') as fp:
        json.dump(word_list, fp, ensure_ascii=False)
    dictionary = corpora.Dictionary(word_list)
    bow_corpus = [dictionary.doc2bow(text) for text in word_list]
    tfidf = models.TfidfModel(bow_corpus)
    tfidf.save(os.path.join(model_path, "my_model.tfidf"))
def load_tf_idf_model(model_path, sentence):
    """Score a pre-tokenised sentence with the saved TF-IDF model.

    :param model_path: directory previously written by
        :func:`make_tf_idf_model`
    :param sentence: whitespace-separated tokens,
        e.g. ``'我 熟练 使用 python'``
    :return: list of ``(token, weight)`` pairs; tokens absent from the
        training dictionary are silently dropped by ``doc2bow``
    """
    # Context manager closes the handle (original open() leaked it).
    with open(os.path.join(model_path, 'word_list'), 'r', encoding='utf-8') as fp:
        word_list = json.load(fp)
    dictionary = corpora.Dictionary(word_list)
    # Invert token2id so TF-IDF term ids map back to their tokens.
    id2token = {idx: tok for tok, idx in dictionary.token2id.items()}
    tfidf = models.TfidfModel.load(os.path.join(model_path, 'my_model.tfidf'))
    bow = dictionary.doc2bow(sentence.split())
    return [(id2token.get(term_id), weight) for term_id, weight in tfidf[bow]]
if __name__ == '__main__':
    # Build the tokenised corpus from the raw exercitation records.
    documents = make_corpus(
        '/home/enisya/wyy/提取关键词/data/工作经历_exercitation_content',
        'EXERCITATION-CONTENT-',
    )
    # Train the TF-IDF model and persist it to disk.
    make_tf_idf_model(documents, model_path='./model')
    # Reload the saved model and score a sample query sentence.
    scores = load_tf_idf_model('model', '我 熟练 使用 python django')
    print(scores)
# 利用gensim计算tf-idf  (blog-post title: "Computing TF-IDF with gensim" — web-scrape residue)
# 最新推荐文章于 2022-12-26 13:51:33 发布  (scrape artifact: "latest recommended article published 2022-12-26 13:51:33")