# 利用gensim计算tf-idf (compute TF-IDF with gensim)

import json,jieba,os
from gensim import corpora
from gensim import models
def make_corpus(data_path,token):
    """Build space-joined, jieba-segmented documents from a JSON-lines file.

    Every non-blank line of ``data_path`` is parsed as a JSON object. Each
    element of each value of that object becomes one document: the leading
    ``token`` marker is removed, the remainder is segmented with
    ``jieba.lcut`` and the resulting tokens are joined with single spaces.

    Args:
        data_path: Path to a UTF-8 text file holding one JSON object per line.
        token: Prefix to remove from the start of each element. A falsy
            value (``''`` or ``None``) disables prefix removal.

    Returns:
        A list of strings, one per element, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    documents = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            raw_line = raw_line.strip()
            if not raw_line:
                # Blank lines carry no record; json.loads('') would raise.
                continue
            record = json.loads(raw_line)
            for values in record.values():
                for value in values:
                    # Remove the marker as a whole prefix. str.lstrip(token)
                    # would treat token as a *set of characters* and could
                    # also strip leading characters of the real content.
                    if token and value.startswith(token):
                        value = value[len(token):]
                    documents.append(' '.join(jieba.lcut(value)))
    return documents

def make_tf_idf_model(corpus,model_path):
    """Train a gensim TF-IDF model on ``corpus`` and persist it to disk.

    Two files are written into ``model_path`` (the directory is created if
    it does not exist):

    * ``word_list`` -- the tokenized corpus as JSON (UTF-8, non-ASCII kept).
    * ``my_model.tfidf`` -- the saved ``TfidfModel``.

    Args:
        corpus: Iterable of documents, each a string whose tokens are
            separated by single spaces.
        model_path: Directory that receives the two output files.
    """
    os.makedirs(model_path, exist_ok=True)

    # Split on a single space (not arbitrary whitespace) to mirror how the
    # documents were joined.
    word_list = [document.split(' ') for document in corpus]

    # Use a context manager so the token list is flushed and closed before
    # anything tries to read it back.
    with open(os.path.join(model_path, 'word_list'), 'w', encoding='utf-8') as fp:
        json.dump(word_list, fp, ensure_ascii=False)

    dictionary = corpora.Dictionary(word_list)
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in word_list]
    tfidf = models.TfidfModel(bow_corpus)
    tfidf.save(os.path.join(model_path, "my_model.tfidf"))

def load_tf_idf_model(model_path,sentence):
    """Score the tokens of ``sentence`` with the TF-IDF model saved in ``model_path``.

    The dictionary is rebuilt from the ``word_list`` JSON file in
    ``model_path`` and the model is loaded from ``my_model.tfidf`` in the
    same directory.

    Args:
        model_path: Directory containing ``word_list`` and ``my_model.tfidf``.
        sentence: Text whose tokens are separated by whitespace.

    Returns:
        A list of ``(token, weight)`` tuples, one per entry the model
        produces for the sentence's bag-of-words vector.
    """
    # Close the file deterministically instead of leaving the handle to the
    # garbage collector.
    with open(os.path.join(model_path, 'word_list'), 'r', encoding='utf-8') as fp:
        word_list = json.load(fp)

    dictionary = corpora.Dictionary(word_list)
    # Invert token -> id so model output (ids) can be mapped back to tokens.
    id_to_token = {token_id: token for token, token_id in dictionary.token2id.items()}

    tfidf = models.TfidfModel.load(os.path.join(model_path, 'my_model.tfidf'))
    sentence_bow = dictionary.doc2bow(sentence.split())

    return [(id_to_token.get(token_id), weight) for token_id, weight in tfidf[sentence_bow]]

if __name__ == '__main__':
    # Step 1: build the segmented corpus from the raw JSON-lines data.
    data_file = '/home/enisya/wyy/提取关键词/data/工作经历_exercitation_content'
    content_prefix = 'EXERCITATION-CONTENT-'
    corpus = make_corpus(data_file, content_prefix)

    # Step 2: train the TF-IDF model and write it to disk.
    make_tf_idf_model(corpus, model_path='./model')

    # Step 3: load the saved model and score a pre-segmented sample sentence.
    sample_sentence = '我 熟练 使用 python django'
    ti = load_tf_idf_model('model', sample_sentence)
    print(ti)
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值