TF-IDF(3)

#!/usr/bin/python

import os
import sys 
import gzip
# Assume testresult1 contains the two articles you want to compare, and
# testresult2 holds the IDF results.
file_input_fd = 'd:/share/testresult1'
idf_dict_fd = 'd:/share/testresult2'

def get_file_handler(f):
    """Open the file at path ``f`` for reading as UTF-8 text and return it.

    The returned file object is left open; closing it is up to the caller.
    """
    return open(f, mode='r', encoding='UTF-8')

# Load the IDF table: each usable line is "word<TAB>idf".
idf_map = {}
# Open through a context manager so the file is closed as soon as the table
# is loaded; iterating the bare return value would leave the handle open.
with get_file_handler(idf_dict_fd) as idf_file:
    for line in idf_file:
        ss = line.strip().split('\t')
        if len(ss) != 2:
            # Skip blank lines and lines without exactly two fields.
            continue
        word = ss[0].strip()
        idf = ss[1].strip()
        idf_map[word] = float(idf)

# Score every document: each usable line is "docid<TAB>space-separated words".
# The context manager guarantees the input file is closed when the loop ends,
# including when an exception interrupts it.
with get_file_handler(file_input_fd) as input_file:
    for line in input_file:
        ss = line.strip().split('\t')
        if len(ss) != 2:
            # Skip blank lines and lines without exactly two fields.
            continue

        docid = ss[0].strip()
        context = ss[1].strip()

        # Term frequency: raw occurrence count of each token in the document.
        tf_map = {}
        for t_word in context.split(' '):
            tf_map[t_word] = tf_map.get(t_word, 0) + 1

        # TF-IDF score = tf * idf; words absent from the IDF table are dropped.
        tfidf_map = {
            w: tf * idf_map[w] for w, tf in tf_map.items() if w in idf_map
        }

        # Keep the 5 highest-scoring words, best first.
        final_list = sorted(
            tfidf_map.items(), key=lambda x: x[1], reverse=True
        )[:5]

        # Render each pair as "word:score".
        word_score_list = [
            ':'.join([w, str(score)]) for w, score in final_list
        ]

        print(docid + '\t' + ','.join(word_score_list))

You will see that the result doesn't compare the same words across the two articles, so as the next step I'll create a word dictionary (with different categories), use the same words to compute TF-IDF for each article, and then compare the similarity of the two articles.

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

SeasonRun

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值