用 Python 计算 TF-IDF

本例来自《Mining the Social Web》一书。

from math import log

# Terms of the query to score against the documents in `corpus`.
# Terms are compared case-insensitively against whitespace-separated
# tokens, so punctuation attached to a word is part of the token
# (hence 'mr.' with the trailing period).

QUERY_TERMS = ['mr.', 'green']

def tf(term, doc, normalize=True):
    """Return the term frequency of ``term`` in ``doc``.

    The document is lower-cased and split on whitespace; ``term`` is
    lower-cased and compared against the resulting tokens, so punctuation
    attached to a word is part of the token.

    Args:
        term: The term to count.
        doc: The document text as a single string.
        normalize: When true (the default), divide the raw count by the
            number of tokens in the document.

    Returns:
        A float: the raw count when ``normalize`` is false, otherwise the
        count divided by the document length. A document with no tokens
        yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    tokens = doc.lower().split()
    occurrences = float(tokens.count(term.lower()))

    if not normalize:
        return occurrences

    # An empty (or whitespace-only) document cannot contain the term.
    if not tokens:
        return 0.0

    return occurrences / len(tokens)

def idf(term, corpus):
    """Return the inverse document frequency of ``term`` over ``corpus``.

    Args:
        term: The term to look up; compared case-insensitively.
        corpus: A sized collection of document strings. Each document is
            lower-cased and split on whitespace before the membership test.

    Returns:
        ``1.0 + log(N / n)`` where ``N`` is the number of documents and
        ``n`` is the number of documents containing the term. Returns 1.0
        when no document contains the term (including an empty corpus).
    """
    needle = term.lower()
    num_texts_with_term = sum(
        1 for text in corpus if needle in text.lower().split()
    )

    # A normalized tf value is a fraction below 1, and multiplying two
    # values below 1 gives a result smaller than either of them. Adding 1.0
    # keeps the idf factor at 1 or above so tf-idf scores stay consistent.
    if num_texts_with_term == 0:
        return 1.0

    return 1.0 + log(float(len(corpus)) / num_texts_with_term)

def tf_idf(term, doc, corpus):
    """Return the tf-idf score of ``term`` for ``doc`` within ``corpus``.

    Args:
        term: The term to score.
        doc: The document text being scored.
        corpus: The collection of document strings used for the idf factor.

    Returns:
        The product of ``tf(term, doc)`` (normalized term frequency) and
        ``idf(term, corpus)``.
    """
    return tf(term, doc) * idf(term, corpus)

# Toy corpus: document id -> document text.
corpus = {
    'a': ('Mr. Green killed Colonel Mustard in the study with the '
          'candlestick. Mr. Green is not a very nice fellow.'),
    'b': 'Professor Plum has a green plant in his study.',
    'c': ("Miss Scarlett watered Professor Plum's green plant while he was "
          'away from his office last week.'),
}

# Each print call below takes one pre-formatted string, so the output is
# the same whether `print` is the Python 2 statement or the Python 3
# function.
for (k, v) in sorted(corpus.items()):
    print('%s : %s' % (k, v))
print('')

# The document texts do not change while scoring; collect them once.
documents = list(corpus.values())

# Score queries by calculating cumulative tf_idf score for each term in query
query_scores = {doc: 0 for doc in corpus}

for term in [t.lower() for t in QUERY_TERMS]:
    # Show the individual tf and idf factors for this term.
    for doc in sorted(corpus):
        print('TF(%s): %s %s' % (doc, term, tf(term, corpus[doc])))
    print('IDF: %s %s' % (term, idf(term, documents)))
    print('')

    # Accumulate this term's tf-idf contribution into each document's score.
    for doc in sorted(corpus):
        score = tf_idf(term, corpus[doc], documents)
        print('TF-IDF(%s): %s %s' % (doc, term, score))
        query_scores[doc] += score
    print('')

print("Overall TF-IDF scores for query '%s'" % (' '.join(QUERY_TERMS), ))
for (doc, score) in sorted(query_scores.items()):
    print('%s %s' % (doc, score))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值