# TF-IDF算法解析与Python实现

TF-IDF（term frequency–inverse document frequency）是一种用于信息检索（information retrieval）与文本挖掘（text mining）的常用加权技术。比较容易理解的一个应用场景是当我们手头有一些文章时，我们希望计算机能够自动地进行关键词提取。而TF-IDF就是可以帮我们完成这项任务的一种统计方法。它能够用于评估一个词语对于一个文集或一个语料库中的其中一份文档的重要程度。

## 必要的预处理过程


import nltk
import math
import string
from nltk.corpus import stopwords
from collections import Counter
from nltk.stem.porter import *
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample document 1: plot summary of the 2000 made-for-TV horror film "Python".
# (Backslash line continuations keep this a single string literal.)
text1 = "Python is a 2000 made-for-TV horror movie directed by Richard \
Clabaugh. The film features several cult favorite actors, including William \
Zabka of The Karate Kid fame, Wil Wheaton, Casper Van Dien, Jenny McCarthy, \
Keith Coogan, Robert Englund (best known for his role as Freddy Krueger in the \
A Nightmare on Elm Street series of films), Dana Barron, David Bowe, and Sean \
Whalen. The film concerns a genetically engineered snake, a python, that \
escapes and unleashes itself on a small town. It includes the classic final\
girl scenario evident in films like Friday the 13th. It was filmed in Los Angeles, \
California and Malibu, California. Python was followed by two sequels: Python \
II (2002) and Boa vs. Python (2004), both also made-for-TV films."

# Sample document 2: the snake genus Python.
text2 = "Python, from the Greek word (πύθων/πύθωνας), is a genus of \
nonvenomous pythons[2] found in Africa and Asia. Currently, 7 species are \
recognised.[2] A member of this genus, P. reticulatus, is among the longest \
snakes known."

# Sample document 3: the Colt Python revolver.
text3 = "The Colt Python is a .357 Magnum caliber revolver formerly \
manufactured by Colt's Manufacturing Company of Hartford, Connecticut. \
It is sometimes referred to as a \"Combat Magnum\".[1] It was first introduced \
in 1955, the same year as Smith &amp; Wesson's M29 .44 Magnum. The now discontinued \
Colt Python targeted the premium revolver market segment. Some firearm \
collectors and writers such as Jeff Cooper, Ian V. Hogg, Chuck Hawks, Leroy \
Thompson, Renee Smeets and Martin Dougherty have described the Python as the \
finest production revolver ever made."

TF-IDF的基本思想是：词语的重要性与它在文件中出现的次数成正比，但同时会随着它在语料库中出现的频率成反比下降。 但无论如何，统计每个单词在文档中出现的次数是必要的操作。所以说，TF-IDF也是一种基于 bag-of-word 的方法。

def get_tokens(text):
    """Lowercase *text*, strip all punctuation, and split it into word tokens.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    list[str]
        Tokens produced by ``nltk.word_tokenize`` after punctuation removal.
    """
    lowers = text.lower()
    # Remove punctuation in one C-level pass: mapping each punctuation
    # code point to None in str.translate deletes that character.
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    no_punctuation = lowers.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(no_punctuation)
    return tokens

# Naive bag-of-words count of text1: without stop-word removal the top of the
# list is dominated by function words ("the", "a", "and", ...) — see the
# printed output below.
tokens = get_tokens(text1)
count = Counter(tokens)
print (count.most_common(10))

[('the', 6), ('python', 5), ('a', 5), ('and', 4), ('films', 3), ('in', 3),
('madefortv', 2), ('on', 2), ('by', 2), ('was', 2)]

def stem_tokens(tokens, stemmer):
    """Apply *stemmer* to every token, preserving order.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to normalize.
    stemmer : object
        Any object with a ``stem(str) -> str`` method (e.g. ``PorterStemmer``).

    Returns
    -------
    list[str]
        The stemmed tokens.
    """
    # Comprehension instead of a manual append loop: same result, clearer intent.
    return [stemmer.stem(item) for item in tokens]

# Count text1 again after dropping English stop words.
# Build the stop-word set ONCE: stopwords.words('english') returns a fresh
# list on every call, and the original comprehension re-evaluated it for
# every token (O(n*m)); a set also makes each membership test O(1).
stop_words = set(stopwords.words('english'))
tokens = get_tokens(text1)
filtered = [w for w in tokens if w not in stop_words]
count = Counter(filtered)
print (count.most_common(10))

[('python', 5), ('films', 3), ('film', 2), ('california', 2), ('madefortv', 2),
('genetically', 1), ('horror', 1), ('krueger', 1), ('filmed', 1), ('sean', 1)]

# Full preprocessing pipeline for text1: tokenize -> stop-word filter -> stem.
# Hoist the stop-word set out of the comprehension (fresh list per call
# otherwise, and set membership is O(1) vs O(m) for a list).
stop_words = set(stopwords.words('english'))
tokens = get_tokens(text1)
filtered = [w for w in tokens if w not in stop_words]
stemmer = PorterStemmer()
stemmed = stem_tokens(filtered, stemmer)

count = Counter(stemmed)
print(count)

Counter({'film': 6, 'python': 5, 'madefortv': 2, 'california': 2, 'includ': 2, '2004': 1,
'role': 1, 'casper': 1, 'robert': 1, 'sequel': 1, 'two': 1, 'krueger': 1,
'ii': 1, 'sean': 1, 'lo': 1, 'clabaugh': 1, 'finalgirl': 1, 'wheaton': 1,
'concern': 1, 'whalen': 1, 'cult': 1, 'boa': 1, 'mccarthi': 1, 'englund': 1,
'best': 1, 'direct': 1, 'known': 1, 'favorit': 1, 'movi': 1, 'keith': 1,
'karat': 1, 'small': 1, 'classic': 1, 'coogan': 1, 'like': 1, 'elm': 1,
'fame': 1, 'malibu': 1, 'sever': 1, 'richard': 1, 'scenario': 1, 'town': 1,
'friday': 1, 'david': 1, 'unleash': 1, 'vs': 1, '2000': 1, 'angel': 1, 'nightmar': 1,
'zabka': 1, '13th': 1, 'jenni': 1, 'seri': 1, 'horror': 1, 'william': 1,
'street': 1, 'wil': 1, 'escap': 1, 'van': 1, 'snake': 1, 'evid': 1, 'freddi': 1,
'bow': 1, 'dien': 1, 'follow': 1, 'engin': 1, 'also': 1})

## TF-IDF的算法原理

$$\mathrm{tf}_{i,j} = \frac{n_{i,j}}{\sum_k n_{k,j}}$$

TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document)

$$\mathrm{idf}_i = \log \frac{|D|}{|\{\, j : t_i \in d_j \,\}|}$$

IDF(t) = log_e( Total number of documents / Number of documents with term t in it )

def tf(word, count):
    """Term frequency: the share of *word* among all tokens in one document.

    Parameters
    ----------
    word : str
    count : collections.Counter
        Token -> occurrence count for a single document.

    Returns
    -------
    float
        count[word] / total tokens; 0.0 for an empty document (the original
        raised ZeroDivisionError on an empty Counter).
    """
    total = sum(count.values())
    return count[word] / total if total else 0.0


def n_containing(word, count_list):
    """Number of documents in *count_list* whose Counter contains *word*."""
    return sum(1 for count in count_list if word in count)


def idf(word, count_list):
    """Inverse document frequency of *word* over the corpus *count_list*.

    The +1 in the denominator is add-one smoothing: a word present in every
    document (or absent everywhere) never divides by zero, at the cost of
    slightly negative IDF for ubiquitous words.
    """
    return math.log(len(count_list) / (1 + n_containing(word, count_list)))


def tfidf(word, count, count_list):
    """TF-IDF score of *word* for the document *count* within *count_list*."""
    return tf(word, count) * idf(word, count_list)

# NOTE(review): count1/count2/count3 are not defined anywhere in this excerpt —
# presumably they are the stemmed token Counters built from text1..text3 via
# get_tokens -> stop-word filtering -> stem_tokens; confirm against the full article.
countlist = [count1, count2, count3]
for i, count in enumerate(countlist):
    print("Top words in document {}".format(i + 1))
    # Score every distinct word of this document against the whole corpus,
    # then keep the three highest-scoring words.
    scores = {word: tfidf(word, count, countlist) for word in count}
    sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    for word, score in sorted_words[:3]:
        print("\tWord: {}, TF-IDF: {}".format(word, round(score, 5)))

Top words in document 1
Word: film, TF-IDF: 0.02829
Word: california, TF-IDF: 0.00943
Top words in document 2
Word: genu, TF-IDF: 0.03686
Word: 7, TF-IDF: 0.01843
Word: among, TF-IDF: 0.01843
Top words in document 3
Word: revolv, TF-IDF: 0.02097
Word: colt, TF-IDF: 0.02097
Word: manufactur, TF-IDF: 0.01398

## 利用Scikit-Learn实现的TF-IDF

>>> corpus = ['This is the first document.',
'This is the second second document.',
'And the third one.',
'Is this the first document?',]
>>> vectorizer = TfidfVectorizer(min_df=1)
>>> vectorizer.fit_transform(corpus)
<4x9 sparse matrix of type '<class 'numpy.float64'>'
with 19 stored elements in Compressed Sparse Row format>
>>> vectorizer.get_feature_names()
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
>>> vectorizer.fit_transform(corpus).toarray()
array([[ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,
0.        ,  0.35872874,  0.        ,  0.43877674],
[ 0.        ,  0.27230147,  0.        ,  0.27230147,  0.        ,
0.85322574,  0.22262429,  0.        ,  0.27230147],
[ 0.55280532,  0.        ,  0.        ,  0.        ,  0.55280532,
0.        ,  0.28847675,  0.55280532,  0.        ],
[ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,
0.        ,  0.35872874,  0.        ,  0.43877674]])

• 广告
• 抄袭
• 版权
• 政治
• 色情
• 无意义
• 其他

120