# Simple tf-idf implementation, annotated for self-study; based on http://www.jb51.net/article/64695.htm
# coding:utf-8
# 改变自:http://www.jb51.net/article/64695.htm 并进行了注解
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
'''
min_df:的含义
min_df is used for removing terms that appear too infrequently. For example:
•min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
•min_df = 5 means "ignore terms that appear in less than 5 documents".
The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.
max_df:的含义
max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:
•max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
•max_df = 25 means "ignore terms that appear in more than 25 documents".
The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.
'''
vectorizer = CountVectorizer(min_df=1)
# NOTE(review): `vectorizer` is never used again in the visible code — it only
# illustrates sklearn's defaults; the tf-idf below is computed by hand instead.
# For reference, repr of the default configuration:
# CountVectorizer(analyzer='word', binary=False, charset=None,
# charset_error=None, decode_error='strict',
# dtype=< 'numpy.int64'>, encoding='utf-8', input='content',
# lowercase=True, max_df=1.0, max_features=None, min_df=1,
# ngram_range=(1, 1), preprocessor=None, stop_words=None,
# strip_accents=None, token_pattern=...'(?u)\\b\\w\\w+\\b',
# tokenizer=None, vocabulary=None)
# Corpus: a list of documents, each already tokenized (tokens separated by spaces).
mydoclist = [u'温馨 提示 : 家庭 畅享 套餐 介绍 、 主卡 添加 / 取消 副 卡 短信 办理 方式 , 可 点击 文档 左上方 短信 图标 即可 将 短信 指令 发送给 客户',
             u'客户 申请 i 我家 , 家庭 畅享 计划 后 , 可 选择 设置 1 - 6 个 同一 归属 地 的 中国移动 网 内 号码 作为 亲情 号码 , 组建 一个 家庭 亲情 网 家庭 内 ',
             u'所有 成员 可 享受 本地 互打 免费 优惠 , 家庭 主卡 号码 还 可 享受 省内 / 国内 漫游 接听 免费 的 优惠']
# For each document, count how often every token occurs (raw term frequency).
# Counter(iterable) replaces the manual tf[word] += 1 loop, and print(...) is
# valid in both Python 2 and Python 3 (the original `print x` is Py2-only).
for doc in mydoclist:
    tf = Counter(doc.split())
    print(tf.items())
import string #allows for format()
# Derive the corpus vocabulary (the set of all distinct tokens).
def build_lexicon(corpus):
    """Return the set of unique whitespace-delimited tokens across *corpus*."""
    return {token for document in corpus for token in document.split()}
def tf(term, document):
    """Raw term frequency of *term* within *document* (delegates to freq)."""
    return freq(term, document)

def freq(term, document):
    """Number of whitespace-delimited tokens in *document* equal to *term*."""
    return sum(1 for token in document.split() if token == term)
vocabulary = build_lexicon(mydoclist)
doc_term_matrix = []
print('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']')
# Build one raw-count (tf) vector per document, ordered by the vocabulary.
# enumerate(..., 1) replaces mydoclist.index(doc)+1, which was an O(n) scan per
# iteration and would mis-number duplicate documents; prints are parenthesized
# for Python 2/3 compatibility, and the generator variable is renamed so it no
# longer reads as if it shadowed the freq() function.
for doc_no, doc in enumerate(mydoclist, 1):
    print('The doc is "' + doc + '"')
    tf_vector = [tf(word, doc) for word in vocabulary]
    tf_vector_string = ', '.join(format(count, 'd') for count in tf_vector)
    print('The tf vector for Document %d is [%s]' % (doc_no, tf_vector_string))
    doc_term_matrix.append(tf_vector)
print('All combined, here is our master document term matrix: ')
print(doc_term_matrix)
# 由于前面 只是计算出了 文章的空间向量,没有标准化;接下来标准化向量,使其L2范数为1
import math
import numpy as np
def l2_normalizer(vec):
    """Scale *vec* so its L2 (Euclidean) norm is 1; returns a new list."""
    norm = math.sqrt(np.sum([component ** 2 for component in vec]))
    return [component / norm for component in vec]
# Row-normalize the raw count matrix so each document vector has unit L2 norm.
# A comprehension replaces the explicit append loop; prints are parenthesized
# so they work under both Python 2 and Python 3.
doc_term_matrix_l2 = [l2_normalizer(vec) for vec in doc_term_matrix]
print('A regular old document term matrix: ')
print(np.matrix(doc_term_matrix))
print('\nA document term matrix with row-wise L2 norms of 1:')
print(np.matrix(doc_term_matrix_l2))
def numDocsContaining(word, doclist):
    """Document frequency: how many documents in *doclist* contain *word*."""
    # Membership in the token list is equivalent to freq(word, doc) > 0.
    return sum(1 for doc in doclist if word in doc.split())

# idf value of a term
def idf(word, doclist):
    """Inverse document frequency of *word*: log(N / (1 + df)).

    BUG FIX: the original returned np.log(n_samples / 1+df), which operator
    precedence parses as (n_samples / 1) + df — i.e. log(N + df), an idf that
    *grows* with document frequency. The denominator must be parenthesized,
    and made float so Python 2 integer division cannot truncate it.
    """
    n_samples = len(doclist)
    df = numDocsContaining(word, doclist)
    return np.log(n_samples / (1.0 + df))
# idf value for every vocabulary term, in vocabulary order.
my_idf_vector = [idf(word, mydoclist) for word in vocabulary]
# Prints parenthesized for Python 2/3 compatibility; the generator variable is
# renamed from `freq` so it no longer reads as shadowing the freq() function.
print('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']')
print('The inverse document frequency vector is [' + ', '.join(format(value, 'f') for value in my_idf_vector) + ']')
import numpy as np
def build_idf_matrix(idf_vector):
    """Embed *idf_vector* on the diagonal of an otherwise-zero square matrix."""
    # float cast keeps the float64 dtype the zeros/fill_diagonal version produced
    return np.diag(np.asarray(idf_vector, dtype=float))
my_idf_matrix = build_idf_matrix(my_idf_vector)
# tf-idf: multiply each document's tf row vector by the diagonal idf matrix,
# scaling every term count by that term's idf. Comprehensions replace the
# append loops; prints are parenthesized for Python 2/3 compatibility.
doc_term_matrix_tfidf = [np.dot(tf_vector, my_idf_matrix) for tf_vector in doc_term_matrix]
# Normalize each tf-idf row to unit L2 norm.
doc_term_matrix_tfidf_l2 = [l2_normalizer(tf_vector) for tf_vector in doc_term_matrix_tfidf]
print(vocabulary)
print(np.matrix(doc_term_matrix_tfidf_l2))  # np.matrix() just to make it easier to look at