# Simple tf-idf implementation, annotated for self-study; based on http://www.jb51.net/article/64695.htm
# coding:utf-8
# 改变自:http://www.jb51.net/article/64695.htm 并进行了注解
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
'''
min_df:的含义
min_df is used for removing terms that appear too infrequently. For example:
•min_df = 0.01 means "ignore terms that appear in less than 1% of the documents".
•min_df = 5 means "ignore terms that appear in less than 5 documents".
The default min_df is 1, which means "ignore terms that appear in less than 1 document". Thus, the default setting does not ignore any terms.
max_df:的含义
max_df is used for removing terms that appear too frequently, also known as "corpus-specific stop words". For example:
•max_df = 0.50 means "ignore terms that appear in more than 50% of the documents".
•max_df = 25 means "ignore terms that appear in more than 25 documents".
The default max_df is 1.0, which means "ignore terms that appear in more than 100% of the documents". Thus, the default setting does not ignore any terms.
'''
vectorizer = CountVectorizer(min_df=1)
# NOTE(review): `vectorizer` is never used again in the visible code — it only
# illustrates sklearn's defaults; the tf-idf below is computed by hand instead.
# For reference, repr of the default configuration:
# CountVectorizer(analyzer='word', binary=False, charset=None,
# charset_error=None, decode_error='strict',
# dtype=< 'numpy.int64'>, encoding='utf-8', input='content',
# lowercase=True, max_df=1.0, max_features=None, min_df=1,
# ngram_range=(1, 1), preprocessor=None, stop_words=None,
# strip_accents=None, token_pattern=...'(?u)\\b\\w\\w+\\b',
# tokenizer=None, vocabulary=None)
# Corpus: a list of documents, each already tokenized (tokens separated by spaces).
mydoclist = [u'温馨 提示 : 家庭 畅享 套餐 介绍 、 主卡 添加 / 取消 副 卡 短信 办理 方式 , 可 点击 文档 左上方 短信 图标 即可 将 短信 指令 发送给 客户',
             u'客户 申请 i 我家 , 家庭 畅享 计划 后 , 可 选择 设置 1 - 6 个 同一 归属 地 的 中国移动 网 内 号码 作为 亲情 号码 , 组建 一个 家庭 亲情 网 家庭 内 ',
             u'所有 成员 可 享受 本地 互打 免费 优惠 , 家庭 主卡 号码 还 可 享受 省内 / 国内 漫游 接听 免费 的 优惠']
# For each document, count how often every token occurs (raw term frequency).
# Counter(iterable) replaces the manual tf[word] += 1 loop, and print(...) is
# valid in both Python 2 and Python 3 (the original `print x` is Py2-only).
for doc in mydoclist:
    tf = Counter(doc.split())
    print(tf.items())
import string #allows for format()
# Derive the corpus vocabulary (the set of all distinct tokens).
def build_lexicon(corpus):
    """Return the set of unique whitespace-delimited tokens across *corpus*."""
    return {token for document in corpus for token in document.split()}
def tf(term, document):
    """Raw term frequency of *term* within *document* (delegates to freq)."""
    return freq(term, document)

def freq(term, document):
    """Number of whitespace-delimited tokens in *document* equal to *term*."""
    return sum(1 for token in document.split() if token == term)
vocabulary = build_lexicon(mydoclist)
doc_term_matrix = []
print('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']')
# Build one raw-count (tf) vector per document, ordered by the vocabulary.
# enumerate(..., 1) replaces mydoclist.index(doc)+1, which was an O(n) scan per
# iteration and would mis-number duplicate documents; prints are parenthesized
# for Python 2/3 compatibility, and the generator variable is renamed so it no
# longer reads as if it shadowed the freq() function.
for doc_no, doc in enumerate(mydoclist, 1):
    print('The doc is "' + doc + '"')
    tf_vector = [tf(word, doc) for word in vocabulary]
    tf_vector_string = ', '.join(format(count, 'd') for count in tf_vector)
    print('The tf vector for Document %d is [%s]' % (doc_no, tf_vector_string))
    doc_term_matrix.append(tf_vector)
print('All combined, here is our master document term matrix: ')
print(doc_term_matrix)
# 由于前面 只是计算出了 文章的空间向量,没有标准化;接下来标准化向量,使其L2范数为1
import math
import numpy as np
def l2_normalizer(vec):
    """Scale *vec* so its L2 (Euclidean) norm is 1; returns a new list."""
    norm = math.sqrt(np.sum([component ** 2 for component in vec]))
    return [component / norm for component in vec]
# Row-normalize the raw count matrix so each document vector has unit L2 norm.
# A comprehension replaces the explicit append loop; prints are parenthesized
# so they work under both Python 2 and Python 3.
doc_term_matrix_l2 = [l2_normalizer(vec) for vec in doc_term_matrix]
print('A regular old document term matrix: ')
print(np.matrix(doc_term_matrix))
print('\nA document term matrix with row-wise L2 norms of 1:')
print(np.matrix(doc_term_matrix_l2))
def numDocsContaining(word, doclist):
    """Document frequency: how many documents in *doclist* contain *word*."""
    # Membership in the token list is equivalent to freq(word, doc) > 0.
    return sum(1 for doc in doclist if word in doc.split())

# idf value of a term
def idf(word, doclist):
    """Inverse document frequency of *word*: log(N / (1 + df)).

    BUG FIX: the original returned np.log(n_samples / 1+df), which operator
    precedence parses as (n_samples / 1) + df — i.e. log(N + df), an idf that
    *grows* with document frequency. The denominator must be parenthesized,
    and made float so Python 2 integer division cannot truncate it.
    """
    n_samples = len(doclist)
    df = numDocsContaining(word, doclist)
    return np.log(n_samples / (1.0 + df))
# idf value for every vocabulary term, in vocabulary order.
my_idf_vector = [idf(word, mydoclist) for word in vocabulary]
# Prints parenthesized for Python 2/3 compatibility; the generator variable is
# renamed from `freq` so it no longer reads as shadowing the freq() function.
print('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']')
print('The inverse document frequency vector is [' + ', '.join(format(value, 'f') for value in my_idf_vector) + ']')
import numpy as np
def build_idf_matrix(idf_vector):
    """Embed *idf_vector* on the diagonal of an otherwise-zero square matrix."""
    # float cast keeps the float64 dtype the zeros/fill_diagonal version produced
    return np.diag(np.asarray(idf_vector, dtype=float))
my_idf_matrix = build_idf_matrix(my_idf_vector)
# tf-idf: multiply each document's tf row vector by the diagonal idf matrix,
# scaling every term count by that term's idf. Comprehensions replace the
# append loops; prints are parenthesized for Python 2/3 compatibility.
doc_term_matrix_tfidf = [np.dot(tf_vector, my_idf_matrix) for tf_vector in doc_term_matrix]
# Normalize each tf-idf row to unit L2 norm.
doc_term_matrix_tfidf_l2 = [l2_normalizer(tf_vector) for tf_vector in doc_term_matrix_tfidf]
print(vocabulary)
print(np.matrix(doc_term_matrix_tfidf_l2))  # np.matrix() just to make it easier to look at