# Demo of sklearn.feature_extraction.text.TfidfVectorizer
# (the line below was a bare expression and would raise NameError at import time,
#  since `sklearn` itself is never imported as a name — kept only as a comment)
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
"""
tf-idf(t,d) = tf(t,d) * idf(t)
idf(t) = log(n_d / df(d,t)) + 1
smoothed version: idf(t) = log((1 + n_d) / (1 + df(d,t))) + 1

tf(t,d) is the term frequency: how often term t appears in document d, so it
depends on both the term and the document.
idf(t) is the inverse-document-frequency of term t; n_d is the number of
training documents and df(d,t) is the number of documents that contain t, so
idf depends only on the training corpus, not on the test document.

For short texts the tf part is almost always 0 or 1, so the numeric
differences come entirely from the idf values — and for a given term the idf
value is the same no matter which test text it appears in. idf merely assigns
each feature word a weight that down-weights common words and boosts rarer
ones. For short texts tf loses its meaning as a frequency, so the tf-idf VSM
degenerates into a one-hot representation multiplied by idf weights.
from: https://zhuanlan.zhihu.com/p/67883024
"""
# Unnormalized, smoothed tf-idf: raw term counts weighted by the smoothed idf.
tv = TfidfVectorizer(use_idf=True, smooth_idf=True, norm=None)
train = ["Chinese Beijing Chinese",
         "Chinese Chinese Shanghai",
         "Chinese Macao",
         "Tokyo Japan Chinese"]
tv_fit = tv.fit_transform(train)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() but fall back for older versions.
train_features = (tv.get_feature_names_out()
                  if hasattr(tv, "get_feature_names_out")
                  else tv.get_feature_names())
print(list(train_features))
print(tv_fit.toarray())
'''
Expected output:
['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']
[[1.91629073 2.         0.         0.         0.         0.        ]
 [0.         2.         0.         0.         1.91629073 0.        ]
 [0.         1.         0.         1.91629073 0.         0.        ]
 [0.         1.         1.91629073 0.         0.         1.91629073]]
'''
# transform() reuses the vocabulary and idf weights learned from `train`.
test = ["Chinese Chinese Chinese Tokyo Japan"]
test_fit = tv.transform(test)
print(list(train_features))
print(test_fit.toarray())
# Default settings except smooth_idf=False; norm defaults to 'l2',
# so every output row is scaled to unit Euclidean length.
tv = TfidfVectorizer(smooth_idf=False)
tv_fit = tv.fit_transform(train)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() but fall back for older versions.
feature_names = (tv.get_feature_names_out()
                 if hasattr(tv, "get_feature_names_out")
                 else tv.get_feature_names())
array = tv_fit.toarray()
print(list(feature_names))
print(array)
# Row-wise L2 norms: all 1.0, confirming the default norm='l2' scaling.
print(np.linalg.norm(array, axis=1, ord=2))
'''
Expected output:
['beijing', 'chinese', 'japan', 'macao', 'shanghai', 'tokyo']
[[0.76641418 0.64234672 0.         0.         0.         0.        ]
 [0.         0.64234672 0.         0.         0.76641418 0.        ]
 [0.         0.38649524 0.         0.9222914  0.         0.        ]
 [0.         0.28410924 0.67796827 0.         0.         0.67796827]]
[1. 1. 1. 1.]
'''