1、原理
文本相似度的度量有很多种方法:特定词出现频度、整体文本风格等。本文将使用 tf-idf 方式,通过 cosine 相似度度量两个文本的相似度。tf 为词频(term frequency);idf 为逆文档频率(inverse document frequency),等于 1 加上「所有文档的数目 / 包含该单词的文档数目」的对数:
1+log(doc_num/doc_contain_thisWord_num)
每个单词的词频逆文档频率的计算方法为tf[word] * idf[word]
将所有文档中的单词构成一个词典,每个单词用一个长度为文档数的向量表示,向量中每一个值的含义如下:如果该单词出现在某文档中,对应位置就用该单词在该文档中的 tf-idf 值表示;如果该单词未出现在该文档中,则用 0 表示。
文本相似度可以表示为:

$$similarity = \frac{\sum{A * B}}{\sqrt{\sum{A^2}} \cdot \sqrt{\sum{B^2}}}$$
2、代码
这段代码我就不添加太多注释了,大多可以直接理解的。

import nltk
import math
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#创建类
class TextSimilarityExample:
    """Demonstrate TF-IDF based text similarity.

    Builds hand-rolled TF, IDF, and TF-IDF vectors over a small toy
    corpus, then cross-checks with scikit-learn's ``TfidfVectorizer``
    and ``cosine_similarity``.
    """

    def __init__(self):
        # Toy document collection.
        # NOTE(review): attribute name `statments` (sic) is kept as-is so
        # external code reading it keeps working.
        self.statments = [
            "ruled india",
            "So many kindom rlued india",
            "Chalukyas ruled inda",
            "your kindom is a good king"
        ]

    def TF(self, sentence):
        """Return a ``{token: term_frequency}`` dict for one sentence.

        Term frequency is the token's count divided by the total number
        of tokens in the sentence.
        """
        words = nltk.word_tokenize(sentence.lower())
        freq = nltk.FreqDist(words)
        total = float(len(words))
        return {word: count / total for word, count in freq.items()}

    def IDF(self):
        """Return a ``{token: inverse_document_frequency}`` dict.

        Uses the standard formula ``1 + log(N / df)``: a word appearing
        in every document gets idf == 1.0, rarer words get larger values.
        """
        def idf(total_docs, docs_with_word):
            # BUG FIX: the original used `1.0 - math.log(...)`, which
            # assigns *negative* weights to rare words; the article's own
            # formula (and the standard one) is 1 + log(N / df).
            return 1.0 + math.log(total_docs / docs_with_word)

        number_of_docs = len(self.statments)
        doc_freq = {}
        for sentence in self.statments:
            # BUG FIX: document frequency must count *documents* that
            # contain the word, not total occurrences — deduplicate the
            # tokens of each sentence with set() before counting.
            for word in set(nltk.word_tokenize(sentence.lower())):
                doc_freq[word] = doc_freq.get(word, 0) + 1
        return {word: idf(number_of_docs, df) for word, df in doc_freq.items()}

    def TF_IDF(self, query):
        """Return ``{query_token: [tf-idf value per document]}`` vectors.

        Tokens absent from a document (or from the corpus) contribute 0.0.
        """
        words = nltk.word_tokenize(query.lower())
        idf_values = self.IDF()
        vectors = {}
        for sentence in self.statments:
            tf = self.TF(sentence)
            for word in words:
                weight = tf.get(word, 0.0) * idf_values.get(word, 0.0)
                vectors.setdefault(word, []).append(weight)
        return vectors

    def displayVectors(self, vectors):
        """Print the corpus, then each word with its per-document vector."""
        print(self.statments)
        for word, vector in vectors.items():
            print("{} --> {}".format(word, vector))

    def cosineSimilarity(self):
        """Print each document's cosine similarity against the whole corpus,
        computed with scikit-learn as a cross-check of the manual math."""
        vectorizer = TfidfVectorizer()
        matrix = vectorizer.fit_transform(self.statments)
        # Generalized: iterate over however many documents the corpus has
        # instead of the original hard-coded range(1, 5).
        for i in range(len(self.statments)):
            print("\t similarity of document {} with others".format(i))
            # matrix[i:i + 1] keeps the row as a 2-D (1, n_features) matrix,
            # which is what cosine_similarity expects.
            similarity = cosine_similarity(matrix[i:i + 1], matrix)
            print(similarity)

    def demo(self):
        """Show the hand-rolled TF-IDF vectors, then sklearn similarities."""
        query = self.statments[0]
        vectors = self.TF_IDF(query)
        self.displayVectors(vectors)
        self.cosineSimilarity()
if __name__ == "__main__":
    # Run the full demo when executed as a script.
    example = TextSimilarityExample()
    example.demo()