提要
- 最大的收获就是实现了自己的TFIDF,这样的情况下,进行文本相似度的分析就更加好了,本篇博客主要内容会包括
- 自己的TFIDF算法的实现
- 一个使用自己TFIDF的余弦相似度算法的实现
本周总结
1、TFIDF算法的实现,主要思路就是根据tf-idf算法的内容进行程序的设计,然后提供必要的接口方便别的程序的调用。
这部分代码借鉴了知乎中的一篇文章,给出地址:
https://zhuanlan.zhihu.com/p/26766008
详细的实现方式可以借鉴上述文章,不过他的是英文版,我的是中文版。
import math
from collections import Counter
import jieba
#标点符合去掉
excludes = {',','。','/','《','》','?',';','‘',':','“','【','】','{','}',
'、','|','!','@','#','¥','%','……','&','*','(',')','-','=',
'——','+','·','~','”',
',','.','/','<','>','?',';','\'',':','"','[',']','{','}','\\','|',
'~','`','!','@','#','$','%','^','&','*','(',')','_','-','+','=',
' ','\n'}
# tfidf 算法
def tf(word,count):
return count[word]/sum(count.values())
def n_containing(word,count_list):
return sum(1 for count in count_list if word in count)
def idf(word,count_list):
return math.log10(len(count_list)/(1+n_containing(word,count_list)))
def tfidf(word,count,count_list):
return tf(word,count)*idf(word,count_list)
def getThetxt(str):
d1 = open(str,encoding='utf-8').read()
for word in excludes:
d1 = d1.replace(word,'')
#print(d1)
return d1
#数据处理,将字符串分词
def dataprocess(data):
sentence_list = [word for word in jieba.cut(data)]
count = Counter(sentence_list)
return count
#计算数据库中第location+1的TFIDF
# location 是int database 是list [获得字符串,去掉标点,尚未分词]
def getTheTFIDF(location,database):
if location>=len(database):
print("location is wrong!")
return ;
wordlist = []
for data in database :
wordlist.append(dataprocess(data))
for i,count in enumerate(wordlist):
if i==location:
print("Top words in document{}:".format(i + 1))
# 计算每个词的tfidf
# 出现负值,原因 idf是 负的,所有文档中都有,所以应该挑选所有大于0的
# scores = {word: tfidf(word, count, wordlist) for word in count}
scores = {}
for word in count:
width = tfidf(word, count, wordlist)
if width > 0:
scores[word] = width
else:
scores[word] = 0
sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)
for word, score in sorted_words[:10]:
print('\tWord:{},TF-IDF:{}'.format(word, round(score, 5)))
return sorted_words;
2、通过上述算法实现自己的TFIDF-cosine,但是运行结果不尽人意,因为自己的库太小了,所以导致idf的计算不合适。基本的思路和之前的余弦相似度类似。
import math
from tfidfOfM import getTheTFIDF
excludes = {',','。','/','《','》','?',';','‘',':','“','【','】','{','}',
'、','|','!','@','#','¥','%','……','&','*','(',')','-','=',
'——','+','·','~','”',
',','.','/','<','>','?',';','\'',':','"','[',']','{','}','\\','|',
'~','`','!','@','#','$','%','^','&','*','(',')','_','-','+','=',
' ','\n'}
def getThetxt(str):
d1 = open(str,encoding='utf-8').read()
for word in excludes:
d1 = d1.replace(word,'')
return d1
def compute_cosine(dic_f, dic_s):
# print("dic_f:{}".format(dic_f))
# print("dic_s:{}".format(dic_s))
first_dict = {}
for word in dic_f:
first_dict[word[0]] = word[1]
second_dict = {}
for word in dic_s:
second_dict[word[0]] = word[1]
#得到词向量(两者的所有的)
keyWords = []
for i in range(len(dic_f)):
keyWords.append(dic_f[i][0]) #添加数组(都是字)
for i in range(len(dic_s)):
if dic_s[i][0] in keyWords: #有的话啥也不干
pass
else: #没有就加入
keyWords.append(dic_s[i][0])
vect_f = []
vect_s = []
for word in keyWords:
if word in first_dict:
vect_f.append(first_dict[word])
else:
vect_f.append(0)
if word in second_dict:
vect_s.append(second_dict[word])
else:
vect_s.append(0)
print("vect_f:{}".format(vect_f))
print("vect_s:{}".format(vect_s))
#开始计算余弦相似度
sum = 0
sq1 = 0
sq2 = 0
for i in range(len(vect_f)):
sum += vect_f[i] * vect_s[i]
sq1 += pow(vect_f[i],2)
sq2 += pow(vect_s[i],2)
try: #round() 四舍五入,结果保留两位小数
result = round(float(sum)/(math.sqrt(sq1)*math.sqrt(sq2)),2)
except ZeroDivisionError:
result = 0.0
return result
if __name__ == '__main__':
first = 'C:/Users/60917/Desktop/a.txt'
second = 'C:/Users/60917/Desktop/b.txt'
third = 'C:/Users/60917/Desktop/c.txt'
forth = 'C:/Users/60917/Desktop/d.txt'
# 获得字符串,去掉标点,尚未分词
f = getThetxt(first)
s = getThetxt(second)
t = getThetxt(third)
fo = getThetxt(forth)
database = [f,s,t,fo ]
Tfidf_list = []
for i in range(len(database)):
Tfidf_list.append(getTheTFIDF(i,database))
print(compute_cosine(Tfidf_list[0],Tfidf_list[1]))
print(compute_cosine(Tfidf_list[1],Tfidf_list[2]))
下周期望
1、代码相似度的判断,至少阅读论文《Comparison and Evaluation of Clone Detection Tools》了解可行性,然后总结实现方式。