Word segmentation function
import jieba


def split_word(document):
    """Segment the document into words and drop stop words."""
    stop_words = {":", "的", ",", "”"}  # minimal stop-word set used by this demo
    text = []
    for word in jieba.cut(document):
        if word not in stop_words:
            text.append(word)
    return text
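A quick usage sketch (hedged: the exact token list depends on jieba's bundled dictionary and version, so only the call pattern is shown, not a guaranteed output):

tokens = split_word("IBM的区块链副总裁JesseLund:比特币将达到100万美元")
print(tokens)  # stop words such as "的" have been filtered out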
Computing document similarity with set intersection and union
The score below is the Jaccard index: the number of tokens two documents share divided by the number of distinct tokens across both, so documents with nothing in common score 0.00 and identical documents score 1.00.
from itertools import combinations

documents = [
    "窝趣公寓完成近2亿元B轮融资主打品质和轻松社交的居住环境",
    "IBM的区块链副总裁JesseLund:比特币将达到100万美元",
    "窝趣公寓完成近2亿元B轮融资"
]

# Compute the similarity of every pairwise combination of documents
for doc1, doc2 in combinations(documents, 2):
    words1_set = set(split_word(doc1))
    words2_set = set(split_word(doc2))
    # Jaccard index: shared tokens divided by all distinct tokens
    similar12 = len(words1_set & words2_set) / len(words1_set | words2_set)
    print("{:.2f}".format(similar12), doc1, doc2)
Results
0.00 窝趣公寓完成近2亿元B轮融资主打品质和轻松社交的居住环境 IBM的区块链副总裁JesseLund:比特币将达到100万美元
0.53 窝趣公寓完成近2亿元B轮融资主打品质和轻松社交的居住环境 窝趣公寓完成近2亿元B轮融资
0.00 IBM的区块链副总裁JesseLund:比特币将达到100万美元 窝趣公寓完成近2亿元B轮融资
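For reuse, the intersection-over-union step can be factored into a standalone helper. This is a minimal sketch rather than code from the original; the name jaccard_similarity and the empty-input guard are assumptions added here.

def jaccard_similarity(tokens_a, tokens_b):
    """Jaccard index of two token lists: |A & B| / |A | B|."""
    set_a, set_b = set(tokens_a), set(tokens_b)
    if not set_a and not set_b:
        return 0.0  # convention chosen here; the original loop never hits this case
    return len(set_a & set_b) / len(set_a | set_b)

# Worked check: {"a", "b", "c"} and {"b", "c", "d"} share 2 of the
# 4 distinct tokens, so the score is 2 / 4 = 0.50.
print("{:.2f}".format(jaccard_similarity(["a", "b", "c"], ["b", "c", "d"])))  # 0.50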