相似度计算的步骤
- 读取文档
- 对要计算的多篇文档进行分词
- 对分词后的文档整理成指定格式,方便后续计算
- 计算出词语的频率
- 【可选】对频率低的词语进行过滤
- 通过语料库建立词典
- 加载要对比的文档
- 将要对比的文档通过doc2bow转换为稀疏向量
- 对稀疏向量进行进一步处理,得到新语料库
- 将新语料库通过tf-idf模型进行处理,得到每篇文档的tf-idf向量表示
- 通过token2id得到特征数
- 稀疏矩阵相似度,从而建立索引
- 得到最终相似度结果
tf-idf模型:封装在gensim库中
本例还用到doc2bow(将分词结果转换为词袋稀疏向量)和collections.defaultdict(统计词频)
from gensim import corpora, models, similarities
import jieba
from collections import defaultdict
# Directory holding the corpus files.
path = 'C:/py/mp4/010.Python爬虫数据分析视频教程项目实战(整套原价899)/数据分析/文本相似度_盗墓笔记/data/'
# Read the two documents to be compared (UTF-8 text).
doc1 = path + 'dmbj.txt'
doc2 = path + 'gcd.txt'
# Context managers guarantee the file handles are closed
# (the originals were never closed).
with open(doc1, encoding='utf-8') as f:
    d1 = f.read()
with open(doc2, encoding='utf-8') as f:
    d2 = f.read()
# Segment each document into words.
words1 = jieba.cut(d1)
words2 = jieba.cut(d2)
# Normalize to the "词语1 词语2 ... 词语n " format expected below.
# ' '.join runs in linear time, unlike repeated `s += w + ' '` which is
# quadratic; the trailing space is kept to match the original output.
s1 = ' '.join(words1) + ' '
s2 = ' '.join(words2) + ' '
docs = [s1, s2]
# Tokenize each formatted document string back into a word list.
# str.split() already returns a list, so the original's extra
# `[w for w in doc.split()]` copy comprehension is unnecessary.
texts = [doc.split() for doc in docs]
# Count word frequencies across all documents; only consumed by the
# optional low-frequency filter below.
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1
# [optional] drop words appearing 3 times or fewer before building the dictionary
# texts = [[token for token in text if frequency[token] > 3]
# for text in texts]
# Build the token<->id dictionary from the corpus and persist it.
dict1 = corpora.Dictionary(texts)
dict1.save(path + 'dict.txt')
# Load the query document to compare against the corpus.
doc3 = path + 'ljm.txt'
# `with` closes the handle; the original leaked it.
with open(doc3, encoding='utf-8') as f:
    d3 = f.read()
# Segment and join with single spaces in linear time.  No trailing
# space is appended, so splitting below yields no spurious '' token
# (the original's split(' ') produced one; doc2bow silently dropped it).
new_doc = ' '.join(jieba.cut(d3))
# Sparse bag-of-words vector: list of (token_id, count) pairs over dict1.
new_vec = dict1.doc2bow(new_doc.split())
# Bag-of-words corpus for the reference documents.
corpus = list(map(dict1.doc2bow, texts))
# Fit a tf-idf weighting model on that corpus.
tfidf = models.TfidfModel(corpus)
# One feature per distinct token in the dictionary.
feature_num = len(dict1.token2id)
# Apply tf-idf weighting, then index the weighted corpus as a sparse
# similarity matrix.
weighted_corpus = tfidf[corpus]
index = similarities.SparseMatrixSimilarity(weighted_corpus, num_features=feature_num)
# Similarity of the tf-idf-weighted query vector against each indexed document.
sim = index[tfidf[new_vec]]
print(sim)