Computing Text Similarity with Gensim

This article walks through three gensim-based implementations (jieba segmentation, a TF-IDF model, and a similarity index) plus a hand-rolled cosine-similarity variant built on jieba keyword weights.

Implementation 1

import jieba
from gensim import corpora, models, similarities

if __name__ == '__main__':
    base_data = [
        "好雨知时节,当春乃发生。随风潜入夜,润物细无声。野径云俱黑,江船火独明。晓看红湿处,花重锦官城。",
        "君问归期未有期,巴山夜雨涨秋池。何当共剪西窗烛,却话巴山夜雨时。",
        "莫听穿林打叶声,何妨吟啸且徐行。竹杖芒鞋轻胜马,谁怕?一蓑烟雨任平生。料峭春风吹酒醒,微冷,山头斜照却相迎。回首向来萧瑟处,归去,也无风雨也无晴。",
        "天街小雨润如酥,草色遥看近却无。最是一年春好处,绝胜烟柳满皇都。",
        "古木阴中系短篷,杖藜扶我过桥东。沾衣欲湿杏花雨,吹面不寒杨柳风。",
        "少年听雨歌楼上。红烛昏罗帐。壮年听雨客舟中。江阔云低、断雁叫西风。 而今听雨僧庐下。鬓已星星也。悲欢离合总无情。一任阶前、点滴到天明。",
        "雨里鸡鸣一两家,竹溪村路板桥斜。妇姑相唤浴蚕去,闲看中庭栀子花。",
        "一夕轻雷落万丝,霁光浮瓦碧参差。有情芍药含春泪,无力蔷薇卧晓枝。"
    ]
    # 1. Segment each document in base_data into a list of tokens
    base_items = [jieba.lcut(item) for item in base_data]
    print(base_items)
    # 2. Build the dictionary (token -> integer id)
    dictionary = corpora.Dictionary(base_items)
    # 3. Convert each token list into a sparse bag-of-words vector via doc2bow; together these form the corpus
    corpus = [dictionary.doc2bow(item) for item in base_items]
    # 4. Train a TF-IDF model on the bag-of-words corpus
    tf = models.TfidfModel(corpus)
    # 5. The feature count is the number of distinct tokens in the dictionary
    num_features = len(dictionary.token2id)
    # 6. Build a similarity index over the TF-IDF-weighted corpus
    index = similarities.MatrixSimilarity(tf[corpus], num_features=num_features)
    # 7. Prepare the query text
    test_text = "风雨凄凄,鸡鸣喈喈。既见君子,云胡不夷。风雨潇潇,鸡鸣胶胶。既见君子,云胡不瘳。风雨如晦,鸡鸣不已。既见君子,云胡不喜。"
    test_words = jieba.lcut(test_text)
    print(test_words)
    # 8. Convert the query into a sparse bag-of-words vector
    new_vec = dictionary.doc2bow(test_words)
    # 9. Compute the query's similarity against every indexed document
    sims = index[tf[new_vec]]
    print(list(sims))

[['好雨知', '时节', ',', '当春', '乃', '发生', '。', '随风潜入夜', ',', '润物细无声', '。', '野径', '云俱黑', ',', '江船', '火独明', '。', '晓看', '红湿处', ',', '花重锦', '官城', '。'], ['君问', '归期', '未有', '期', ',', '巴山夜雨', '涨秋池', '。', '何当', '共', '剪', '西窗', '烛', ',', '却', '话', '巴山夜雨', '时', '。'], ['莫听', '穿林', '打叶声', ',', '何妨', '吟啸且', '徐行', '。', '竹杖芒', '鞋', '轻胜马', ',', '谁', '怕', '?', '一', '蓑', '烟雨任', '平生', '。', '料峭', '春风', '吹', '酒醒', ',', '微冷', ',', '山头', '斜照', '却', '相迎', '。', '回首', '向来', '萧瑟处', ',', '归去', ',', '也', '无', '风雨', '也无晴', '。'], ['天街', '小雨', '润', '如', '酥', ',', '草色', '遥看', '近', '却', '无', '。', '最是', '一年', '春', '好处', ',', '绝胜', '烟柳', '满皇', '都', '。'], ['古木', '阴中系', '短篷', ',', '杖', '藜', '扶', '我', '过桥', '东', '。', '沾衣', '欲', '湿', '杏花', '雨', ',', '吹面', '不寒', '杨柳风', '。'], ['少年', '听雨歌', '楼上', '。', '红烛', '昏罗帐', '。', '壮年', '听雨', '客舟', '中', '。', '江阔', '云低', '、', '断雁叫', '西风', '。', ' ', '而今', '听雨僧', '庐下', '。', '鬓', '已', '星星', '也', '。', '悲欢离合', '总', '无情', '。', '一任', '阶前', '、', '点滴', '到', '天明', '。'], ['雨里', '鸡鸣', '一两家', ',', '竹溪', '村路', '板桥', '斜', '。', '妇姑', '相唤', '浴蚕', '去', ',', '闲', '看中', '庭', '栀子花', '。'], ['一夕', '轻雷落', '万丝', ',', '霁', '光', '浮瓦', '碧', '参差', '。', '有情', '芍药', '含春泪', ',', '无力', '蔷薇', '卧晓枝', '。']]
['风雨凄凄', ',', '鸡鸣', '喈', '喈', '。', '既见', '君子', ',', '云', '胡不夷', '。', '风', '雨潇潇', ',', '鸡鸣', '胶胶', '。', '既见', '君子', ',', '云胡', '不', '瘳', '。', '风雨如晦', ',', '鸡鸣不已', '。', '既见', '君子', ',', '云', '胡不喜', '。']
[0.012518234, 0.006028821, 0.013035861, 0.0059479754, 0.0058894763, 0.0, 0.25966725, 0.0064892126]
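
In the output, the seventh poem (index 6) stands out at roughly 0.26 because it shares the token 鸡鸣 with the query. As a small follow-up sketch, not part of the original listing, the scores can be sorted so the best match prints first:

# Rank the documents by similarity, best match first (reuses `sims` from above)
ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
for doc_id, score in ranked:
    print('doc %d: %.4f' % (doc_id, score))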

Implementation 2

from jieba import lcut
from gensim.similarities import SparseMatrixSimilarity
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# The document collection and the search keyword
texts = ['吃鸡这里所谓的吃鸡并不是真的吃鸡,也不是谐音词刺激的意思',
         '而是出自策略射击游戏《绝地求生:大逃杀》里的台词',
         '我吃鸡翅,你吃鸡腿']
keyword = '玩过吃鸡?今晚一起吃鸡'
# 1. Segment every document in the collection into a token list
texts = [lcut(text) for text in texts]
# 2. Build the dictionary from the token lists and get the feature count
dictionary = Dictionary(texts)
num_features = len(dictionary.token2id)
# 3.1 Use the dictionary to turn each token list into a sparse bag-of-words vector; together these form the corpus
corpus = [dictionary.doc2bow(text) for text in texts]
# 3.2 Likewise, turn the search keyword into a sparse vector
kw_vector = dictionary.doc2bow(lcut(keyword))
# 4. Create a TF-IDF model, trained on the corpus
tfidf = TfidfModel(corpus)
# 5. Apply the trained TF-IDF model to the searched documents and the keyword
tf_texts = tfidf[corpus]  # the corpus itself serves as the searched document set
tf_kw = tfidf[kw_vector]
# 6. Similarity computation
sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
similarities = sparse_matrix.get_similarities(tf_kw)
for e, s in enumerate(similarities, 1):
    print('similarity between kw and text%d: %.2f' % (e, s))

similarity between kw and text1: 0.65
similarity between kw and text2: 0.00
similarity between kw and text3: 0.12
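
text2 scores 0.00 because, after segmentation, it shares no token with the keyword. Note also that get_similarities is the index's lower-level method; the more common gensim idiom is to index the similarity object with the query, as in the minimal sketch below. Since TfidfModel normalizes its output vectors by default, the values should come out the same here:

# Standard gensim usage: index the similarity object with the query vector
for e, s in enumerate(sparse_matrix[tf_kw], 1):
    print('similarity between kw and text%d: %.2f' % (e, s))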

Implementation 3

from gensim import corpora, models, similarities
import jieba

text1 = '无痛人流并非无痛'
text2 = '北方人流浪到南方'
texts = [text1, text2]
keyword = '无痛人流'
# Segment the texts, build the dictionary, and convert everything to bag-of-words vectors
texts = [jieba.lcut(text) for text in texts]
dictionary = corpora.Dictionary(texts)
num_features = len(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
new_vec = dictionary.doc2bow(jieba.lcut(keyword))
# Similarity computation
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features)
print('\nTF-IDF sparse vectors of the corpus:')
for i in tfidf[corpus]:
    print(i)
print('\nTF-IDF sparse vector of the keyword:')
print(tfidf[new_vec])
print('\nSimilarity:')
sim = index[tfidf[new_vec]]
for i in range(len(sim)):
    print(f'similarity to sentence {i + 1}: {sim[i]}')

TF-IDF sparse vectors of the corpus:
[(0, 0.4082482904638631), (1, 0.4082482904638631), (2, 0.8164965809277261)]
[(3, 0.4472135954999579), (4, 0.4472135954999579), (5, 0.4472135954999579), (6, 0.4472135954999579), (7, 0.4472135954999579)]

TF-IDF sparse vector of the keyword:
[(0, 0.7071067811865475), (2, 0.7071067811865475)]

Similarity:
similarity to sentence 1: 0.8660254
similarity to sentence 2: 0.0
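
These numbers can be verified by hand. The two sentences share no tokens, so every token occurs in exactly one of the two documents, all IDF weights are equal, and each TF-IDF vector reduces to its length-normalized token counts: text1 segments into 无痛/人流/并非/无痛 (counts 1, 1, 2, hence 1/√6 ≈ 0.408 and 2/√6 ≈ 0.816), while the keyword segments into 无痛/人流 (1/√2 ≈ 0.707 each). Only ids 0 and 2 overlap, and because both vectors have unit length, the cosine similarity is just their dot product:

# Dot product over the shared ids 0 and 2 of the two unit-length vectors
dot = 0.4082482904638631 * 0.7071067811865475 + 0.8164965809277261 * 0.7071067811865475
print(dot)  # 0.8660254..., i.e. sqrt(3)/2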

Cosine Similarity Calculation

import jieba
import jieba.analyse


def words2vec(words1=None, words2=None):
    # Build two aligned weight vectors over the union of both texts' keywords
    v1 = []
    v2 = []
    # Extract keywords with their TF-IDF weights from each text
    tag1 = jieba.analyse.extract_tags(words1, withWeight=True)
    tag2 = jieba.analyse.extract_tags(words2, withWeight=True)
    tag_dict1 = {i[0]: i[1] for i in tag1}
    tag_dict2 = {i[0]: i[1] for i in tag2}
    # The union of the two keyword sets defines the vector dimensions
    merged_tag = set(tag_dict1.keys()) | set(tag_dict2.keys())
    for i in merged_tag:
        # Use the keyword's weight where present, 0 otherwise
        v1.append(tag_dict1.get(i, 0))
        v2.append(tag_dict2.get(i, 0))
    return v1, v2


def cosine_similarity(vector1, vector2):
    # Standard cosine: dot(v1, v2) / (|v1| * |v2|), scaled to a percentage
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        # At least one zero vector: the cosine is undefined, so return 0
        return 0
    return round(dot_product / ((normA ** 0.5) * (normB ** 0.5)) * 100, 2)


def cosine(str1, str2):
    vec1, vec2 = words2vec(str1, str2)
    return cosine_similarity(vec1, vec2)


print(cosine('我喜欢哈哈哈', '我哈哈哈'))
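
For comparison, here is a minimal vectorized sketch of the same formula (assuming numpy is available; cosine_np is not part of the original code):

import numpy as np

def cosine_np(v1, v2):
    # Vectorized equivalent of cosine_similarity above, as a percentage
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return 0 if denom == 0 else round(float(v1 @ v2) / denom * 100, 2)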