Computing Text Similarity with Gensim

This article walks through three gensim-based implementations (jieba segmentation, a TF-IDF model, and a similarity index) plus a hand-rolled cosine-similarity variant built on jieba keyword weights.

Implementation 1

import jieba
from gensim import corpora, models, similarities

if __name__ == '__main__':
    base_data = [
        "好雨知时节,当春乃发生。随风潜入夜,润物细无声。野径云俱黑,江船火独明。晓看红湿处,花重锦官城。",
        "君问归期未有期,巴山夜雨涨秋池。何当共剪西窗烛,却话巴山夜雨时。",
        "莫听穿林打叶声,何妨吟啸且徐行。竹杖芒鞋轻胜马,谁怕?一蓑烟雨任平生。料峭春风吹酒醒,微冷,山头斜照却相迎。回首向来萧瑟处,归去,也无风雨也无晴。",
        "天街小雨润如酥,草色遥看近却无。最是一年春好处,绝胜烟柳满皇都。",
        "古木阴中系短篷,杖藜扶我过桥东。沾衣欲湿杏花雨,吹面不寒杨柳风。",
        "少年听雨歌楼上。红烛昏罗帐。壮年听雨客舟中。江阔云低、断雁叫西风。 而今听雨僧庐下。鬓已星星也。悲欢离合总无情。一任阶前、点滴到天明。",
        "雨里鸡鸣一两家,竹溪村路板桥斜。妇姑相唤浴蚕去,闲看中庭栀子花。",
        "一夕轻雷落万丝,霁光浮瓦碧参差。有情芍药含春泪,无力蔷薇卧晓枝。"
    ]
    # 1. Segment each document in base_data into a list of tokens
    base_items = [jieba.lcut(item) for item in base_data]
    print(base_items)
    # 2. Build the dictionary (token -> integer id)
    dictionary = corpora.Dictionary(base_items)
    # 3. Convert each token list into a sparse bag-of-words vector via doc2bow; together these form the corpus
    corpus = [dictionary.doc2bow(item) for item in base_items]
    # 4. Train a TF-IDF model on the bag-of-words corpus
    tf = models.TfidfModel(corpus)
    # 5. The feature count is the number of distinct tokens in the dictionary
    num_features = len(dictionary.token2id)
    # 6. Build a similarity index over the TF-IDF-weighted corpus
    index = similarities.MatrixSimilarity(tf[corpus], num_features=num_features)
    # 7. Prepare the query text
    test_text = "风雨凄凄,鸡鸣喈喈。既见君子,云胡不夷。风雨潇潇,鸡鸣胶胶。既见君子,云胡不瘳。风雨如晦,鸡鸣不已。既见君子,云胡不喜。"
    test_words = jieba.lcut(test_text)
    print(test_words)
    # 8. Convert the query into a sparse bag-of-words vector
    new_vec = dictionary.doc2bow(test_words)
    # 9. Compute the query's similarity against every indexed document
    sims = index[tf[new_vec]]
    print(list(sims))

[['好雨知', '时节', ',', '当春', '乃', '发生', '。', '随风潜入夜', ',', '润物细无声', '。', '野径', '云俱黑', ',', '江船', '火独明', '。', '晓看', '红湿处', ',', '花重锦', '官城', '。'], ['君问', '归期', '未有', '期', ',', '巴山夜雨', '涨秋池', '。', '何当', '共', '剪', '西窗', '烛', ',', '却', '话', '巴山夜雨', '时', '。'], ['莫听', '穿林', '打叶声', ',', '何妨', '吟啸且', '徐行', '。', '竹杖芒', '鞋', '轻胜马', ',', '谁', '怕', '?', '一', '蓑', '烟雨任', '平生', '。', '料峭', '春风', '吹', '酒醒', ',', '微冷', ',', '山头', '斜照', '却', '相迎', '。', '回首', '向来', '萧瑟处', ',', '归去', ',', '也', '无', '风雨', '也无晴', '。'], ['天街', '小雨', '润', '如', '酥', ',', '草色', '遥看', '近', '却', '无', '。', '最是', '一年', '春', '好处', ',', '绝胜', '烟柳', '满皇', '都', '。'], ['古木', '阴中系', '短篷', ',', '杖', '藜', '扶', '我', '过桥', '东', '。', '沾衣', '欲', '湿', '杏花', '雨', ',', '吹面', '不寒', '杨柳风', '。'], ['少年', '听雨歌', '楼上', '。', '红烛', '昏罗帐', '。', '壮年', '听雨', '客舟', '中', '。', '江阔', '云低', '、', '断雁叫', '西风', '。', ' ', '而今', '听雨僧', '庐下', '。', '鬓', '已', '星星', '也', '。', '悲欢离合', '总', '无情', '。', '一任', '阶前', '、', '点滴', '到', '天明', '。'], ['雨里', '鸡鸣', '一两家', ',', '竹溪', '村路', '板桥', '斜', '。', '妇姑', '相唤', '浴蚕', '去', ',', '闲', '看中', '庭', '栀子花', '。'], ['一夕', '轻雷落', '万丝', ',', '霁', '光', '浮瓦', '碧', '参差', '。', '有情', '芍药', '含春泪', ',', '无力', '蔷薇', '卧晓枝', '。']]
['风雨凄凄', ',', '鸡鸣', '喈', '喈', '。', '既见', '君子', ',', '云', '胡不夷', '。', '风', '雨潇潇', ',', '鸡鸣', '胶胶', '。', '既见', '君子', ',', '云胡', '不', '瘳', '。', '风雨如晦', ',', '鸡鸣不已', '。', '既见', '君子', ',', '云', '胡不喜', '。']
[0.012518234, 0.006028821, 0.013035861, 0.0059479754, 0.0058894763, 0.0, 0.25966725, 0.0064892126]
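
In the output, the seventh poem (index 6) stands out at roughly 0.26 because it shares the token 鸡鸣 with the query. As a small follow-up sketch, not part of the original listing, the scores can be sorted so the best match prints first:

# Rank the documents by similarity, best match first (reuses `sims` from above)
ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)
for doc_id, score in ranked:
    print('doc %d: %.4f' % (doc_id, score))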

Implementation 2

from jieba import lcut
from gensim.similarities import SparseMatrixSimilarity
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

# The document collection and the search keyword
texts = ['吃鸡这里所谓的吃鸡并不是真的吃鸡,也不是谐音词刺激的意思',
         '而是出自策略射击游戏《绝地求生:大逃杀》里的台词',
         '我吃鸡翅,你吃鸡腿']
keyword = '玩过吃鸡?今晚一起吃鸡'
# 1. Segment every document in the collection into a token list
texts = [lcut(text) for text in texts]
# 2. Build the dictionary from the token lists and get the feature count
dictionary = Dictionary(texts)
num_features = len(dictionary.token2id)
# 3.1 Use the dictionary to turn each token list into a sparse bag-of-words vector; together these form the corpus
corpus = [dictionary.doc2bow(text) for text in texts]
# 3.2 Likewise, turn the search keyword into a sparse vector
kw_vector = dictionary.doc2bow(lcut(keyword))
# 4. Create a TF-IDF model, trained on the corpus
tfidf = TfidfModel(corpus)
# 5. Apply the trained TF-IDF model to the searched documents and the keyword
tf_texts = tfidf[corpus]  # the corpus itself serves as the searched document set
tf_kw = tfidf[kw_vector]
# 6. Similarity computation
sparse_matrix = SparseMatrixSimilarity(tf_texts, num_features)
similarities = sparse_matrix.get_similarities(tf_kw)
for e, s in enumerate(similarities, 1):
    print('similarity between kw and text%d: %.2f' % (e, s))

similarity between kw and text1: 0.65
similarity between kw and text2: 0.00
similarity between kw and text3: 0.12
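
text2 scores 0.00 because, after segmentation, it shares no token with the keyword. Note also that get_similarities is the index's lower-level method; the more common gensim idiom is to index the similarity object with the query, as in the minimal sketch below. Since TfidfModel normalizes its output vectors by default, the values should come out the same here:

# Standard gensim usage: index the similarity object with the query vector
for e, s in enumerate(sparse_matrix[tf_kw], 1):
    print('similarity between kw and text%d: %.2f' % (e, s))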

Implementation 3

from gensim import corpora, models, similarities
import jieba

text1 = '无痛人流并非无痛'
text2 = '北方人流浪到南方'
texts = [text1, text2]
keyword = '无痛人流'
# Segment the texts, build the dictionary, and convert everything to bag-of-words vectors
texts = [jieba.lcut(text) for text in texts]
dictionary = corpora.Dictionary(texts)
num_features = len(dictionary.token2id)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
new_vec = dictionary.doc2bow(jieba.lcut(keyword))
# Similarity computation
index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features)
print('\nTF-IDF sparse vectors of the corpus:')
for i in tfidf[corpus]:
    print(i)
print('\nTF-IDF sparse vector of the keyword:')
print(tfidf[new_vec])
print('\nSimilarity:')
sim = index[tfidf[new_vec]]
for i in range(len(sim)):
    print(f'similarity to sentence {i + 1}: {sim[i]}')

TF-IDF sparse vectors of the corpus:
[(0, 0.4082482904638631), (1, 0.4082482904638631), (2, 0.8164965809277261)]
[(3, 0.4472135954999579), (4, 0.4472135954999579), (5, 0.4472135954999579), (6, 0.4472135954999579), (7, 0.4472135954999579)]

TF-IDF sparse vector of the keyword:
[(0, 0.7071067811865475), (2, 0.7071067811865475)]

Similarity:
similarity to sentence 1: 0.8660254
similarity to sentence 2: 0.0
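
These numbers can be verified by hand. The two sentences share no tokens, so every token occurs in exactly one of the two documents, all IDF weights are equal, and each TF-IDF vector reduces to its length-normalized token counts: text1 segments into 无痛/人流/并非/无痛 (counts 1, 1, 2, hence 1/√6 ≈ 0.408 and 2/√6 ≈ 0.816), while the keyword segments into 无痛/人流 (1/√2 ≈ 0.707 each). Only ids 0 and 2 overlap, and because both vectors have unit length, the cosine similarity is just their dot product:

# Dot product over the shared ids 0 and 2 of the two unit-length vectors
dot = 0.4082482904638631 * 0.7071067811865475 + 0.8164965809277261 * 0.7071067811865475
print(dot)  # 0.8660254..., i.e. sqrt(3)/2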

Cosine Similarity Calculation

import jieba
import jieba.analyse


def words2vec(words1=None, words2=None):
    # Build two aligned weight vectors over the union of both texts' keywords
    v1 = []
    v2 = []
    # Extract keywords with their TF-IDF weights from each text
    tag1 = jieba.analyse.extract_tags(words1, withWeight=True)
    tag2 = jieba.analyse.extract_tags(words2, withWeight=True)
    tag_dict1 = {i[0]: i[1] for i in tag1}
    tag_dict2 = {i[0]: i[1] for i in tag2}
    # The union of the two keyword sets defines the vector dimensions
    merged_tag = set(tag_dict1.keys()) | set(tag_dict2.keys())
    for i in merged_tag:
        # Use the keyword's weight where present, 0 otherwise
        v1.append(tag_dict1.get(i, 0))
        v2.append(tag_dict2.get(i, 0))
    return v1, v2


def cosine_similarity(vector1, vector2):
    # Standard cosine: dot(v1, v2) / (|v1| * |v2|), scaled to a percentage
    dot_product = 0.0
    normA = 0.0
    normB = 0.0
    for a, b in zip(vector1, vector2):
        dot_product += a * b
        normA += a ** 2
        normB += b ** 2
    if normA == 0.0 or normB == 0.0:
        # At least one zero vector: the cosine is undefined, so return 0
        return 0
    return round(dot_product / ((normA ** 0.5) * (normB ** 0.5)) * 100, 2)


def cosine(str1, str2):
    vec1, vec2 = words2vec(str1, str2)
    return cosine_similarity(vec1, vec2)


print(cosine('我喜欢哈哈哈', '我哈哈哈'))
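
For comparison, here is a minimal vectorized sketch of the same formula (assuming numpy is available; cosine_np is not part of the original code):

import numpy as np

def cosine_np(v1, v2):
    # Vectorized equivalent of cosine_similarity above, as a percentage
    v1 = np.asarray(v1, dtype=float)
    v2 = np.asarray(v2, dtype=float)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return 0 if denom == 0 else round(float(v1 @ v2) / denom * 100, 2)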