知识融合中几种计算文本相似度的方法（代码）

阿拉辉

已于 2023-01-06 10:19:50 修改

阅读量1.3k

点赞数 2

分类专栏： NLP 知识图谱文章标签： nlp

于 2021-04-13 10:59:59 首次发布

本文链接：https://blog.csdn.net/weixin_38492159/article/details/115654971

版权

NLP 同时被 2 个专栏收录

6 篇文章 0 订阅

订阅专栏

知识图谱

3 篇文章 0 订阅

订阅专栏

本文介绍了知识融合中几种计算文本相似度的方法并给出代码：首先以刘德华相关的句子为例，用余弦相似度计算不同句子之间的相似度；接着介绍杰卡德系数，用词汇交集与并集之比衡量两个文本片段的相关程度；随后用TF-IDF模型计算文档之间的相似度；最后给出编辑距离（Levenshtein）的计算示例，并展示如何通过bert-as-service获取BERT句向量，在候选句子中检索最相似的句子。

摘要由CSDN通过智能技术生成

1.余弦相似度

import numpy as np
import jieba
# Read the stop-word list from disk.
def stopwordslist(filepath):
    """Return the stop words stored in *filepath*, one entry per line.

    :param filepath: path to a UTF-8 text file with one stop word per line
    :return: list of lines with surrounding whitespace stripped, in file
        order (blank lines are kept as empty strings)
    """
    # The context manager closes the file even if decoding fails part-way;
    # the original left the handle open until garbage collection.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]


# Load the stop words once at module level; cosine_similarity reads this
# global when filtering tokens. The path is relative to the working directory.
stopwords = stopwordslist("cn_stopwords.txt")


def cosine_similarity(sentence1: str, sentence2: str) -> float:
    """Return the cosine similarity of two sentences, rescaled to [0.5, 1].

    Both sentences are segmented with jieba, tokens found in the
    module-level ``stopwords`` are dropped, and each sentence is turned into
    a term-count vector over the union vocabulary. The cosine of the two
    vectors is then mapped with ``0.5 + 0.5 * cos``.

    :param sentence1: first sentence
    :param sentence2: second sentence
    :return: similarity score; 0.5 means no shared terms. 0.5 is also
        returned when either sentence has no tokens left after stop-word
        removal (the original produced ``nan`` from a 0/0 division there).
    """
    from collections import Counter

    # Counting once per sentence avoids calling list.count for every
    # vocabulary word, which was quadratic in the sentence length.
    counts_1 = Counter(word for word in jieba.cut(sentence1) if word not in stopwords)
    counts_2 = Counter(word for word in jieba.cut(sentence2) if word not in stopwords)

    # Shared vocabulary: every distinct token seen in either sentence.
    vocabulary = list(counts_1.keys() | counts_2.keys())
    vec_1 = np.array([counts_1[word] for word in vocabulary])
    vec_2 = np.array([counts_2[word] for word in vocabulary])

    # Cosine formula: dot product over the product of the Euclidean norms.
    denom = np.linalg.norm(vec_1) * np.linalg.norm(vec_2)
    if denom == 0:
        # A zero vector has no direction; treat it as orthogonal (cos = 0).
        return 0.5
    cos = vec_1.dot(vec_2) / denom
    return 0.5 + 0.5 * cos

# Demo: compare one biography sentence against an unrelated sentence (str2)
# and against a near-duplicate that uses a nickname (str3).
str1 = "刘德华1961年9月27日出生于中国香港，籍贯广东新会，华语影视男演员、歌手、制片人、作词人。"
str2 = "华仔参加了安徽国剧盛典颁奖晚会"
str3 = "华仔1961年9月27日出生于中国香港，籍贯广东新会，华语影视男演员、歌手、制片人、作词人。1981年出演电影处女作《彩云曲》"

sim1, sim2 = cosine_similarity(str1, str2), cosine_similarity(str1, str3)

print("sim1 ：", sim1)
print("sim2:", sim2)

结果

sim1 ： 0.5
sim2: 0.908248290463863

2.杰卡德系数（Jaccard）

import jieba

def Jaccrad(model, reference):
    """Return the Jaccard coefficient of two texts over their jieba tokens.

    Each text is segmented with ``jieba.cut`` (default accurate mode) and
    reduced to its set of distinct tokens; the coefficient is
    ``|intersection| / |union|`` of the two sets.

    :param model: candidate sentence
    :param reference: source sentence
    :return: float in [0, 1]. 0.0 is returned when neither text yields any
        token, since there is no overlap to measure (the original raised
        ZeroDivisionError in that case).
    """
    # Sets deduplicate tokens; switch to lists if repeats should count.
    grams_reference = set(jieba.cut(reference))
    grams_model = set(jieba.cut(model))

    union_size = len(grams_model | grams_reference)
    if union_size == 0:
        return 0.0
    return len(grams_model & grams_reference) / union_size

# Demo: two Apple Inc. descriptions (a, b) and one sentence about the fruit (c).
a = "苹果（Apple Inc. ）是美国一家高科技公司。由史蒂夫·乔布斯、斯蒂夫·盖瑞·沃兹尼亚克和罗纳德·杰拉尔德·韦恩（Ron Wayne）等人于1976年4月1日创立"
b = "苹果公司创立之初，主要开发和销售的个人电脑，截至2014年致力于设计、开发和销售消费电子、计算机软件、在线服务和个人计算机"
c = "红富士苹果吃起来很香甜"

jaccard_1, jaccard_2 = Jaccrad(a, b), Jaccrad(a, c)

print(jaccard_1)
print(jaccard_2)

结果

0.06896551724137931
0.022727272727272728

3.TFIDF

import jieba
from gensim import corpora,models,similarities
# Read the stop-word list from disk.
def stopwordslist(filepath):
    """Return the stop words stored in *filepath*, one entry per line.

    :param filepath: path to a UTF-8 text file with one stop word per line
    :return: list of lines with surrounding whitespace stripped, in file
        order (blank lines are kept as empty strings)
    """
    # The context manager closes the file even if decoding fails part-way;
    # the original left the handle open until garbage collection.
    with open(filepath, 'r', encoding='utf-8') as stopword_file:
        return [line.strip() for line in stopword_file]

# Load the stop words once at module level; gensimSimilarities reads this
# global when filtering tokens. The path is relative to the working directory.
stopwords = stopwordslist("cn_stopwords.txt")

# Sample documents for the TF-IDF demo.
str1 = "重庆是一个好地方"
str2 = "重庆好吃的在哪里"
str3 = "重庆是好地方"

def gensimSimilarities(str1, str2, extra_docs=None):
    """Return the pairwise TF-IDF cosine-similarity matrix of the documents.

    Each document is segmented with jieba, tokens found in the module-level
    ``stopwords`` are dropped, and a gensim TF-IDF model is fitted on the
    resulting bag-of-words corpus. Every document is then queried against a
    ``SparseMatrixSimilarity`` index built from that same corpus.

    :param str1: first document
    :param str2: second document
    :param extra_docs: optional iterable of further documents. When omitted,
        the module-level ``str3`` is appended, which is what the original
        code did implicitly; pass ``[]`` to compare only ``str1`` and
        ``str2``.
    :return: square similarity matrix with one row and one column per
        document, ordered as ``str1, str2, *extra_docs``
    """
    if extra_docs is None:
        # Backward compatibility: the original body read the global str3
        # even though it was not a parameter.
        extra_docs = [str3]
    all_doc = [str1, str2, *extra_docs]

    # Tokenise every document and drop stop words.
    all_doc_list = [
        [word for word in jieba.cut(doc) if word not in stopwords]
        for doc in all_doc
    ]

    # Build the token -> id mapping, then the bag-of-words corpus.
    dictionary = corpora.Dictionary(all_doc_list)
    corpus = [dictionary.doc2bow(doc) for doc in all_doc_list]

    # Fit TF-IDF on the corpus and index the weighted vectors.
    tfidf = models.TfidfModel(corpus)
    index = similarities.SparseMatrixSimilarity(
        tfidf[corpus], num_features=len(dictionary)
    )
    return index[tfidf[corpus]]

# Compute and show the TF-IDF similarity matrix for the sample documents.
sim = gensimSimilarities(str1, str2)
print(sim)

结果

[[1.         0.         0.34624156]
[0.         1.         0.        ]
[0.34624156 0.         1.        ]]

4.编辑距离

import Levenshtein
# Demo: two Apple Inc. descriptions (a, b) and one sentence about the fruit (c).
a = "苹果（Apple Inc. ）是美国一家高科技公司。由史蒂夫·乔布斯、斯蒂夫·盖瑞·沃兹尼亚克和罗纳德·杰拉尔德·韦恩（Ron Wayne）等人于1976年4月1日创立"
b = "苹果公司创立之初，主要开发和销售的个人电脑，截至2014年致力于设计、开发和销售消费电子、计算机软件、在线服务和个人计算机"
c = "红富士苹果吃起来很香甜"
# The original also called Levenshtein.distance(a, b) and discarded the
# result; that dead call is removed, and the distance is printed once below.
print(Levenshtein.distance(b, a))  # Levenshtein edit distance
print(Levenshtein.jaro(b, a))  # Jaro similarity
print(Levenshtein.jaro_winkler(b, a))  # Jaro-Winkler similarity
print(Levenshtein.jaro_winkler(c, a))  # Jaro-Winkler similarity

结果

79
0.3993624772313297
0.5194899817850638
0.4018759018759019

5.bert-as-service

import numpy as np
from bert_serving.client import BertClient
# Connect to a bert-as-service server on localhost.
# NOTE(review): presumably the server must already be running — confirm.
bc = BertClient(ip='localhost')
topk = 3  # number of best-matching candidates to print

# Candidate sentences to rank against the query.
sentences = ['苹果（Apple Inc. ）是美国一家高科技公司。由史蒂夫·乔布斯、斯蒂夫·盖瑞·沃兹尼亚克和罗纳德·杰拉尔德·韦恩（Ron Wayne）等人于1976年4月1日创立',
             '苹果公司创立之初，主要开发和销售的个人电脑，截至2014年致力于设计、开发和销售消费电子、计算机软件、在线服务和个人计算机',
             '红富士苹果吃起来很香甜',
             '刘德华1961年9月27日出生于中国香港，籍贯广东新会，华语影视男演员、歌手、制片人、作词人',
             '华仔参加了安徽国剧盛典颁奖晚会',
             '华仔1961年9月27日出生于中国香港，籍贯广东新会，华语影视男演员、歌手、制片人、作词人。1981年出演电影处女作《彩云曲》'
             ]

# Encode the candidates and the single query sentence.
# NOTE(review): assumes encode() returns a 2-D array with one row per input
# sentence — confirm against the bert-as-service client documentation.
sentences_vec = bc.encode(sentences)
test_vec = bc.encode(['华仔出生于中国香港，主演电影《拆弹专家》'])

# Cosine similarity between the query and each candidate. The original
# divided by the constant 20 instead of the query norm; because that divisor
# is the same for every candidate the ranking is unchanged, but the scores
# are now true cosine values.
score = np.sum(test_vec * sentences_vec, axis=1) / (
    np.linalg.norm(sentences_vec, axis=1) * np.linalg.norm(test_vec)
)

# Indices of the topk highest scores, best first.
topk_idx = np.argsort(score)[::-1][:topk]
for idx in topk_idx:
    print('> %s\t%s' % (score[idx], sentences[idx]))