BERTScore——一种衡量两段文本相似度的方法

论文原文:BERTScore: Evaluating Text Generation with BERT (ICLR 2020), arXiv:1904.09675

github链接:https://github.com/Tiiiger/bert_score

BertScore是一种用于衡量文本相似度的指标,它基于BERT模型,通过计算两个句子在BERT模型中的embedding编码之间的cosine相似度,来评估它们的相似度。

BertScore的基本理念是将两个句子分别输入到预先训练好的BERT模型中,获取句子的语义表示,然后比较这些表示的相似程度。与其他基于词向量的相似度指标不同,BertScore考虑了上下文信息和语义信息,因此能够更准确地衡量句子之间的相似度,特别是对于语义相似但表达不同的句子,BertScore更能捕捉到它们之间的相似性。

1.安装:

pip install bert-score

2.使用

通过model_type来指定需要的模型,会自动从huggingface下载,支持的模型有:

model2layers = {
    "bert-base-uncased": 9,  # 0.6925188074454226
    "bert-large-uncased": 18,  # 0.7210358126642836
    "bert-base-cased-finetuned-mrpc": 9,  # 0.6721947475618048
    "bert-base-multilingual-cased": 9,  # 0.6680687802637132
    "bert-base-chinese": 8,
    "roberta-base": 10,  # 0.706288719158983
    "roberta-large": 17,  # 0.7385974720781534
    "roberta-large-mnli": 19,  # 0.7535618640417984
    "roberta-base-openai-detector": 7,  # 0.7048158349432633
    "roberta-large-openai-detector": 15,  # 0.7462770207355116
    "xlnet-base-cased": 5,  # 0.6630103662114238
    "xlnet-large-cased": 7,  # 0.6598800720297179
    "xlm-mlm-en-2048": 6,  # 0.651262570131464
    "xlm-mlm-100-1280": 10,  # 0.6475166424401905
    # "scibert-scivocab-uncased": 8,  # 0.6590354319927313
    # "scibert-scivocab-cased": 9,  # 0.6536375053937445
    # "scibert-basevocab-uncased": 9,  # 0.6748944832703548
    # "scibert-basevocab-cased": 9,  # 0.6524624150542374
    "allenai/scibert_scivocab_uncased": 8,  # 0.6590354393124127
    "allenai/scibert_scivocab_cased": 9,  # 0.6536374902465466
    "nfliu/scibert_basevocab_uncased": 9,  # 0.6748945076082333
    "distilroberta-base": 5,  # 0.6797558139322964
    "distilbert-base-uncased": 5,  # 0.6756659152782033
    "distilbert-base-uncased-distilled-squad": 4,  # 0.6718318036382493
    "distilbert-base-multilingual-cased": 5,  # 0.6178131050889238
    "albert-base-v1": 10,  # 0.654237567249745
    "albert-large-v1": 17,  # 0.6755890754323239
    "albert-xlarge-v1": 16,  # 0.7031844211905911
    "albert-xxlarge-v1": 8,  # 0.7508642218461096
    "albert-base-v2": 9,  # 0.6682455591837927
    "albert-large-v2": 14,  # 0.7008537594374035
    "albert-xlarge-v2": 13,  # 0.7317228357869254
    "albert-xxlarge-v2": 8,  # 0.7505160257184014
    "xlm-roberta-base": 9,  # 0.6506799445871697
    "xlm-roberta-large": 17,  # 0.6941551437476826
    "google/electra-small-generator": 9,  # 0.6659421842117754
    "google/electra-small-discriminator": 11,  # 0.6534639151385759
    "google/electra-base-generator": 10,  # 0.6730033453857188
    "google/electra-base-discriminator": 9,  # 0.7032089590812965
    "google/electra-large-generator": 18,  # 0.6813370013104459
    "google/electra-large-discriminator": 14,  # 0.6896675824733477
    "google/bert_uncased_L-2_H-128_A-2": 1,  # 0.5887998733228855
    "google/bert_uncased_L-2_H-256_A-4": 1,  # 0.6114863547661203
    "google/bert_uncased_L-2_H-512_A-8": 1,  # 0.6177345529192847
    "google/bert_uncased_L-2_H-768_A-12": 2,  # 0.6191261237956839
    "google/bert_uncased_L-4_H-128_A-2": 3,  # 0.6076202863798991
    "google/bert_uncased_L-4_H-256_A-4": 3,  # 0.6205239036810148
    "google/bert_uncased_L-4_H-512_A-8": 3,  # 0.6375351621856903
    "google/bert_uncased_L-4_H-768_A-12": 3,  # 0.6561849979644787
    "google/bert_uncased_L-6_H-128_A-2": 5,  # 0.6200458425360283
    "google/bert_uncased_L-6_H-256_A-4": 5,  # 0.6277501629539081
    "google/bert_uncased_L-6_H-512_A-8": 5,  # 0.641952305130849
    "google/bert_uncased_L-6_H-768_A-12": 5,  # 0.6762186226247106
    "google/bert_uncased_L-8_H-128_A-2": 7,  # 0.6186876506711779
    "google/bert_uncased_L-8_H-256_A-4": 7,  # 0.6447993208267708
    "google/bert_uncased_L-8_H-512_A-8": 6,  # 0.6489729408169956
    "google/bert_uncased_L-8_H-768_A-12": 7,  # 0.6705203359541737
    "google/bert_uncased_L-10_H-128_A-2": 8,  # 0.6126762064125278
    "google/bert_uncased_L-10_H-256_A-4": 8,  # 0.6376350032576573
    "google/bert_uncased_L-10_H-512_A-8": 9,  # 0.6579006292799915
    "google/bert_uncased_L-10_H-768_A-12": 8,  # 0.6861146692220176
    "google/bert_uncased_L-12_H-128_A-2": 10,  # 0.6184105693383591
    "google/bert_uncased_L-12_H-256_A-4": 11,  # 0.6374004994430261
    "google/bert_uncased_L-12_H-512_A-8": 10,  # 0.65880012149526
    "google/bert_uncased_L-12_H-768_A-12": 9,  # 0.675911357700092
    "amazon/bort": 0,  # 0.41927911053036643
    "facebook/bart-base": 6,  # 0.7122259132414092
    "facebook/bart-large": 10,  # 0.7448671872459683
    "facebook/bart-large-cnn": 10,  # 0.7393148105835096
    "facebook/bart-large-mnli": 11,  # 0.7531665445691358
    "facebook/bart-large-xsum": 9,  # 0.7496408866539556
    "t5-small": 6,  # 0.6813843919496912
    "t5-base": 11,  # 0.7096044814981418
    "t5-large": 23,  # 0.7244153820191929
    "vinai/bertweet-base": 9,  # 0.6529471006118857
    "microsoft/deberta-base": 9,  # 0.7088459455930344
    "microsoft/deberta-base-mnli": 9,  # 0.7395257063907247
    "microsoft/deberta-large": 16,  # 0.7511806792052013
    "microsoft/deberta-large-mnli": 18,  # 0.7736263649679905
    "microsoft/deberta-xlarge": 18,  # 0.7568670944373346
    "microsoft/deberta-xlarge-mnli": 40,  # 0.7780600929333213
    "YituTech/conv-bert-base": 10,  # 0.7058253551080789
    "YituTech/conv-bert-small": 10,  # 0.6544473011107349
    "YituTech/conv-bert-medium-small": 9,  # 0.6590097075123257
    "microsoft/mpnet-base": 8,  # 0.724976539498804
    "squeezebert/squeezebert-uncased": 9,  # 0.6543868703018726
    "squeezebert/squeezebert-mnli": 9,  # 0.6654799051284791
    "squeezebert/squeezebert-mnli-headless": 9,  # 0.6654799051284791
    "tuner007/pegasus_paraphrase": 15,  # 0.7188349436772694
    "google/pegasus-large": 8,  # 0.63960462272448
    "google/pegasus-xsum": 11,  # 0.6836878575233349
    "sshleifer/tiny-mbart": 2,  # 0.028246072231946733
    "facebook/mbart-large-cc25": 12,  # 0.6582922975802958
    "facebook/mbart-large-50": 12,  # 0.6464972230103133
    "facebook/mbart-large-en-ro": 12,  # 0.6791285137459857
    "facebook/mbart-large-50-many-to-many-mmt": 12,  # 0.6904136529270892
    "facebook/mbart-large-50-one-to-many-mmt": 12,  # 0.6847906439540236
    "allenai/led-base-16384": 6,  # 0.7122259170564179
    "facebook/blenderbot_small-90M": 7,  # 0.6489176335400088
    "facebook/blenderbot-400M-distill": 2,  # 0.5874774070540008
    "microsoft/prophetnet-large-uncased": 4,  # 0.586496184234925
    "microsoft/prophetnet-large-uncased-cnndm": 7,  # 0.6478379437729287
    "SpanBERT/spanbert-base-cased": 8,  # 0.6824006863686848
    "SpanBERT/spanbert-large-cased": 17,  # 0.705352690855603
    "microsoft/xprophetnet-large-wiki100-cased": 7,  # 0.5852499775879524
    "ProsusAI/finbert": 10,  # 0.6923213940752796
    "Vamsi/T5_Paraphrase_Paws": 12,  # 0.6941611753807352
    "ramsrigouthamg/t5_paraphraser": 11,  # 0.7200917597031539
    "microsoft/deberta-v2-xlarge": 10,  # 0.7393675784473045
    "microsoft/deberta-v2-xlarge-mnli": 17,  # 0.7620620803716714
    "microsoft/deberta-v2-xxlarge": 21,  # 0.7520547670281869
    "microsoft/deberta-v2-xxlarge-mnli": 22,  # 0.7742603457742682
    "allenai/longformer-base-4096": 7,  # 0.7089559593129316
    "allenai/longformer-large-4096": 14,  # 0.732408493548181
    "allenai/longformer-large-4096-finetuned-triviaqa": 14,  # 0.7365882744744722
    "zhiheng-huang/bert-base-uncased-embedding-relative-key": 4,  # 0.5995636595368777
    "zhiheng-huang/bert-base-uncased-embedding-relative-key-query": 7,  # 0.6303599452145718
    "zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query": 19,  # 0.6896878492850327
    "google/mt5-small": 8,  # 0.6401166527273479
    "google/mt5-base": 11,  # 0.5663956536597241
    "google/mt5-large": 19,  # 0.6430931371732798
    "google/mt5-xl": 24,  # 0.6707200963021145
    "google/bigbird-roberta-base": 10,  # 0.6695606423502717
    "google/bigbird-roberta-large": 14,  # 0.6755874042374509
    "google/bigbird-base-trivia-itc": 8,  # 0.6930725491629892
    "princeton-nlp/unsup-simcse-bert-base-uncased": 10,  # 0.6703066531921142
    "princeton-nlp/unsup-simcse-bert-large-uncased": 18,  # 0.6958302800755326
    "princeton-nlp/unsup-simcse-roberta-base": 8,  # 0.6436615893535319
    "princeton-nlp/unsup-simcse-roberta-large": 13,  # 0.6812864385585965
    "princeton-nlp/sup-simcse-bert-base-uncased": 10,  # 0.7068074935240984
    "princeton-nlp/sup-simcse-bert-large-uncased": 18,  # 0.7111049471332378
    "princeton-nlp/sup-simcse-roberta-base": 10,  # 0.7253123806661946
    "princeton-nlp/sup-simcse-roberta-large": 16,  # 0.7497820277237173
    "dbmdz/bert-base-turkish-cased": 10,  # WMT18 seg en-tr 0.5522827687776142
    "dbmdz/distilbert-base-turkish-cased": 4,  # WMT18 seg en-tr 0.4742268041237113
    "google/byt5-small": 1,  # 0.5100025975052146
    "google/byt5-base": 17,  # 0.5810347173565313
    "google/byt5-large": 30,  # 0.6151895697554877
    "microsoft/deberta-v3-xsmall": 10,  # 0.6941803815412021
    "microsoft/deberta-v3-small": 4,  # 0.6651551203179679
    "microsoft/deberta-v3-base": 9,  # 0.7261586651018335
    "microsoft/mdeberta-v3-base": 10,  # 0.6778713684091584
    "microsoft/deberta-v3-large": 12,  # 0.6927693082293821
    "khalidalt/DeBERTa-v3-large-mnli": 18,  # 0.7428756686018376
}

允许一个句子和一个句子比较,或者一对多,多对多,举例来说:

# 一对一
cand=["I have an apple."]
ref=["I have a pen."]
P, R, F1 = bert_score.score(cand,ref, lang="en", verbose=True,model_type='bert-large-uncased') #tensor([0.8176]) tensor([0.8176]) tensor([0.8176])
# 一对多
# 那个数量多的要再套一层列表
# 注意 这种写法会返回三个里面分数最高的,而不是把三个都返回
cand=["I have an apple."]
ref=[["I have a pen.","I have a doll"]]
P, R, F1 = bert_score.score(cand,ref, lang="en", verbose=True,model_type='bert-large-uncased') #tensor([0.8176]) tensor([0.8176]) tensor([0.8176])
# 多对多
cand=["I have an apple.","I am Lucky."]
ref=["I have a pen.","I am Lucy."]
P, R, F1 = bert_score.score(cand,ref, lang="en", verbose=True,model_type='bert-large-uncased') #tensor([0.8176, 0.6489]) tensor([0.8176, 0.6489]) tensor([0.8176, 0.6489])

点关注!!

  • 4
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 7
    评论
可以使用Python中的自然语言处理库NLTK和相似度计算库gensim来实现文本相似度检测程序。

首先,需要安装NLTK和gensim库:

```
pip install nltk
pip install gensim
```

然后,可以使用NLTK库中的Tokenize和Stopwords模块对文本进行预处理,去除停用词和标点符号,并将文本转换为单词列表。

接着,可以使用gensim库中的Doc2Vec模块将文本转换为向量表示,然后计算两段文本向量之间的相似度。

下面是一个简单的示例程序:

```python
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# 加载停用词
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 预处理文本
def preprocess(text):
    # 分词
    tokens = word_tokenize(text.lower())
    # 去除停用词和标点符号
    tokens = [token for token in tokens if token not in stop_words and token.isalnum()]
    return tokens

# 计算文本相似度
def similarity(text1, text2):
    # 预处理文本
    tokens1 = preprocess(text1)
    tokens2 = preprocess(text2)
    # 将文本转换为TaggedDocument对象
    doc1 = TaggedDocument(tokens1, [0])
    doc2 = TaggedDocument(tokens2, [1])
    # 训练Doc2Vec模型
    model = Doc2Vec([doc1, doc2], vector_size=50, min_count=1, epochs=10)
    # 计算文本相似度
    sim = model.docvecs.similarity(0, 1)
    return sim

# 测试
text1 = "The quick brown fox jumps over the lazy dog."
text2 = "The quick brown fox jumps over the lazy cat."
sim = similarity(text1, text2)
print("文本相似度:", sim)
```

输出结果:

```
文本相似度: 0.99999994
```

可以看到,两段文本非常相似,相似度接近1。
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值