1. Background
- I have recently been working on text similarity, implemented with BERT.
- For text similarity in a general domain, there is no need to further pre-train the Chinese BERT model; in a specific domain, however, you should first continue pre-training Google's original Chinese model on a large in-domain corpus (for example with the run_pretraining.py script in Google's BERT repo).
- BERT is used to extract sentence vectors, and cosine similarity is then used to score how close two sentences are (a tiny illustration follows this list).
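Concretely, the cosine similarity of two vectors is their dot product divided by the product of their norms; values lie in [-1, 1], and closer to 1 means more similar. A tiny NumPy illustration with made-up vectors:

import numpy as np

a = np.array([0.2, 0.1, 0.4])  # hypothetical sentence vector 1
b = np.array([0.3, 0.2, 0.1])  # hypothetical sentence vector 2
cos = a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))
print(cos)  # closer to 1 = more similar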
2. Implementation
- The network model is built with Su Jianlin's bert4keras, which is simple and convenient.
Su Jianlin's bert4keras repo: https://github.com/bojone/bert4keras
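bert4keras is on PyPI, so it can be installed with pip (it runs on top of Keras/TensorFlow, which you need in your environment as well):

pip install bert4keras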
import numpy as np
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity

config_path = '../model/finetune_model/bert_config.json'
checkpoint_path = '../model/finetune_model_new/model.ckpt'
dict_path = '../model/finetune_model/vocab.txt'


def similar_count(vec1, vec2, model="cos"):
    '''
    Compute the distance between two sentence vectors.
    :param vec1: sentence vector 1
    :param vec2: sentence vector 2
    :param model: "eu" for Euclidean distance, "cos" for cosine similarity
    :return: the distance/similarity score of the two vectors
    '''
    if model == "eu":
        return euclidean_distances([vec1, vec2])[0][1]
    if model == "cos":
        return cosine_similarity([vec1, vec2])[0][1]


def main():
    # Load the (fine-tuned) BERT checkpoint; with_pool=True makes the model
    # output the pooled [CLS] vector instead of per-token vectors.
    model = build_transformer_model(
        config_path=config_path,
        checkpoint_path=checkpoint_path,
        with_pool=True,
        return_keras_model=True,
        model="bert"
    )
    tokenizer = Tokenizer(dict_path)
    stand_sent1 = "--------------------------------------------"
    stand_sent2 = "--------------------------------------------"
    # Encode each sentence into token ids and segment ids, truncated to 128 tokens
    token_ids1, segment_ids1 = tokenizer.encode(stand_sent1, maxlen=128)
    token_ids2, segment_ids2 = tokenizer.encode(stand_sent2, maxlen=128)
    # predict() returns a batch of pooled vectors; take the first (and only) one
    sentence_vec1 = model.predict([np.array([token_ids1]), np.array([segment_ids1])])[0]
    sentence_vec2 = model.predict([np.array([token_ids2]), np.array([segment_ids2])])[0]
    score = similar_count(sentence_vec1, sentence_vec2)
    print(score)


if __name__ == '__main__':
    main()
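One design note: with_pool=True gives BERT's pooled [CLS] vector (a dense + tanh layer on top of the [CLS] token). For similarity tasks, mean-pooling the token-level outputs is a commonly used alternative that is often reported to work better. A minimal sketch of that variant, reusing the imports, paths and tokenizer from the script above; the sentence is a made-up placeholder:

# Default with_pool=False: the model outputs per-token vectors, shape (batch, seq_len, hidden)
encoder = build_transformer_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path
)
token_ids, segment_ids = tokenizer.encode("some sentence", maxlen=128)
token_vecs = encoder.predict([np.array([token_ids]), np.array([segment_ids])])[0]  # (seq_len, hidden)
sentence_vec = token_vecs.mean(axis=0)  # mean pooling over all tokens

The resulting sentence_vec can be fed to similar_count() exactly like the pooled vectors above.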