Video reference:
BERT text similarity for Chinese, English and Japanese - hands-on code walkthrough (Bilibili)
Code:
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import euclidean_distances  # Euclidean distance
from sklearn.metrics.pairwise import cosine_similarity  # cosine similarity

model_class, tokenizer_class, pretrained_weights = (BertModel, BertTokenizer, 'bert-base-uncased')
# model class, tokenizer class, name of the pretrained weights / vocabulary
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)  # build the tokenizer
bert_model = model_class.from_pretrained(pretrained_weights)  # build the model
def similar_count(vec1, vec2, model="cos"):
    '''
    Compute the score between two sentence vectors.
    :param vec1: sentence vector 1
    :param vec2: sentence vector 2
    :param model: "eu" for Euclidean distance, "cos" for cosine similarity
    :return: the distance / similarity score of the two vectors
    '''
    if model == "eu":
        return euclidean_distances([vec1, vec2])[0][1]
    if model == "cos":
        return cosine_similarity([vec1, vec2])[0][1]
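
# Quick sanity check of similar_count on two hand-made vectors (an assumed example,
# not from the original video): orthogonal unit vectors have cosine similarity 0
# and Euclidean distance sqrt(2).
print(similar_count([1.0, 0.0], [0.0, 1.0], model="cos"))  # ~0.0
print(similar_count([1.0, 0.0], [0.0, 1.0], model="eu"))   # ~1.414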

def bert_vec(text):
    # text = "但是你不喜欢我魍魉"  # Chinese example sentence used in the video
    marked_text = "[CLS] " + text + " [SEP]"
    print(marked_text)
    tokenized_text = tokenizer.tokenize(marked_text)
    # tokenize with the BERT tokenizer
    print(tokenized_text)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)  # map the tokens to their vocabulary ids
    print(indexed_tokens)
    # for tup in zip(tokenized_text, indexed_tokens):
    #     print(tup)
    # The character 魉 is not in the BERT vocabulary, so it is mapped to [UNK].
    batch_tokenized = tokenizer.batch_encode_plus([text], padding=True, truncation=True, max_length=20)
    # max_length=20: longer sequences are truncated, shorter ones are padded up to the longest sentence in the batch.
    # 1. encode returns only input_ids
    # 2. encode_plus returns the full encoding, namely:
    #    'input_ids': the vocabulary ids of the tokens
    #    'token_type_ids': distinguishes the two sentences (first sentence all 0s, second all 1s)
    #    'attention_mask': marks which positions take part in self-attention (1) and which are padding (0)
    # (see the short demo right after this function)
    print(batch_tokenized)
    input_ids = torch.tensor(batch_tokenized['input_ids'])
    attention_mask = torch.tensor(batch_tokenized['attention_mask'])
    bert_output = bert_model(input_ids, attention_mask=attention_mask)
    print(bert_output[0].shape)  # (batch_size, seq_len, 768): last hidden states
    bert_cls_hidden_state = bert_output[0][:, 0, :]  # hidden state at the [CLS] position as the sentence vector
    print(bert_cls_hidden_state.shape)
    return bert_cls_hidden_state[0].detach().numpy()
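
# Short demo of encode vs encode_plus mentioned above (an assumed example, not from the
# original video). Both add [CLS] (id 101) and [SEP] (id 102) automatically; encode returns
# only the id list, while encode_plus returns the full dictionary.
demo_ids = tokenizer.encode("hello world")
demo_enc = tokenizer.encode_plus("hello world")
print(demo_ids)         # e.g. [101, ..., 102]
print(demo_enc.keys())  # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])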

vec1 = bert_vec('Pretrained model on English language using a masked language modeling (MLM) objective')
vec2 = bert_vec('BERT is a transformers model pretrained on a large corpus of English data in a self-supervised fashion')
vec3 = bert_vec('hello world')
print(similar_count(vec1, vec2, model="cos"))  # two related sentences about BERT
print(similar_count(vec1, vec3, model="cos"))  # an unrelated short sentence
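
As a possible extension (my own sketch, not from the original video), the sentence vector can also be taken as the mean of all token hidden states instead of only the [CLS] vector; bert_vec_mean below is a hypothetical helper, and the rest of the pipeline stays unchanged:

def bert_vec_mean(text):
    # mean pooling over the non-padding tokens, an alternative to the [CLS] vector (assumed variant)
    enc = tokenizer.batch_encode_plus([text], padding=True, truncation=True, max_length=20)
    input_ids = torch.tensor(enc['input_ids'])
    attention_mask = torch.tensor(enc['attention_mask'])
    with torch.no_grad():
        output = bert_model(input_ids, attention_mask=attention_mask)
    hidden = output[0]                                      # (1, seq_len, 768)
    mask = attention_mask.unsqueeze(-1).float()             # (1, seq_len, 1)
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1)   # average over valid positions only
    return pooled[0].numpy()

print(similar_count(bert_vec_mean('hello world'), bert_vec_mean('hi there'), model="cos"))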