转载保存:
使用pytorch获取bert词向量_海蓝时见鲸_的博客-CSDN博客_获取bert词向量
pytorch-pretrained-bert简单使用_风吹草地现牛羊的马的博客-CSDN博客_pretrained pytorch
我的实现
源码:
from pytorch_pretrained_bert import BertModel, BertTokenizer
import numpy as np
import torch  # fix: torch.zeros/torch.tensor/torch.no_grad were used below but torch was never imported

# Load the BERT tokenizer (reads the vocabulary file).
tokenizer = BertTokenizer.from_pretrained(
    '/checkpoint_models/bert_checkpoint/bert-base-uncased-vocab.txt'
)
# Load the BERT model; this directory must contain the bert_config.json
# config file and the model weights (.bin) file.
bert = BertModel.from_pretrained(
    '/checkpoint_models/bert_checkpoint/bert-base-uncased/'
)
bert.eval()  # fix: disable dropout — this script only does inference

# NOTE(review): `args.batch_size` and `real_label` are defined elsewhere in
# the surrounding script; this snippet assumes both are already in scope.
# One 768-d embedding slot per label in the batch (768 = BERT-base hidden size).
tensor_label = torch.zeros([args.batch_size, 768])
print('tensor_label ori', tensor_label)
for i in range(args.batch_size):
    # Tokenize the label after converting it to str.
    label_tokens = tokenizer.tokenize(str(real_label[i]))
    # Map tokens to vocabulary ids; wrapped in a list -> shape [1, token_num].
    indexed_tokens = torch.tensor(
        [tokenizer.convert_tokens_to_ids(label_tokens)])
    # Inference only — skip gradient tracking to save memory/compute.
    with torch.no_grad():
        # output_all_encoded_layers=False returns only the last encoder
        # layer; [0] selects the encoded output, the second [0] the single
        # batch element -> torch.Size([token_num, 768]).
        label_emb = bert(
            indexed_tokens,
            output_all_encoded_layers=False)[0][0]
    # Following BERT practice for multi-token inputs: sum token features.
    label_emb = label_emb.sum(dim=0)  # [token_num, 768] -> [768]
    tensor_label[i] += label_emb  # row starts at zero, so += stores the embedding
print('tensor_label then', tensor_label)
输出:
tensor_label ori tensor([[0., 0., 0., ..., 0., 0., 0