import torch
from transformers import BertTokenizer, BertModel
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Tokenize input
text = "[CLS] Nonrenewable resources are mined out of the ground [SEP]"
tokenized_text = tokenizer.tokenize(text)
# encode = tokenizer.encode(text, add_special_tokens=False)  # equivalent to tokenize() followed by convert_tokens_to_ids()
# print(encode)
print('tokenized_text: ', tokenized_text) # ['[CLS]', 'non', '##ren', '##ew', '##able', 'resources', 'are', 'mined', 'out', 'of', 'the', 'ground', '[SEP]']
print('type(tokenized_text): ', type(tokenized_text)) # <class 'list'>
print('len(tokenized_text): ', len(tokenized_text)) # 13
# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print('indexed_tokens: ', indexed_tokens) # [101, 2512, 7389, 7974, 3085, 4219, 2024, 21846, 2041, 1997, 1996, 2598, 102]
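# Sanity check (a sketch added here, not in the original): encode() with
# add_special_tokens=False should reproduce indexed_tokens, since encode()
# is tokenize() followed by convert_tokens_to_ids(). add_special_tokens is
# False because the text above already contains [CLS] and [SEP].
assert tokenizer.encode(text, add_special_tokens=False) == indexed_tokens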
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens]) # wrap in [] (or call unsqueeze(0)) to add a batch dimension of size 1; otherwise the shape is [13] rather than [1, 13]
print('tokens_tensor: ', tokens_tensor) # tensor([[ 101, 2512, 7389, 7974, 3085, 4219, 2024, 21846, 2041, 1997, 1996, 2598, 102]])
print('tokens_tensor.shape: ', tokens_tensor.shape) # torch.Size([1, 13])
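# Equivalent way (a sketch) to add the batch dimension mentioned above:
# build the 1-D tensor of shape [13] first, then unsqueeze(0) to get [1, 13].
alt_tokens_tensor = torch.tensor(indexed_tokens).unsqueeze(0)
assert alt_tokens_tensor.shape == tokens_tensor.shape  # torch.Size([1, 13])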
print('=====================================================')
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')
# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()
# Predict hidden states features for each layer
with torch.no_grad():
    # See the model's docstring for details of the inputs
    outputs = model(tokens_tensor)
# Transformers models return tuples here (but see the version note below).
# See the model's docstring for details of all the outputs.
# In our case, the first element is the last-layer hidden states of the BERT model.
print('len(outputs): ', len(outputs)) # 2
print('type(outputs): ', type(outputs)) # <class 'tuple'>
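# Version note (an assumption about your installed library, not from the
# original): in transformers >= 4.0 the model returns a ModelOutput object
# by default, so type(outputs) will not print 'tuple' (indexing with
# outputs[0] still works). Pass return_dict=False, i.e.
# model(tokens_tensor, return_dict=False), to get the plain-tuple behavior
# shown here, or read the fields by name: outputs.last_hidden_state and
# outputs.pooler_output.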
# print('outputs: ', outputs)
encoded_layers = outputs[0] # token-level vectors, [1, 13, 768]
sentence_vec = outputs[1] # pooled sentence vector, [1, 768]
print('encoded_layers.shape: ', encoded_layers.shape) # torch.Size([1, 13, 768]): batch size 1, 13 tokens, hidden size 768
print(encoded_layers)
print(encoded_layers[0][0].shape) # torch.Size([768])
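# Clarifying note (added): encoded_layers[0][0] is the last-layer hidden
# state of the first token, i.e. [CLS]; the pooled sentence_vec below is
# derived from this vector via BERT's learned Linear + tanh pooler layer,
# so the two are related but not identical.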
# print('encoded_layers: ', encoded_layers)
print('sentence_vec.shape: ', sentence_vec.shape) # torch.Size([1, 768]): one pooled vector for the single sentence in the batch
# print('sentence_vec: ', sentence_vec)
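# Alternative sentence embedding (a common sketch, not from the original):
# mean-pool the last-layer token vectors instead of using the pooled output.
mean_sentence_vec = encoded_layers.mean(dim=1)
print('mean_sentence_vec.shape: ', mean_sentence_vec.shape)  # torch.Size([1, 768])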
# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
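# Modern one-call equivalent (a sketch, assuming transformers >= 3.0): the
# tokenizer call builds the tensors and adds [CLS]/[SEP] itself, so it
# starts from the raw sentence rather than the manually wrapped text above.
raw_text = "Nonrenewable resources are mined out of the ground"
inputs = tokenizer(raw_text, return_tensors='pt')
with torch.no_grad():
    out = model(**inputs)
print(out[0].shape)  # torch.Size([1, 13, 768]), same as encoded_layers above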