Notes on using the PyTorch version of BERT (transformers)

import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize input
text = "[CLS] Nonrenewable resources are mined out of the ground [SEP]"
tokenized_text = tokenizer.tokenize(text)

# encode = tokenizer.encode(text, add_special_tokens=False)   # equivalent here to convert_tokens_to_ids(tokenizer.tokenize(text)), since text already contains [CLS]/[SEP]
# print(encode)
print('tokenized_text: ', tokenized_text)   # ['[CLS]', 'non', '##ren', '##ew', '##able', 'resources', 'are', 'mined', 'out', 'of', 'the', 'ground', '[SEP]']
print('type(tokenized_text): ', type(tokenized_text))   # <class 'list'>
print('len(tokenized_text): ', len(tokenized_text)) # 13
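
# Hedged side note: on transformers v3+ the tokenizer is directly callable and
# performs tokenization, ID conversion, special-token insertion, and tensor
# creation in one step (one_call below is just an illustrative variable name):
one_call = tokenizer("Nonrenewable resources are mined out of the ground", return_tensors='pt')
print(one_call['input_ids'].shape)   # torch.Size([1, 13]) -- [CLS]/[SEP] added automatically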

# Convert tokens to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print('indexed_tokens: ', indexed_tokens)   # [101, 2512, 7389, 7974, 3085, 4219, 2024, 21846, 2041, 1997, 1996, 2598, 102]
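
# Optional sanity-check sketch: convert_ids_to_tokens inverts the mapping above,
# recovering the original token list from the IDs.
recovered_tokens = tokenizer.convert_ids_to_tokens(indexed_tokens)
assert recovered_tokens == tokenized_text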

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])  # wrap in [] (or call .unsqueeze(0)) to add a batch dimension of size 1; otherwise the shape is [13] instead of [1, 13]
print('tokens_tensor: ', tokens_tensor) # tensor([[  101,  2512,  7389,  7974,  3085,  4219,  2024, 21846,  2041,  1997, 1996,  2598,   102]])
print('tokens_tensor.shape: ', tokens_tensor.shape) # torch.Size([1, 13])
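
# Equivalent way to add the batch dimension, as noted in the comment above
# (tokens_tensor_alt is just an illustrative name):
tokens_tensor_alt = torch.tensor(indexed_tokens).unsqueeze(0)
assert tokens_tensor_alt.shape == tokens_tensor.shape   # both torch.Size([1, 13])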
print('=====================================================')

# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model in evaluation mode to deactivate the DropOut modules
# This is IMPORTANT to have reproducible results during evaluation!
model.eval()
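
# Hedged side note: attention_mask is omitted in the forward pass below because
# this is a single unpadded sequence. When batching padded inputs you would pass
# one, e.g. (sketch only, not executed here):
# attention_mask = torch.ones_like(tokens_tensor)   # 1 = real token, 0 = padding
# outputs = model(tokens_tensor, attention_mask=attention_mask)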

# Predict hidden states features for each layer
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor)
    # Older transformers versions return a plain tuple; v4+ returns a ModelOutput
    # object that still supports tuple-style indexing, so outputs[0] / outputs[1]
    # below work either way. See the model's docstring for the detail of all the outputs.
    # In our case, the first element is the hidden states of the last layer of the BERT model.
    print('len(outputs): ', len(outputs))   # 2
    print('type(outputs): ', type(outputs)) # <class 'tuple'> on older versions; a ModelOutput subclass on v4+
    # print('outputs: ', outputs)
    encoded_layers = outputs[0] # per-token hidden states, shape [1, 13, 768]
    sentence_vec = outputs[1]   # pooled sentence vector, shape [1, 768]
    print('encoded_layers.shape: ', encoded_layers.shape)   # torch.Size([1, 13, 768]): batch size 1, 13 tokens, hidden size 768
    print(encoded_layers)
    print(encoded_layers[0][0].shape)      # torch.Size([768])
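    # The [CLS] hidden state (encoded_layers[0][0]) is related to sentence_vec
    # but not identical: BERT's pooler applies a Linear layer + tanh to it.
    # cls_vec is an illustrative name for this sketch.
    cls_vec = encoded_layers[0, 0]
    print(torch.equal(cls_vec, sentence_vec[0]))   # False: the pooler transforms the [CLS] vector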

    # print('encoded_layers: ', encoded_layers)
    print('sentence_vec.shape: ', sentence_vec.shape)   # torch.Size([1, 768]): one 768-dim pooled vector for the one input sentence
    # print('sentence_vec: ', sentence_vec)
# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
# assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
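
# Hedged follow-up sketch: to inspect every layer's hidden states (not just the
# last), reload the model with output_hidden_states=True. Index 2 of the output
# holds the hidden-states tuple on both the old (tuple) and new (ModelOutput)
# APIs; model_all / all_outputs are illustrative names.
model_all = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model_all.eval()
with torch.no_grad():
    all_outputs = model_all(tokens_tensor)
    hidden_states = all_outputs[2]        # tuple of 13 tensors: embedding layer + 12 encoder layers
    print(len(hidden_states))             # 13
    print(hidden_states[-1].shape)        # torch.Size([1, 13, 768]), equal to encoded_layers above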