from transformers import BertTokenizer, BertModel
import torch
# Demo: tokenize a batch of Chinese sentences with padding/truncation and
# run them through a pretrained BERT encoder.
tokenizer = BertTokenizer.from_pretrained("hfl/chinese-bert-wwm-ext")
model = BertModel.from_pretrained("hfl/chinese-bert-wwm-ext")
model.eval()  # inference only: disable dropout

batch_sentence = [
    "这是第一句话",
    "这是第二句话",
    "第三句,有点长了"
]

# padding=True pads every sentence to the longest in the batch;
# truncation + max_length=10 caps the sequence length at 10 tokens.
token_tensor = tokenizer(batch_sentence, padding=True, truncation=True,
                         max_length=10, return_tensors='pt')
print(token_tensor)
print(token_tensor["input_ids"].shape)

# BUG FIX: the original passed only input_ids, so the model attended to the
# [PAD] tokens as if they were real content (the shorter sentences are
# zero-padded, as the printed input_ids show). Unpacking the full encoding
# also supplies attention_mask (and token_type_ids) so padding is masked out.
with torch.no_grad():  # forward-only demo: skip gradient bookkeeping
    output = model(**token_tensor)

print(output[0].shape)  # last_hidden_state: (batch, seq_len, hidden_size)
print(output[1].shape)  # pooler_output:     (batch, hidden_size)
结果:
{'input_ids': tensor([[ 101, 6821, 3221, 5018, 671, 1368, 6413, 102, 0, 0],
[ 101, 6821, 3221, 5018, 753, 1368, 6413, 102, 0, 0],
[ 101, 5018, 676, 1368, 8024, 3300, 4157, 7270, 749, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0,