from transformers import BertTokenizer
# "uncased" checkpoint: it does not distinguish upper from lower case, so the
# input text is lowercased during tokenization (do_lower_case=True).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Encode a single sentence by calling the tokenizer directly. `encode_plus` is
# deprecated in favor of `__call__`, which accepts the same keyword arguments.
# NOTE(review): assumes `sent` is one `str` defined elsewhere; `__call__` would
# treat a list of strings as a batch of sentences -- confirm against the caller.
encoded_dict = tokenizer(
    sent,                        # Sentence to encode.
    add_special_tokens=True,     # Add '[CLS]' and '[SEP]'.
    max_length=100,              # Length used for both padding and truncation.
    padding='max_length',        # Pad shorter inputs up to max_length.
    truncation=True,             # Truncate longer inputs down to max_length.
    return_attention_mask=True,  # Also return the attention mask.
    return_tensors='pt',         # Return PyTorch tensors.
)
# Tokenizer: converting ids to tokens and tokens to ids
# AutoTokenizer is not imported at the top of this file (only BertTokenizer
# is), so bring it into scope here to avoid a NameError.
from transformers import AutoTokenizer

# Load the tokenizer for the "bert-base-chinese" checkpoint. This rebinds
# `tokenizer`, replacing the 'bert-base-uncased' tokenizer created earlier.
tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
# Bare expression: echoes the tokenizer's repr in a notebook/REPL session and
# has no effect when the file is run as a plain script.
tokenizer