1.安装transformers
# Install the HuggingFace transformers library (provides BertTokenizer below)
pip install transformers
2.加载预训练词典和分词方法
# Load the pretrained tokenizer (vocabulary + tokenization rules) for
# Chinese BERT from the HuggingFace hub; default cache dir, no forced re-download.
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(
pretrained_model_name_or_path='bert-base-chinese',
cache_dir=None,
force_download=False,
)
# NOTE(review): this second call rebinds `tokenizer`, replacing the object
# loaded above with one built from a local vocab file — the downloaded
# tokenizer is discarded. Presumably both calls are shown for demonstration;
# confirm which source is intended.
tokenizer = BertTokenizer.from_pretrained('./bert-base-chinese/BertTokenizer/vocab.txt')
# Inspect the vocabulary: a token -> id mapping; the bare len() shows its
# size when run interactively (notebook-style).
dictionary = tokenizer.get_vocab()
len(dictionary)
3.句子编码
3.1基本编码函数
# Sample Chinese sentences used by the encoding examples below.
sents = [
    '自然语言处理',
    '第三方工具包',
    '开发的应用在青少年中颇受欢迎',
]
# Encode a sentence pair into a single list of token ids with the basic
# encode() API: add [CLS]/[SEP] specials, truncate and pad to a fixed
# length of 30, and return a plain Python list (no tensors).
token_ids = tokenizer.encode(
    text=sents[0],
    text_pair=sents[1],
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=30,
    return_tensors=None,
)
print(token_ids)
# Map the ids back to text to inspect the encoded result.
tokenizer.decode(token_ids)
3.2增强编码函数
out = tokenizer.encode_plus(
text = sents[0],
text_pair = sents[1],
truncation = True,
padding = 'max_length',
max_length = 30,
add_special_tokens = True,
return_tensors = None,
return_token_typ