1. tokenizer.encode() 与 tokenizer.tokenize() 两个方法的区别:
(1) tokenizer.encode() 先对文本分词,再返回每个 token 在词典中的 id 列表(默认会在首尾加上 [CLS]、[SEP] 等特殊符号对应的 id)
(2) tokenizer.tokenize() 只做分词,返回 token 字符串列表,不转换为 id,也不添加特殊符号
def bert_():
    """Show the difference between ``tokenizer.encode`` and ``tokenizer.tokenize``.

    Loads the ``bert-base-chinese`` tokenizer and configuration by model name,
    loads the model weights from the local ``MODEL_PATH`` directory, then
    prints the result of ``encode`` (vocabulary ids) and ``tokenize`` (token
    strings) for the same sample text.

    Returns:
        None. The results are written to stdout.
    """
    model_name = 'bert-base-chinese'
    MODEL_PATH = 'F:/models/bert-base-chinese/'

    # a. Build the tokenizer from the model's vocabulary.
    tokenizer = BertTokenizer.from_pretrained(model_name)

    # b. Load the configuration, then enable the extra model outputs.
    model_config = BertConfig.from_pretrained(model_name)
    model_config.output_hidden_states = True
    model_config.output_attentions = True

    # Load the model from the local path using the modified configuration.
    # NOTE(review): the model is not used by the tokenizer demo below; the
    # call is kept so the function still exercises the model-loading step.
    bert_model = BertModel.from_pretrained(MODEL_PATH, config=model_config)

    # tokenizer.encode(): text -> ids in the vocabulary.
    sen_code_encode = tokenizer.encode("自然语")
    # Fixed: the original printed the undefined name `sen_code` (NameError).
    print("sen_code_encode", sen_code_encode)

    # tokenizer.tokenize(): text -> token strings.
    sen_code_tokenizer = tokenizer.tokenize("自然语")
    # Fixed: the original printed the undefined name `sen_code0` (NameError).
    print("sen_code_tokenizer", sen_code_tokenizer)
# Script entry point: run the demo only when this file is executed directly.
if "__main__" == __name__:
    bert_()