import torch
from transformers import AutoTokenizer
# Basic operations on a single English sentence (for Chinese word segmentation, you can find the HIT (哈工大) model on GitHub)
# Tokenize with bert-base-uncased + convert tokens to ids + encode (prepare inputs for the model)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# tokenizer = AutoTokenizer.from_pretrained("albert-base-v1")
tokens = tokenizer.tokenize("Sometimes it lasts in love, sometimes it hurts instead.")  # split the sentence into subword tokens
input_ids = tokenizer.convert_tokens_to_ids(tokens)  # map the tokens to vocabulary ids
final_inputs = tokenizer.prepare_for_model(input_ids)  # add special tokens and build the inputs the model expects (Transformer training also needs an attention mask)
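# The three steps above are exactly what calling the tokenizer directly does;
# a minimal sanity check (using the bert-base-uncased tokenizer loaded above):
encoded = tokenizer("Sometimes it lasts in love, sometimes it hurts instead.")
assert encoded["input_ids"] == final_inputs["input_ids"]  # same ids, [CLS]/[SEP] included
print(list(encoded.keys()))  # ['input_ids', 'token_type_ids', 'attention_mask']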
# Decoding
# Decode with bert-base-uncased --> [CLS] should i give up, or should i just keep chasing pavement, even if it leads nowhere. [SEP]
inputs = tokenizer("Should I give up, or should I just keep chasing pavement, even if it leads nowhere.")
sentence = tokenizer.decode(inputs["input_ids"])  # decoded sentence; the tokenizer output contains input_ids, token_type_ids, and attention_mask
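# decode() keeps special tokens by default; skip_special_tokens=True strips
# [CLS]/[SEP] from the output. A small sketch with the same inputs as above:
clean = tokenizer.decode(inputs["input_ids"], skip_special_tokens=True)
print(clean)  # should i give up, or should i just keep chasing pavement, even if it leads nowhere.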
# Decode with roberta-base --> <s>Should I give up, or should I just keep chasing pavement, even if it leads nowhere.</s>
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
inputs = tokenizer("Should I give up, or should I just keep chasing pavement, even if it leads nowhere.")
sentence2 = tokenizer.decode(inputs["input_ids"])
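# The different wrappers ([CLS]/[SEP] vs. <s>/</s>) come from each tokenizer's
# special tokens; a quick way to inspect them (tokenizer is roberta-base here):
print(tokenizer.bos_token, tokenizer.eos_token)  # <s> </s>
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
print(bert_tokenizer.cls_token, bert_tokenizer.sep_token)  # [CLS] [SEP]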