# HuggingFace NLP tutorial
# https://huggingface.co/learn/nlp-course/
# Tokenizer
from transformers import BertTokenizer
# Load the tokenizer associated with the named pretrained checkpoint.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# NOTE(review): `str` shadows the builtin type from this point on. The name is
# kept because code outside this section may still refer to it; consider
# renaming it (e.g. to `text`) once all usages are confirmed.
str = "What a day for a daydream!"

# Calling the tokenizer directly returns a mapping; its "input_ids" entry is
# used below.
tokens_dict = tokenizer(str)

# Manual two-step path: split the text into tokens, then look up their ids.
tokens = tokenizer.tokenize(str)
tokenizer.convert_tokens_to_ids(tokens)

# Map the ids from the direct call back to tokens and to a decoded string.
tokenizer.convert_ids_to_tokens(tokens_dict["input_ids"])
tokenizer.decode(tokens_dict["input_ids"])

# Inspect the vocabulary (get_vocab must be called, not just referenced) and
# the unknown-token placeholder with its id.
tokenizer.get_vocab()
tokenizer.unk_token
tokenizer.unk_token_id