BPE
BPE is one of the most widely used sub-word tokenization algorithms. Although it is greedy, it performs well and has become one of the preferred tokenization methods for mainstream NLP tasks such as machine translation.
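To make the greedy behavior concrete, here is a minimal, self-contained Python sketch of the BPE merge loop on a toy corpus. The corpus and the number of merges are illustrative only, and the plain string replace is a simplification: real implementations, including the tokenizers library used below, match on symbol boundaries.

from collections import Counter

def get_pair_counts(words):
    # Count adjacent symbol pairs across the corpus, weighted by word frequency
    pairs = Counter()
    for word, freq in words.items():
        symbols = word.split()
        for a, b in zip(symbols, symbols[1:]):
            pairs[(a, b)] += freq
    return pairs

def merge_pair(pair, words):
    # Greedily replace every occurrence of the chosen pair with its concatenation
    old, new = " ".join(pair), "".join(pair)
    return {word.replace(old, new): freq for word, freq in words.items()}

# Toy corpus: each word is a space-separated sequence of characters
words = {"l o w": 5, "l o w e r": 2, "n e w e s t": 6, "w i d e s t": 3}
for _ in range(5):
    # Pick the most frequent adjacent pair and merge it everywhere
    best = get_pair_counts(words).most_common(1)[0][0]
    words = merge_pair(best, words)
    print("merged:", best)

Each learned merge becomes a vocabulary entry, so frequent fragments like "est" end up as single tokens while rare words remain decomposable into smaller pieces.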
1. Byte-Pair Encoding Tokenizer Training
import pandas as pd
# Import gc, Python's garbage-collector interface
import gc
# Import various classes and functions from the tokenizers library, which is used for creating and using custom tokenizers
from tokenizers import (
decoders,
models,
normalizers,
pre_tokenizers,
processors,
trainers,
Tokenizer,
)
# Import PreTrainedTokenizerFast, a class for using fast tokenizers from the transformers library
from transformers import PreTrainedTokenizerFast
# Import TfidfVectorizer, a class for transforming text into TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
# Import tqdm, a library for displaying progress bars
from tqdm.auto import tqdm
# Import Dataset, a class for working with datasets in a standardized way
from datasets import Dataset
# Set the LOWERCASE flag to False
LOWERCASE = False
# Set VOCAB_SIZE to 10000000.
# This caps the vocabulary at 10 million tokens, which is effectively unbounded
# for this corpus: training stops once no useful merges remain.
VOCAB_SIZE = 10000000
# Load the test texts into a DataFrame
test = pd.read_csv('data/test_text.csv')
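For context, the snippet below sketches how the imported pieces typically fit together to train a BPE tokenizer; it is one plausible continuation rather than the notebook's exact code. It assumes test has a text column, and the special-token names and the batch size of 1000 are illustrative choices.

# Build a raw BPE tokenizer with an unknown-token fallback
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# Apply Unicode NFC normalization, lowercasing only if LOWERCASE is set
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
# Split text into initial byte-level symbols before any merges are learned
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
# Decode byte-level symbols back to readable text
raw_tokenizer.decoder = decoders.ByteLevel()

# Configure the trainer with the vocabulary cap and special tokens
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

# Wrap the DataFrame in a Dataset and stream the texts to the trainer in batches
dataset = Dataset.from_pandas(test[["text"]])

def train_corpus():
    for i in tqdm(range(0, len(dataset), 1000)):
        yield dataset[i : i + 1000]["text"]

raw_tokenizer.train_from_iterator(train_corpus(), trainer=trainer)

# Expose the trained tokenizer through the transformers fast-tokenizer interface
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
)

Streaming the corpus through a generator keeps memory use flat regardless of dataset size, which is why the texts are yielded in batches rather than passed as one list.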