搜了半天的 BERT 文本对齐（padding）方法，
发现还不如 Hugging Face 的 transformers 里自带的方法好用。
from transformers import BertTokenizer

# Load the pretrained cased BERT tokenizer from the Hugging Face hub.
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

# Two example sequences of different lengths to demonstrate batch padding.
sequence_a = "This is a short sequence."
sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

# Batch-encode both sequences; padding=True pads the shorter one up to the
# length of the longest sequence in the batch.
padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)