from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
print("Vocab size:", tokenizer.vocab_size)

text = "the game has gone! unaffable I have a new GPU!"
tokens = tokenizer.tokenize(text)
print("English tokenization example:", tokens)

text = "我爱北京天安门,吢吣"  # "I love Beijing Tiananmen" plus two rare characters
tokens = tokenizer.tokenize(text)
print("Chinese tokenization example:", tokens)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
print("Token-to-id conversion:", input_ids)

sen_code = tokenizer.encode_plus("i like you much", "but not him")
print("Sentence-pair encode:", sen_code)
print("Decode:", tokenizer.decode(sen_code["input_ids"]))
Output:

Vocab size: 30522
English tokenization example: ['the', 'game', 'has', 'gone', '!', 'una', '##ffa', '##ble', 'i', 'have', 'a', 'new', 'gp', '##u', '!']
Chinese tokenization example: ['我', '[UNK]', '北', '京', '天', '安', '[UNK]', ',', '[UNK]', '[UNK]']
Token-to-id conversion: [1855, 100, 1781, 1755, 1811, 1820, 100, 1989, 100, 100]
Sentence-pair encode: {'input_ids': [101, 1045, 2066, 2017, 2172, 102, 2021, 2025, 2032, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Decode: [CLS] i like you much [SEP] but not him [SEP]
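
The English example shows how WordPiece handles out-of-vocabulary words: it greedily matches the longest pieces in the 30,522-entry vocabulary, prefixing every non-initial piece with ##, so "unaffable" becomes 'una', '##ffa', '##ble'. A character that cannot be matched at all maps to [UNK] (id 100), which is exactly what happens to the rare characters in the Chinese sentence. A minimal check, assuming the slow (pure-Python) BertTokenizer, whose vocab attribute is a token-to-id dict:

# Sketch: inspect the vocab directly (slow BertTokenizer only).
print("unaffable" in tokenizer.vocab)                        # False: no whole-word entry
print("una" in tokenizer.vocab, "##ffa" in tokenizer.vocab)  # True True: covered as pieces
# Ids round-trip back to tokens; per the ids printed above, 100 is [UNK].
print(tokenizer.convert_ids_to_tokens([1855, 100]))          # expect ['我', '[UNK]']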
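
The Chinese sentence falls apart into single characters because BertTokenizer inserts spaces around every CJK character before WordPiece runs, and bert-base-uncased simply lacks most CJK characters in its vocab, hence all the [UNK]s. A sketch of the usual fix, switching to the bert-base-chinese checkpoint (it also tokenizes character by character, and very rare characters such as 吢/吣 may still come out as [UNK]):

zh_tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
print(zh_tokenizer.tokenize("我爱北京天安门,吢吣"))  # far fewer [UNK]s expected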
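
In the encode_plus result, input_ids is the sentence pair wrapped as [CLS] ... [SEP] ... [SEP], token_type_ids marks which sentence each position belongs to (0 for the first, 1 for the second), and attention_mask flags real tokens as opposed to padding. For fixed-length batches, encode_plus also accepts padding and truncation arguments; a minimal sketch, assuming a recent transformers version with PyTorch installed:

# Sketch: pad/truncate to a fixed length and get tensors back.
sen_code = tokenizer.encode_plus(
    "i like you much", "but not him",
    max_length=16,
    padding="max_length",  # pad with [PAD] (id 0) up to max_length
    truncation=True,       # cut longer inputs down to max_length
    return_tensors="pt",   # PyTorch tensors instead of Python lists
)
print(sen_code["input_ids"].shape)  # torch.Size([1, 16])

In current transformers versions, calling tokenizer(...) directly takes the same arguments and is the preferred equivalent of encode_plus.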