1、安装tokenizers
pip install tokenizers
2、特殊字符的文件special.txt
[PAD]
[unused1]
[unused2]
[unused3]
[unused4]
[unused5]
[unused6]
[unused7]
[unused8]
[unused9]
[unused10]
[unused11]
[unused12]
[unused13]
[unused14]
[unused15]
[unused16]
[unused17]
[unused18]
[unused19]
[unused20]
[unused21]
[unused22]
[unused23]
[unused24]
[unused25]
[unused26]
[unused27]
[unused28]
[unused29]
[unused30]
[unused31]
[unused32]
[unused33]
[unused34]
[unused35]
[unused36]
[unused37]
[unused38]
[unused39]
[unused40]
[unused41]
[unused42]
[unused43]
[unused44]
[unused45]
[unused46]
[unused47]
[unused48]
[unused49]
[unused50]
[unused51]
[unused52]
[unused53]
[unused54]
[unused55]
[unused56]
[unused57]
[unused58]
[unused59]
[unused60]
[unused61]
[unused62]
[unused63]
[unused64]
[unused65]
[unused66]
[unused67]
[unused68]
[unused69]
[unused70]
[unused71]
[unused72]
[unused73]
[unused74]
[unused75]
[unused76]
[unused77]
[unused78]
[unused79]
[unused80]
[unused81]
[unused82]
[unused83]
[unused84]
[unused85]
[unused86]
[unused87]
[unused88]
[unused89]
[unused90]
[unused91]
[unused92]
[unused93]
[unused94]
[unused95]
[unused96]
[unused97]
[unused98]
[unused99]
[UNK]
[CLS]
[SEP]
[MASK]
3、使用BertWordPieceTokenizer进行分词
# Build a BERT WordPiece vocabulary from the corpus files result1.txt..result7.txt,
# seeding it with the special tokens listed in special.txt, and write the
# resulting vocabulary (one token per line, ordered by token id) to myvocab.txt.
from tokenizers import BertWordPieceTokenizer

# Read the special-token list. rstrip('\n') — rather than slicing off the
# last character with [:-1] — is safe even when the final line has no
# trailing newline, and `with` guarantees the handle is closed.
with open('special.txt', 'r', encoding='utf-8') as f:
    special_list = [line.rstrip('\n') for line in f]
print(special_list)

# Train the WordPiece tokenizer on the corpus files.
tokenizer = BertWordPieceTokenizer()
tokenizer.train(
    ['result1.txt', 'result2.txt', 'result3.txt', 'result4.txt',
     'result5.txt', 'result6.txt', 'result7.txt'],
    special_tokens=special_list,
)

# get_vocab() maps token -> id; sort by id so the written file keeps the
# "line number == token id" layout that BERT vocab files expect.
vocab = tokenizer.get_vocab()
after_list = sorted(vocab.items(), key=lambda item: item[1])
print(after_list)

# Write one token per line to the vocabulary file; `with` closes/flushes it.
with open('myvocab.txt', 'w', encoding='utf-8') as inputfile:
    for word, _ in after_list:
        inputfile.write(word)
        inputfile.write('\n')