#-*-coding:utf-8-*-
from xpinyin import Pinyin
import glob
import codecs
import pkuseg
# Punctuation and symbol characters that the script filters out of segmented
# text. The literal begins with a space, then lists ASCII punctuation followed
# by CJK and other typographic symbols. `\]` is not a recognized escape
# sequence, so the backslash itself is part of the string as well.
symbols = u''' !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~£§°±·×÷ˇˉ—‖‘’“”‰′※、。〃々〆〇〈〉《》「」『』【】〒〓〔〕〖〗!,?¢£¥'''
def _is_symbol_token(token, symbol_chars):
    """Return True when *token* carries no content and must be dropped.

    A token is dropped when it is empty or when every one of its characters
    belongs to ``symbol_chars``. Checking character by character (instead of
    testing the whole token as a substring of the symbol string) also catches
    punctuation runs such as "?!" whose characters are not adjacent in the
    symbol list.
    """
    return all(ch in symbol_chars for ch in token)


def _build_lines(tokens, to_pinyin, symbol_chars):
    """Build the word line and the pinyin line for one transcript.

    Args:
        tokens: iterable of segmented words.
        to_pinyin: callable that maps one word to its pinyin string.
        symbol_chars: container of characters regarded as punctuation.

    Returns:
        A ``(words_line, pinyin_line)`` tuple. Both lines are space-separated
        and have surrounding spaces removed.
    """
    words = [tok for tok in tokens if not _is_symbol_token(tok, symbol_chars)]
    words_line = " ".join(words).strip(" ")
    pinyin_line = " ".join(to_pinyin(word) for word in words).strip(" ")
    return words_line, pinyin_line


if __name__ == "__main__":
    # Rewrite every transcript under ./test_std_bak in place: line 1 becomes
    # the segmented text without punctuation, line 2 the matching pinyin with
    # numeric tone marks.
    trn_paths = glob.glob(u'''./test_std_bak/*.trn.trans''')
    seg = pkuseg.pkuseg()
    p = Pinyin()
    # Build the lookup set once; membership is then O(1) per character.
    symbol_chars = frozenset(symbols)

    def _to_pinyin(word):
        """Convert one segmented word to space-separated numbered pinyin."""
        return p.get_pinyin(word, tone_marks="numbers", splitter=" ")

    for count, trn_file in enumerate(trn_paths, 1):
        print(trn_file)
        with codecs.open(trn_file, 'r', 'utf-8') as fpr:
            text_ = fpr.read()
        coe, pinyind = _build_lines(seg.cut(text_), _to_pinyin, symbol_chars)
        # The source file is overwritten, so the original text is not kept.
        with codecs.open(trn_file, 'w', 'utf-8') as fpw:
            fpw.write(coe)
            fpw.write("\n")
            fpw.write(pinyind)
        print(u"已处理第%d个文件:%s" % (count, trn_file))
# NOTE: special characters found in the stcmd dataset: u’[takcb,]