def segment(src_dir):
    """Renumber the segment IDs of a Kaldi-style ``segments`` file.

    Every non-blank line of the input is expected to carry at least four
    whitespace-separated fields: ``<old-segment-id> <utt> <start> <end>``.
    The old segment ID is discarded and replaced with ``<utt>_<NNNN>``,
    where ``NNNN`` is a zero-padded counter. The counter restarts at
    ``0000`` whenever the second ``_``-separated component of ``<utt>``
    differs from that of the previous line.

    The rewritten lines (``<new-id> <utt> <start> <end>``) are appended to
    ``./segments_new`` (encoding ``UTF-8-sig``) and echoed to stdout.

    Args:
        src_dir: Path of the input segments file (a file path, despite the
            parameter name).

    Raises:
        IndexError: If a non-blank line has fewer than four fields, or if
            its ``<utt>`` field contains no ``_``.
    """
    previous_key = None  # no recording seen yet, so the first line resets the counter
    segment_id = 0

    # Context managers guarantee both files are closed (and the output is
    # flushed) even if a malformed line raises midway through.
    with open(src_dir, 'r') as src, \
            open('./segments_new', 'a', encoding='UTF-8-sig') as wf:
        for line in src:
            # Whitespace split drops the trailing newline from the last
            # field; splitting on a literal " " left it attached to `end`,
            # which produced a blank line after every record.
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines and an empty input file

            utt, start, end = fields[1], fields[2], fields[3]
            key = utt.split('_')[1]

            # A new key means a new recording: restart numbering at 0000.
            if key != previous_key:
                previous_key = key
                segment_id = 0

            segment_id_str = "{}_{}".format(utt, str(segment_id).zfill(4))
            segment_id += 1

            print(segment_id_str, utt, start, end)
            wf.write("{} {} {} {}\n".format(segment_id_str, utt, start, end))
# Script entry: renumber the `segments` file found in the current working directory.
segment(src_dir='./segments')
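# Input: the `segments` file obtained by running the Kaldi scripts on the AMI corpus audio in MDM format.
# Output: the first column is replaced with a new ID numbered from 0000; when one recording's entries end, numbering restarts at 0000 for the next recording. Columns, left to right: segment_id_str, utt, start, end.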