from praatio import tgio
from pydub import AudioSegment
import os
import glob
def create_word_ctm(dataset_root="datasets/BBSpeech-1.0"):
    """Load word/phone transcripts and per-token frame durations for a dataset.

    Reads ``metadata.csv`` (pipe-separated ``name|_|phones|words`` rows) and
    ``duration.txt`` (one comma-separated list of integer durations per line).
    Duration rows are paired with the sorted ``mels/*.npy`` filenames purely
    by position — this assumes duration.txt was written in the same sorted
    order as the mel files.

    Args:
        dataset_root: Dataset root directory. Defaults to the original
            hard-coded BBSpeech-1.0 path for backward compatibility.

    Returns:
        Tuple ``(durans, words, phones)``:
            durans: utterance id -> list[int] of frame durations.
            words:  utterance id -> list[str] of words.
            phones: utterance id -> raw phone string (not split).
    """
    words = {}
    phones = {}
    with open(os.path.join(dataset_root, "metadata.csv"), "r") as fp:
        for line in fp:
            name, _, phone_str, word_str = line.strip().split("|")
            words[name] = word_str.split(" ")
            phones[name] = phone_str

    with open(os.path.join(dataset_root, "duration.txt"), "r") as fp:
        duration_lines = fp.readlines()

    # Pair duration rows with mel files by sorted position; "[:-4]" strips
    # the ".npy" extension to recover the utterance id.
    mel_paths = sorted(glob.glob(os.path.join(dataset_root, "mels", "*.npy")))
    durans = {
        os.path.basename(mel_paths[i])[:-4]: [int(x) for x in line.split(",")]
        for i, line in enumerate(duration_lines)
    }
    return durans, words, phones
if __name__ == '__main__':
    # Build word- and phone-level CTM-style interval tables, rescale the
    # integer frame durations to real audio time, then write one Praat
    # TextGrid per utterance.
    durans, words, phones = create_word_ctm()
    word_ctm = {}   # "wavs-<id>" -> {'wavs': [[start, end, word], ...]}
    phone_ctm = {}  # "wavs-<id>" -> {'wavs': [[start, end, phone], ...]}
    wav_durs = {}   # <id> -> audio length in seconds

    for name in words:
        if name not in durans:
            continue
        # Real audio length; durations are rescaled so their sum matches it.
        wav_dur = AudioSegment.from_wav(
            os.path.join("datasets/BBSpeech-1.0/wavs", name + ".wav")
        ).duration_seconds
        wav_durs[name] = wav_dur

        ws = words[name]
        phs = phones[name]
        sum_d = sum(durans[name])
        key = "wavs-" + name
        word_ctm[key] = {'wavs': []}
        phone_ctm[key] = {'wavs': []}

        # Word tier: each word consumes len(word) duration entries, i.e.
        # durations appear to be stored per character — TODO confirm.
        tok_idx = 0
        word_start = 0.0
        for w in ws:
            n_tok = len(w)
            d_sum = sum(durans[name][tok_idx:tok_idx + n_tok])
            tok_idx += n_tok
            dur_sec = d_sum * wav_dur / sum_d
            start = round(word_start, 2)
            end = round(word_start + dur_sec, 2)
            if start >= end:
                end += 0.01  # avoid zero/negative-length interval after rounding
            word_ctm[key]['wavs'].append([start, end, w])
            word_start += dur_sec

        # Phone tier: one duration entry per CHARACTER of the phone string
        # (NOTE(review): iterates characters, not space-split phones; this
        # mirrors the word tier's per-character accounting — confirm intent).
        phone_start = 0.0
        for i, p in enumerate(phs):
            dur_sec = durans[name][i] * wav_dur / sum_d
            start = round(phone_start, 2)
            end = round(phone_start + dur_sec, 2)
            if start >= end:
                end += 0.01
            phone_ctm[key]['wavs'].append([start, end, p])
            phone_start += dur_sec

    frame_shift = 0.01  # tolerance (s) when snapping final boundaries
    out_directory = 'datasets/BBSpeech-1.0/textgrids_p'
    os.makedirs(out_directory, exist_ok=True)

    # word_ctm looks like:
    # {'wavs-000001': {'wavs': [[0.14, 0.31, 'ka2'], [0.31, 0.46, 'er2'], ...]}, ...}
    for k, v in sorted(word_ctm.items()):
        max_time = round(wav_durs[k.split("-")[1]], 2)
        speaker = list(v.keys())[0]       # always 'wavs'
        intervals = list(v.values())[0]   # the [[start, end, label], ...] list
        tg = tgio.Textgrid()
        word_intervals = []
        phone_intervals = []
        try:
            tg.minTimestamp = 0
            tg.maxTimestamp = max_time
            phone_tier_len = len(phone_ctm[k][speaker])
            for interval in intervals:
                # Snap the last word boundary to the audio end to absorb
                # rounding drift.
                if max_time - interval[1] < frame_shift:
                    interval[1] = max_time
                word_intervals.append(interval)
            for j, interval in enumerate(phone_ctm[k][speaker]):
                if j == phone_tier_len - 1:
                    interval[1] = max_time  # sync last phone to audio end
                phone_intervals.append(interval)
            tg.addTier(tgio.IntervalTier('words', word_intervals, minT=0, maxT=max_time))
            tg.addTier(tgio.IntervalTier('phones', phone_intervals, minT=0, maxT=max_time))
            # Prosodylab speaker-prefix stripping inherited from MFA-style
            # code: k (e.g. "wavs-000001") never contains '_', so this branch
            # is effectively dead; kept for parity with the original.
            if k.startswith(speaker) and speaker in k.split('_')[1:]:
                k = '_'.join(k.split('_')[1:])
            tg.save(os.path.join(out_directory, k + '.TextGrid'), useShortForm=False)
        except Exception as e:
            # Don't abort the whole batch on one bad utterance; dump the
            # offending phone tier for debugging.
            print(phone_intervals)
            print("failed on %s: %s" % (k, e))
# textgrid — first published 2021-09-10 17:00:49 (blog scrape residue, kept as a comment)