textgrid

from praatio import tgio
from pydub import AudioSegment
import os
import glob
def create_word_ctm():
    """Load durations, words and phone strings for the BBSpeech-1.0 dataset.

    ``metadata.csv`` is read as four ``|``-separated fields per line:
    utterance name, an ignored field, the phone string, and the
    space-separated words.  ``duration.txt`` is read as one comma-separated
    list of integers per line.  The i-th non-blank duration line is paired
    with the i-th ``mels/*.npy`` file in sorted order and keyed by that
    file's name without the ``.npy`` suffix.

    Blank lines in either file are ignored.

    Returns:
        A ``(durans, words, phones)`` tuple of dicts:
        ``durans`` maps mel-file stem -> list of int durations,
        ``words`` maps utterance name -> list of words,
        ``phones`` maps utterance name -> phone string (unsplit).

    Raises:
        IndexError: if ``duration.txt`` has more non-blank lines than there
            are mel files to pair them with.
        ValueError: if a metadata line does not have exactly four fields or
            a duration entry is not an integer.
    """
    words = {}
    phones = {}
    with open("datasets/BBSpeech-1.0/metadata.csv", 'r') as fp:
        for raw in fp:
            line = raw.strip()
            if not line:
                # A stray blank line would otherwise fail the 4-way unpack.
                continue
            name, _, phone_text, word_text = line.split("|")
            words[name] = word_text.split(" ")
            phones[name] = phone_text

    with open("datasets/BBSpeech-1.0/duration.txt", 'r') as fp:
        duration_rows = [row.strip() for row in fp if row.strip()]

    # Sorted so that the positional pairing with duration rows is stable.
    mel_paths = sorted(glob.glob("datasets/BBSpeech-1.0/mels/*.npy"))
    if len(duration_rows) > len(mel_paths):
        raise IndexError(
            "duration.txt has %d rows but only %d mel files were found"
            % (len(duration_rows), len(mel_paths))
        )

    durans = {
        os.path.basename(path)[:-4]: [int(x) for x in row.split(',')]
        for path, row in zip(mel_paths, duration_rows)
    }
    return durans, words, phones
def word_duration_totals(word_list, durations):
    """Return the summed duration covered by each word.

    A word of ``len(word)`` symbols consumes that many consecutive entries
    of ``durations``.  Slices that run past the end of ``durations``
    contribute nothing, so a too-short list yields zero totals rather than
    an error.
    """
    totals = []
    offset = 0
    for word in word_list:
        width = len(word)
        totals.append(sum(durations[offset:offset + width]))
        offset += width
    return totals


def scale_to_intervals(labels, durations, wav_dur, total_duration):
    """Build consecutive ``[start, end, label]`` lists in seconds.

    Each duration is scaled by ``wav_dur / total_duration`` so that the
    whole sequence spans the audio length.  Boundaries are rounded to two
    decimals.  An interval that collapses to zero length after rounding has
    its end pushed out by 0.01 so that ``start < end`` always holds (this
    may overlap the following interval's start).

    ``labels`` and ``durations`` are paired positionally; pairing stops at
    the shorter of the two.
    """
    intervals = []
    elapsed = 0
    for label, duration in zip(labels, durations):
        span = duration * wav_dur / total_duration
        start = round(elapsed, 2)
        end = round(elapsed + span, 2)
        if not start < end:
            # Re-round so the bump does not leave float noise in the output.
            end = round(end + 0.01, 2)
        intervals.append([start, end, label])
        elapsed += span
    return intervals


if __name__ == '__main__':
    durans, words, phones = create_word_ctm()

    # Both dicts are keyed "wavs-<name>" and hold a single "wavs" entry with
    # a list of [start, end, label] intervals, e.g.
    # {'wavs-000001': {'wavs': [[0.14, 0.31, 'ka2'], [0.31, 0.46, 'er2']]}}
    word_ctm = {}
    phone_ctm = {}
    wav_durs = {}
    for name in words:
        if name not in durans:
            continue
        durations = durans[name]
        total_duration = sum(durations)
        phs = phones[name]
        if total_duration <= 0:
            # Scaling divides by the total; nothing sensible can be produced.
            print("skipping %s: durations sum to %s" % (name, total_duration))
            continue
        if len(durations) < len(phs):
            # Each phone symbol needs its own duration entry.
            print("skipping %s: %d phones but only %d durations"
                  % (name, len(phs), len(durations)))
            continue

        wav_path = os.path.join("datasets/BBSpeech-1.0/wavs", name + ".wav")
        wav_dur = AudioSegment.from_wav(wav_path).duration_seconds
        wav_durs[name] = wav_dur

        ws = words[name]
        key = "wavs-" + name
        word_ctm[key] = {
            'wavs': scale_to_intervals(
                ws, word_duration_totals(ws, durations), wav_dur, total_duration
            ),
        }
        # phs is a plain string, so this yields one interval per character.
        phone_ctm[key] = {
            'wavs': scale_to_intervals(phs, durations, wav_dur, total_duration),
        }

    frame_shift = 0.01
    speaker_directory = 'datasets/BBSpeech-1.0/textgrids_p'
    os.makedirs(speaker_directory, exist_ok=True)
    for k, v in sorted(word_ctm.items()):
        # Split once only: the utterance name itself may contain "-".
        max_time = round(wav_durs[k.split("-", 1)[1]], 2)
        speaker = list(v.keys())[0]  # always "wavs" as built above
        word_intervals = v[speaker]
        phone_intervals = phone_ctm[k][speaker]
        try:
            tg = tgio.Textgrid()
            tg.minTimestamp = 0
            tg.maxTimestamp = max_time

            for interval in word_intervals:
                if max_time - interval[1] < frame_shift:  # Fix rounding issues
                    interval[1] = max_time
            if phone_intervals:
                # Sync last phone boundary to end of audio file.
                phone_intervals[-1][1] = max_time

            word_tier = tgio.IntervalTier('words', word_intervals, minT=0, maxT=max_time)
            phone_tier = tgio.IntervalTier('phones', phone_intervals, minT=0, maxT=max_time)
            tg.addTier(word_tier)
            tg.addTier(phone_tier)

            out_name = k
            # Deal with prosodylab speaker prefixing.
            if out_name.startswith(speaker) and speaker in out_name.split('_')[1:]:
                out_name = '_'.join(out_name.split('_')[1:])
            out_path = os.path.join(speaker_directory, out_name + '.TextGrid')
            tg.save(out_path, useShortForm=False)
        except Exception as exc:
            # Best effort: report the failing utterance and keep going with
            # the rest of the batch.
            print("failed to write TextGrid for %s: %r" % (k, exc))
            print(phone_intervals)
        
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值