deepspeech2 代码之数据处理

最新推荐文章于 2025-04-10 10:36:55 发布

hyxxxxxx

最新推荐文章于 2025-04-10 10:36:55 发布

阅读量1.8k

点赞数 1

分类专栏：语音识别

本文链接：https://blog.csdn.net/xuan100e/article/details/96870460

版权

语音识别专栏收录该内容

11 篇文章

订阅专栏

以Librispeech为例

step 1 下载数据集

下载地址: http://www.openslr.org/12/
下载文件

step 2 解压文件

for f in *.tar.gz; do tar -zxvf "$f" -C ./; done

此处也可以不提前解压：代码中可以边解压、边读取、边删除；但是为了效率，这里选择先解压。

step 3 定义参数

class parser():
    """Hold the preprocessing settings that a command line would normally supply.

    Every setting is a keyword argument whose default is the value that was
    previously hard-coded, so ``parser()`` with no arguments yields the same
    configuration as before.

    Attributes:
        target_dir: Directory that receives the converted data set.
        sample_rate: Sample rate handed to sox (``-r``) for the output wav files.
        files_to_use: Root directory of the extracted LibriSpeech folders. It is
            concatenated directly with the folder names, so it must end with a
            path separator.
        min_duration: Shortest clip, in seconds, kept in the training manifest.
        max_duration: Longest clip, in seconds, kept in the training manifest.
    """

    def __init__(self, target_dir='../librispeech_out/', sample_rate=16000,
                 files_to_use='/data/LibriSpeech/LibriSpeech/',
                 min_duration=1, max_duration=15):
        self.target_dir = target_dir
        self.sample_rate = sample_rate
        self.files_to_use = files_to_use
        self.min_duration = min_duration
        self.max_duration = max_duration
# Shared settings object read by the preprocessing functions.
args = parser()

# Maps each output split name to the extracted LibriSpeech folder it is built
# from.  ``args.files_to_use`` is concatenated as-is, so it must end with a
# path separator.
LIBRI_SPEECH_DICTS = {
    'train': args.files_to_use + 'train-clean-360',
    'val': args.files_to_use + 'dev-clean',
    'test-clean': args.files_to_use + 'test-clean',
}

这里因为使用的是 Jupyter，所以参数是写死的；如果想修改，可以改用 argparse 从命令行传入。

step 4 数据处理

代码分为三部分
1.创建文件夹
2.用os.walk(深度优先)遍历已解压好的数据目录，把其中的.flac音频转换为.wav，并把对应的文本写入自己创建的文件目录中
3.制作wav ,txt对应的csv文件

def main():
    """Convert every configured LibriSpeech split and write its manifest.

    For each ``split name -> source directory`` entry in LIBRI_SPEECH_DICTS this
    creates ``<target_dir>/<split>/wav`` and ``<target_dir>/<split>/txt``, runs
    ``_process_file`` on every ``.flac`` file found under the source directory,
    and then calls ``create_manifest`` for the split.  Only the ``train`` split
    is pruned to ``args.min_duration`` / ``args.max_duration``.
    """
    target_dir = args.target_dir
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(target_dir, exist_ok=True)
    for split_type, source_dir in LIBRI_SPEECH_DICTS.items():
        split_dir = os.path.join(target_dir, split_type)  # e.g. librispeech_out/train
        split_wav_dir = os.path.join(split_dir, 'wav')    # e.g. librispeech_out/train/wav
        split_txt_dir = os.path.join(split_dir, 'txt')    # e.g. librispeech_out/train/txt
        for directory in (split_wav_dir, split_txt_dir):
            os.makedirs(directory, exist_ok=True)

        # The archives are expected to be extracted beforehand: source_dir
        # points at an already-extracted LibriSpeech folder.
        for root, subdirs, files in tqdm(os.walk(source_dir)):
            for f in files:
                # Match the extension only; a substring search would also pick
                # up names such as "x.flac.bak".
                if f.endswith('.flac'):
                    _process_file(wav_dir=split_wav_dir, txt_dir=split_txt_dir,
                                  base_filename=f, root_dir=root)

        manifest_name = 'libri_' + split_type + '_manifest.csv'
        if split_type == 'train':  # Prune to min/max duration
            create_manifest(split_dir, manifest_name, args.min_duration, args.max_duration)
        else:
            create_manifest(split_dir, manifest_name)

4.1 创建文件夹

main() 开头的部分是在创建保存数据的文件夹(每个数据子集下各有 wav 和 txt 两个子目录)

4.2 复制文件到自定义目录

def _preprocess_transcript(phrase):
    return phrase.strip().upper()


def _process_file(wav_dir, txt_dir, base_filename, root_dir):
    """Convert one .flac recording to .wav and write its transcript to a .txt file.

    The recording is converted with sox to 16-bit, single-channel audio at
    ``args.sample_rate``.  The transcript is looked up in the ``*.trans.txt``
    file that sits next to the recording, keyed by the last ``-``-separated
    part of the file name.  If sox reports a failure, the file is skipped and
    no transcript is written for it.

    Args:
        wav_dir: Directory that receives the converted ``.wav`` file.
        txt_dir: Directory that receives the transcript ``.txt`` file.
        base_filename: File name of the ``.flac`` recording inside root_dir.
        root_dir: Directory holding the recording and its ``*.trans.txt`` file.

    Raises:
        AssertionError: If the recording, the transcript file, or the
            recording's entry in the transcript file is missing.
        OSError: If the sox executable cannot be started.
    """
    full_recording_path = os.path.join(root_dir, base_filename)
    assert os.path.exists(full_recording_path) and os.path.exists(root_dir)

    stem = os.path.splitext(base_filename)[0]
    # "<prefix>-<key>.flac": the prefix names the transcript file, the key
    # selects the line inside it.
    prefix, _, key = stem.rpartition('-')

    wav_recording_path = os.path.join(wav_dir, stem + ".wav")
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    status = subprocess.call(["sox", full_recording_path,
                              "-r", str(args.sample_rate), "-b", "16", "-c", "1",
                              wav_recording_path])
    if status != 0:
        print("sox exited with status {} for {}; skipping this file".format(
            status, full_recording_path))
        return

    # process transcript
    txt_transcript_path = os.path.join(txt_dir, stem + ".txt")
    transcript_file = os.path.join(root_dir, prefix + ".trans.txt")
    assert os.path.exists(transcript_file), "Transcript file {} does not exist.".format(transcript_file)
    transcriptions = {}
    with open(transcript_file) as transcript_handle:
        for line in transcript_handle:
            fields = line.split()
            if not fields:  # tolerate blank lines
                continue
            transcriptions[fields[0].split("-")[-1]] = " ".join(fields[1:])

    # Look the key up before opening the output so a failure does not leave an
    # empty .txt file behind.
    assert key in transcriptions, "{} is not in the transcriptions".format(key)
    with open(txt_transcript_path, "w") as f:
        f.write(_preprocess_transcript(transcriptions[key]))

4.3 创建csv文件


def create_manifest(data_path, output_path, min_duration=None, max_duration=None):
    """Write a CSV manifest pairing every .wav under data_path with its transcript.

    Each output row is ``<absolute wav path>,<absolute transcript path>``,
    UTF-8 encoded.  For a wav file that sits directly inside a directory named
    ``wav``, the transcript path points into the sibling ``txt`` directory;
    otherwise it points next to the wav file.  In both cases the file keeps
    the same stem with a ``.txt`` extension.  Rows are ordered (and optionally
    pruned) by ``order_and_prune_files``.

    Args:
        data_path: Directory that is walked recursively for ``*.wav`` files.
        output_path: Path of the manifest file to write.
        min_duration: Forwarded to ``order_and_prune_files``.
        max_duration: Forwarded to ``order_and_prune_files``.
    """
    file_paths = [os.path.join(dirpath, f)
                  for dirpath, dirnames, files in os.walk(data_path)
                  for f in fnmatch.filter(files, '*.wav')]
    file_paths = order_and_prune_files(file_paths, min_duration, max_duration)
    # Buffered binary output: one write per row no longer means one syscall.
    with open(output_path, "wb") as file:
        for wav_path in tqdm(file_paths, total=len(file_paths)):
            wav_dir, wav_name = os.path.split(wav_path)
            parent_dir, leaf_dir = os.path.split(wav_dir)
            # Swap only the directory that directly holds the file; a plain
            # str.replace would also rewrite any "/wav/" higher up the path.
            if leaf_dir == 'wav':
                wav_dir = os.path.join(parent_dir, 'txt')
            transcript_path = os.path.join(wav_dir, os.path.splitext(wav_name)[0] + '.txt')
            sample = os.path.abspath(wav_path) + ',' + os.path.abspath(transcript_path) + '\n'
            file.write(sample.encode('utf-8'))
    print('\n')


def order_and_prune_files(file_paths, min_duration, max_duration):
    """Return file_paths sorted by audio duration, optionally pruned by length.

    The duration of each file is read with ``soxi -D``.  A bound that is None
    is treated as "no limit" on that side, so a lower bound of 0 or a single
    bound on its own is honoured.

    Args:
        file_paths: Paths of audio files that soxi can read.
        min_duration: Shortest duration, in seconds, to keep; None for no
            lower bound.
        max_duration: Longest duration, in seconds, to keep; None for no
            upper bound.

    Returns:
        The kept paths, shortest recording first.

    Raises:
        subprocess.CalledProcessError: If soxi exits with a non-zero status.
        OSError: If the soxi executable cannot be started.
    """
    print("Sorting manifests...")
    # An argument list (no shell) keeps paths with quotes or spaces intact.
    duration_file_paths = [
        (path, float(subprocess.check_output(['soxi', '-D', path.strip()])))
        for path in file_paths
    ]
    if min_duration is not None or max_duration is not None:
        lower = float('-inf') if min_duration is None else min_duration
        upper = float('inf') if max_duration is None else max_duration
        # %s rather than %d: the bounds may be fractional or infinite.
        print("Pruning manifests between %s and %s seconds" % (lower, upper))
        duration_file_paths = [(path, duration) for path, duration in duration_file_paths
                               if lower <= duration <= upper]

    duration_file_paths.sort(key=lambda item: item[1])
    return [path for path, _ in duration_file_paths]  # Remove durations