- 已知:(1)语音数据:包括文件夹 speech 和 noise(其中训练数据、验证数据及测试数据尚未分类);(2)训练数据列表 speech.txt、验证数据列表 val.txt、测试数据列表 core_test192.txt,以及包含全部数据文件名的列表 all.txt(测试数据需通过它查找对应的文件名);
- 根据上述已知条件,将语音数据分别整理到训练、验证、测试三个文件夹中。
- 具体实现代码:
import os
import soundfile as sf
from Four_DNN_Denoise import config_s
from Four_DNN_Denoise.utils import create_folder, read_audio, write_audio
def _read_first_tokens(list_path):
    """Return the first whitespace-separated token of every non-blank line.

    Iterating the file object visits each line exactly once. The previous
    ``while f.readline(): f.readline()`` pattern consumed two lines per
    iteration, silently dropping every other entry and crashing with an
    IndexError when the number of lines was odd.
    """
    with open(list_path, 'r') as list_file:
        return [line.split()[0] for line in list_file if line.strip()]


def _read_test_utterance_ids(list_path):
    """Return the set of utterance ids (file stems) named in the test list.

    Each non-blank line is treated as a Windows-style path such as
    ``F:\\...\\TEST\\DR1\\MDAB0\\SI1039.WAV``; the id is the last path
    component without its extension (``SI1039``). Taking the last component
    instead of a fixed index keeps this working for any directory depth.
    """
    utterance_ids = set()
    with open(list_path, 'r') as list_file:
        for line in list_file:
            entry = line.strip()
            if not entry:
                continue
            base_name = entry.replace("\\", "/").rsplit("/", 1)[-1]
            utterance_ids.add(os.path.splitext(base_name)[0])
    return utterance_ids


def _copy_utterances(names, speech_dir, out_dir, fs):
    """Copy ``<speech_dir>/<name>.wav`` to ``<out_dir>/<name>.wav`` for each name."""
    for name in names:
        src_path = os.path.join(speech_dir, "%s.wav" % name)
        dst_path = os.path.join(out_dir, "%s.wav" % name)
        speech = read_audio(src_path)
        # Ensure the destination folder exists before every write, as the
        # original code did.
        create_folder(os.path.dirname(dst_path))
        write_audio(dst_path, speech, fs)


def data_ready(data_type):
    """Sort the unclassified speech files into a per-split folder.

    Args:
        data_type: 'train', 'val' or 'test'. Any other value only prints
            "Data_type Error!" and does nothing else.

    Reads ``config_s.workspace`` (location of the list files),
    ``config_s.data_dir`` (location of the ``speech`` and ``noise`` folders)
    and ``config_s.fs`` (sample rate passed to ``write_audio``).

    * 'train' copies every entry of ``speech.txt`` into ``train_data``.
    * 'val' copies every entry of ``val.txt`` into ``val_data``.
    * 'test' copies every entry of ``all.txt`` whose third ``_``-separated
      field matches an utterance id from ``core_test192.txt`` into
      ``tes_data``.
    """
    workspace = config_s.workspace  # folder holding the list files
    data_dir = config_s.data_dir  # folder holding the raw audio
    fs = config_s.fs

    speech_dir = os.path.join(data_dir, 'speech')
    noise_dir = os.path.join(data_dir, 'noise')

    # The noise files are only listed and reported here, not copied.
    noise_name = [na for na in os.listdir(noise_dir) if na.lower().endswith(".wav")]
    print(f'noise_name:{noise_name}')

    if data_type == 'train':
        names = _read_first_tokens(os.path.join(workspace, 'speech.txt'))
        _copy_utterances(names, speech_dir, os.path.join(data_dir, "train_data"), fs)
    elif data_type == 'val':
        names = _read_first_tokens(os.path.join(workspace, 'val.txt'))
        _copy_utterances(names, speech_dir, os.path.join(data_dir, "val_data"), fs)
    elif data_type == 'test':
        wanted_ids = _read_test_utterance_ids(os.path.join(workspace, 'core_test192.txt'))
        # Scan all.txt once and test membership in a set, instead of
        # reopening and rescanning the file for every test entry.
        matched_names = []
        for name in _read_first_tokens(os.path.join(workspace, 'all.txt')):
            fields = name.split("_")  # e.g. DR1_FMEM0_SX387 -> [DR1, FMEM0, SX387]
            if len(fields) < 3:
                print(f'Skipping malformed entry in all.txt: {name}')
                continue
            # NOTE(review): matching uses the utterance id only, as in the
            # original; if the same id appears under several speakers in
            # all.txt, every such file is copied. Confirm this is intended.
            if fields[2] in wanted_ids:
                matched_names.append(name)
        _copy_utterances(matched_names, speech_dir, os.path.join(data_dir, "tes_data"), fs)
    else:
        print("Data_type Error!")
def main():
    """Build the train, validation and test folders, in that order."""
    for split_name in ('train', 'val', 'test'):
        data_ready(split_name)
# Run the data preparation only when executed as a script, not on import.
if __name__ == "__main__":
    main()