零基础入门语音识别—基于CNN
环境要求
TensorFlow的版本:2.0+
keras—基于Python的深度学习库
sklearn
librosa —Python中用于音乐和音频分析的软件包,参考 librosa 官方文档
读取训练集数据
from tqdm import tqdm
def extract_features(parent_dir, sub_dirs, max_file=10, file_ext="*.wav"):
    """Extract mel-spectrogram features and integer labels from the training set.

    Walks each class sub-directory of ``parent_dir``, loads up to ``max_file``
    WAV files per class, and reduces each file to the time-averaged mel
    spectrogram — one fixed-length feature vector per file.

    Args:
        parent_dir: Root directory holding one sub-directory per class.
        sub_dirs: Iterable of class sub-directory names to scan.
        max_file: Maximum number of files to read per sub-directory.
        file_ext: Glob pattern selecting the audio files.

    Returns:
        ``[feature, label]`` — two parallel lists: 1-D numpy feature
        vectors and integer class ids looked up in the module-level
        ``label_dict``.
    """
    label, feature = [], []
    for sub_dir in sub_dirs:
        for fn in tqdm(glob.glob(os.path.join(parent_dir, sub_dir, file_ext))[:max_file]):
            # The class name is the containing directory. Use os.path instead
            # of splitting on '/' so this also works with Windows separators.
            label_name = os.path.basename(os.path.dirname(fn))
            label.append(label_dict[label_name])
            # Load the audio; 'kaiser_fast' resampling trades accuracy for speed.
            X, sample_rate = librosa.load(fn, res_type='kaiser_fast')
            # Mean over the time axis -> fixed-length mel feature vector.
            mels = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            feature.append(mels)
    return [feature, label]
建立模型
搭建CNN网络模型
# CNN architecture: two conv/max-pool stages, then a dense classifier head.
# Each input sample is a 16x8 single-channel feature map.
input_dim = (16, 8, 1)

model = Sequential([
    Conv2D(64, (3, 3), padding="same", activation="tanh", input_shape=input_dim),  # conv stage 1
    MaxPool2D(pool_size=(2, 2)),                                                   # downsample 2x
    Conv2D(128, (3, 3), padding="same", activation="tanh"),                        # conv stage 2
    MaxPool2D(pool_size=(2, 2)),                                                   # downsample 2x
    Dropout(0.1),                                                                  # light regularization
    Flatten(),                                                                     # to 1-D for dense layers
    Dense(1024, activation="tanh"),
    Dense(20, activation="softmax"),  # output layer: probabilities over 20 classes
])

# Compile with Adam, categorical cross-entropy loss, and accuracy metric.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
模型训练
# Train for 20 epochs with mini-batches of 15, evaluating on the held-out
# (X_test, Y_test) split after every epoch.
model.fit(X_train, Y_train, epochs = 20, batch_size = 15, validation_data = (X_test, Y_test))
预测测试集
def extract_features(test_dir, file_ext="*.wav"):
    """Extract mel-spectrogram features for every WAV file in ``test_dir``.

    Mirrors the training-set extractor: each file is reduced to the mean of
    its mel spectrogram over time, giving one fixed-length vector per file.
    No labels are produced — this is the unlabeled test set.

    Args:
        test_dir: Directory containing the test WAV files.
        file_ext: Glob pattern selecting the audio files.

    Returns:
        List of 1-D numpy feature vectors, in glob order (the same order
        used later when the submission file pairs paths with predictions).
    """
    feature = []
    # Redundant full-slice ``[:]`` removed — glob already returns a fresh list.
    for fn in tqdm(glob.glob(os.path.join(test_dir, file_ext))):
        X, sample_rate = librosa.load(fn, res_type='kaiser_fast')
        mels = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        feature.append(mels)
    return feature
# Featurize the test set and stack the per-file vectors into one 2-D array.
X_test = extract_features('./test_a/')
X_test = np.vstack(X_test)

# Predict class probabilities; reshape flat vectors to the CNN's (16, 8, 1) maps.
predictions = model.predict(X_test.reshape(-1, 16, 8, 1))

# argmax over classes -> integer ids -> original label strings.
preds = np.argmax(predictions, axis=1)
preds = [label_dict_inv[x] for x in preds]

# NOTE(review): this glob is assumed to return the same order as the glob
# inside extract_features above, keeping paths aligned with predictions.
path = glob.glob('./test_a/*.wav')
result = pd.DataFrame({'name': path, 'label': preds})
# os.path.basename is portable; the original split('/') breaks on Windows '\' paths.
result['name'] = result['name'].apply(os.path.basename)
result.to_csv('submit.csv', index=False)