pyAudioKits is a powerful Python audio workflow toolkit built on librosa and other libraries.
Install it via pip:
pip install pyAudioKits
The project is hosted on GitHub; if it helped you, please give it a star. Thank you for your support! If you run into any problems while using it, please leave a comment or open an issue on GitHub, and I will keep maintaining the project.
This section walks through a small project that uses pyAudioKits to read audio and extract features, then applies deep learning for speech recognition. We will extract mel spectrograms as features and build a simple convolutional neural network classifier with Keras + TensorFlow to recognize the ten spoken digits 0-9.
- The dataset comes from https://www.kaggle.com/datasets/bharatsahu/speech-commands-classification-dataset. Download it first and save it to a directory named "speech-commands-classification-dataset".
- The algorithm is adapted from https://www.kaggle.com/code/ritvik1909/speech-classification-spectrogram-cnn
import matplotlib.pyplot as plt
import os
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pyAudioKits.audio as ak
import pyAudioKits.analyse as aly
First, create the directories that will hold the preprocessed data.
root="speech-commands-classification-dataset"
train_folders="train_set"
val_folders="val_set"
test_folders="test_set"
if not os.path.exists(train_folders):
os.mkdir(train_folders)
if not os.path.exists(val_folders):
os.mkdir(val_folders)
if not os.path.exists(test_folders):
os.mkdir(test_folders)
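Equivalently, os.makedirs with exist_ok=True creates a directory only when it does not already exist, which removes the explicit checks (a minor alternative; the rest of this section keeps the form above):
os.makedirs(train_folders, exist_ok=True)
os.makedirs(val_folders, exist_ok=True)
os.makedirs(test_folders, exist_ok=True)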
Next we extract the mel spectrogram features. To start, pick one sample from each of the digits 0-9, extract its features, and visualize them.
input_size = (128, 128, 1)
labels = ["zero","one","two","three","four","five","six","seven","eight","nine"]

fig, axes = plt.subplots(2, 5, sharex=True, sharey=True)
fig.set_size_inches(12, 6)
# Iterate over the ten digit folders and visualize the first sample of each digit
for i, d in enumerate(labels):
    path = root + "/" + d
    for w in os.listdir(path):
        wf = ak.read_Audio(path + "/" + w)  # load one spoken-digit sample
        wf = wf.padding(22050)  # pad the sample to 22050 points, half the sampling rate, so the frequency and time domains can use the same resolution
        features = aly.melSpec(wf, spec_h=input_size[0], spec_w=input_size[1])  # extract a mel spectrogram with width and height both 128; the height sets the frequency resolution and the width sets the time resolution
        axes[i // 5, i % 5].set_title(d)
        axes[i // 5, i % 5].imshow(features)  # plot the mel spectrogram
        break
plt.tight_layout()
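For reference, pyAudioKits is built on librosa, and a roughly equivalent mel spectrogram computation in plain librosa might look like the sketch below. This is an assumption-laden illustration: the file name is hypothetical, and the STFT hop length pyAudioKits chooses to land on exactly 128 time frames may differ.
import librosa
import numpy as np

y, sr = librosa.load("speech-commands-classification-dataset/zero/sample.wav", sr=None)  # hypothetical file name
y = np.pad(y, (0, max(0, 22050 - len(y))))                  # pad to 22050 samples, as above
S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)  # 128 mel bands -> spectrogram height
S_db = librosa.power_to_db(S, ref=np.max)                   # log-compress for visualization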
Now extract features in earnest: iterate over the whole dataset, extract the features, and save them as images.
# validation_list.txt and testing_list.txt each hold one relative path per line
val_list = [line.strip() for line in open(root + "/validation_list.txt")]
test_list = [line.strip() for line in open(root + "/testing_list.txt")]
maxcount = [1839, 239, 230]  # per-digit sample caps for the training, validation and test sets
for d in labels:
    count = [0, 0, 0]
    path = root + "/" + d
    dst = [None, None, None]
    dst[0] = train_folders + "/" + d
    dst[1] = val_folders + "/" + d
    dst[2] = test_folders + "/" + d
    for w in os.listdir(path):
        if d + "/" + w in val_list:     # listed in validation_list.txt: validation set
            pos = 1
        elif d + "/" + w in test_list:  # listed in testing_list.txt: test set
            pos = 2
        else:                           # otherwise: training set
            pos = 0
        count[pos] += 1
        if count[pos] > maxcount[pos]:
            continue
        wf = ak.read_Audio(path + "/" + w)
        wf = wf.padding(22050)
        features = aly.melSpec(wf, spec_h=input_size[0], spec_w=input_size[1])
        if not os.path.exists(dst[pos]):
            os.mkdir(dst[pos])
        plt.imsave(dst[pos] + "/" + w[:-4] + ".png", features, cmap='gray')  # save the mel spectrogram as a grayscale image
    print(d)
'''
outputs:
zero
one
two
three
four
five
six
seven
eight
nine
'''
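As a quick sanity check (a sketch, not part of the original pipeline), the number of images written per split can be counted and compared against the "Found ... images" lines that Keras prints below:
# count the PNG files saved under each split directory
for folder in [train_folders, val_folders, test_folders]:
    n = sum(len(files) for _, _, files in os.walk(folder))
    print(folder, n)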
Load the image datasets through Keras's API; this effectively turns the speech classification problem into an image classification problem.
training_set = ImageDataGenerator().flow_from_directory(
train_folders, target_size=input_size[:-1], batch_size=32, class_mode='categorical', color_mode='grayscale'
)
validation_set = ImageDataGenerator().flow_from_directory(
val_folders, target_size=input_size[:-1], batch_size=32, class_mode='categorical', color_mode='grayscale'
)
test_set = ImageDataGenerator().flow_from_directory(
test_folders, target_size=input_size[:-1], batch_size=32, class_mode='categorical', color_mode='grayscale'
)
'''
outputs:
Found 18390 images belonging to 10 classes.
Found 2369 images belonging to 10 classes.
Found 2300 images belonging to 10 classes.
'''
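Note that flow_from_directory assigns class indices from the subdirectory names in alphabetical order, so index 0 is "eight" rather than "zero". The mapping can be inspected via the iterator's class_indices attribute:
print(training_set.class_indices)
# {'eight': 0, 'five': 1, 'four': 2, 'nine': 3, 'one': 4, 'seven': 5, 'six': 6, 'three': 7, 'two': 8, 'zero': 9}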
Construct the convolutional neural network, the optimizer, and the loss function.
# Build the convolutional neural network
model = models.Sequential([
    layers.Conv2D(32, 3, activation='relu', input_shape=(128, 128, 1), padding='same'),
    layers.Conv2D(32, 3, activation='relu', padding='same'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(0.25),
    layers.Conv2D(64, 3, activation='relu', padding='same'),
    layers.Conv2D(64, 3, activation='relu', padding='same'),
    layers.MaxPooling2D(padding='same'),
    layers.Dropout(0.25),
    layers.GlobalAveragePooling2D(),
    layers.Dense(len(labels), activation='softmax'),
])
model.summary()

# Adam optimizer, categorical cross-entropy loss, and accuracy as the reported metric
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
'''
outputs:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
conv2d (Conv2D) (None, 128, 128, 32) 320
_________________________________________________________________
conv2d_1 (Conv2D) (None, 128, 128, 32) 9248
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 64, 64, 32) 0
_________________________________________________________________
dropout (Dropout) (None, 64, 64, 32) 0
_________________________________________________________________
conv2d_2 (Conv2D) (None, 64, 64, 64) 18496
_________________________________________________________________
conv2d_3 (Conv2D) (None, 64, 64, 64) 36928
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 32, 64) 0
_________________________________________________________________
dropout_1 (Dropout) (None, 32, 32, 64) 0
_________________________________________________________________
global_average_pooling2d (Gl (None, 64) 0
_________________________________________________________________
dense (Dense) (None, 10) 650
=================================================================
Total params: 65,642
Trainable params: 65,642
Non-trainable params: 0
_________________________________________________________________
'''
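The parameter counts in the summary can be verified by hand: a Conv2D layer has kernel_height x kernel_width x in_channels x out_channels weights plus out_channels biases, and the dense layer maps the 64 globally pooled channels to 10 classes:
print(3*3*1*32 + 32)    # conv2d: 320
print(3*3*32*32 + 32)   # conv2d_1: 9248
print(3*3*32*64 + 64)   # conv2d_2: 18496
print(3*3*64*64 + 64)   # conv2d_3: 36928
print(64*10 + 10)       # dense: 650; total 65,642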
Start training. Training uses early stopping and a learning-rate decay schedule. With 18390 training images and the generators' batch size of 32, each epoch runs ceil(18390 / 32) = 575 steps, matching the log below.
# Early stopping and learning-rate decay
es = callbacks.EarlyStopping(monitor='val_loss', patience=5, verbose=1, restore_best_weights=True)
rlp = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=1e-10, mode='min', verbose=1)

# Train the model; the batch size is set by the generators (32), so no batch_size is passed here
history = model.fit(
    training_set, validation_data=validation_set,
    epochs=500, callbacks=[es, rlp]
)
'''
outputs:
Epoch 1/500
575/575 [==============================] - 921s 2s/step - loss: 2.1807 - accuracy: 0.1993 - val_loss: 1.9604 - val_accuracy: 0.2921
Epoch 2/500
575/575 [==============================] - 820s 1s/step - loss: 1.8535 - accuracy: 0.3332 - val_loss: 1.7405 - val_accuracy: 0.3854
Epoch 3/500
575/575 [==============================] - 832s 1s/step - loss: 1.6132 - accuracy: 0.4388 - val_loss: 1.4009 - val_accuracy: 0.5395
Epoch 4/500
575/575 [==============================] - 824s 1s/step - loss: 1.2987 - accuracy: 0.5673 - val_loss: 1.1822 - val_accuracy: 0.5741
Epoch 5/500
575/575 [==============================] - 824s 1s/step - loss: 1.0400 - accuracy: 0.6605 - val_loss: 0.8329 - val_accuracy: 0.7341
Epoch 6/500
575/575 [==============================] - 841s 1s/step - loss: 0.8287 - accuracy: 0.7354 - val_loss: 0.7591 - val_accuracy: 0.7455
Epoch 7/500
575/575 [==============================] - 836s 1s/step - loss: 0.7186 - accuracy: 0.7748 - val_loss: 0.6612 - val_accuracy: 0.7940
Epoch 8/500
575/575 [==============================] - 829s 1s/step - loss: 0.6044 - accuracy: 0.8136 - val_loss: 0.7732 - val_accuracy: 0.7497
Epoch 9/500
575/575 [==============================] - 829s 1s/step - loss: 0.5407 - accuracy: 0.8302 - val_loss: 0.5135 - val_accuracy: 0.8328
Epoch 10/500
575/575 [==============================] - 845s 1s/step - loss: 0.4887 - accuracy: 0.8482 - val_loss: 0.3927 - val_accuracy: 0.8776
Epoch 11/500
575/575 [==============================] - 932s 2s/step - loss: 0.4534 - accuracy: 0.8581 - val_loss: 0.4472 - val_accuracy: 0.8531
Epoch 12/500
575/575 [==============================] - 831s 1s/step - loss: 0.4230 - accuracy: 0.8694 - val_loss: 0.3817 - val_accuracy: 0.8725
Epoch 13/500
575/575 [==============================] - 831s 1s/step - loss: 0.3928 - accuracy: 0.8812 - val_loss: 0.3996 - val_accuracy: 0.8713
Epoch 14/500
575/575 [==============================] - 839s 1s/step - loss: 0.3644 - accuracy: 0.8908 - val_loss: 0.3484 - val_accuracy: 0.8839
Epoch 15/500
575/575 [==============================] - 858s 1s/step - loss: 0.3474 - accuracy: 0.8936 - val_loss: 0.3596 - val_accuracy: 0.8869
Epoch 16/500
575/575 [==============================] - ETA: 0s - loss: 0.3291 - accuracy: 0.8980
Epoch 00016: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
575/575 [==============================] - 845s 1s/step - loss: 0.3291 - accuracy: 0.8980 - val_loss: 0.3541 - val_accuracy: 0.8852
Epoch 17/500
575/575 [==============================] - 846s 1s/step - loss: 0.2301 - accuracy: 0.9314 - val_loss: 0.2641 - val_accuracy: 0.9152
Epoch 18/500
575/575 [==============================] - 844s 1s/step - loss: 0.2195 - accuracy: 0.9332 - val_loss: 0.2644 - val_accuracy: 0.9152
Epoch 19/500
575/575 [==============================] - ETA: 0s - loss: 0.2157 - accuracy: 0.9347
Epoch 00019: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
575/575 [==============================] - 849s 1s/step - loss: 0.2157 - accuracy: 0.9347 - val_loss: 0.2731 - val_accuracy: 0.9181
Epoch 20/500
575/575 [==============================] - 841s 1s/step - loss: 0.2034 - accuracy: 0.9395 - val_loss: 0.2667 - val_accuracy: 0.9147
Epoch 21/500
575/575 [==============================] - 829s 1s/step - loss: 0.1991 - accuracy: 0.9419 - val_loss: 0.2618 - val_accuracy: 0.9156
Epoch 22/500
575/575 [==============================] - 832s 1s/step - loss: 0.2027 - accuracy: 0.9402 - val_loss: 0.2621 - val_accuracy: 0.9177
Epoch 23/500
575/575 [==============================] - 829s 1s/step - loss: 0.1975 - accuracy: 0.9417 - val_loss: 0.2615 - val_accuracy: 0.9168
Epoch 24/500
575/575 [==============================] - 826s 1s/step - loss: 0.1981 - accuracy: 0.9411 - val_loss: 0.2614 - val_accuracy: 0.9168
Epoch 25/500
575/575 [==============================] - 830s 1s/step - loss: 0.1989 - accuracy: 0.9407 - val_loss: 0.2616 - val_accuracy: 0.9181
Epoch 26/500
575/575 [==============================] - 837s 1s/step - loss: 0.1976 - accuracy: 0.9414 - val_loss: 0.2606 - val_accuracy: 0.9185
Epoch 27/500
575/575 [==============================] - 837s 1s/step - loss: 0.2008 - accuracy: 0.9380 - val_loss: 0.2587 - val_accuracy: 0.9177
Epoch 28/500
575/575 [==============================] - 838s 1s/step - loss: 0.1980 - accuracy: 0.9411 - val_loss: 0.2584 - val_accuracy: 0.9211
Epoch 29/500
575/575 [==============================] - 843s 1s/step - loss: 0.1989 - accuracy: 0.9407 - val_loss: 0.2579 - val_accuracy: 0.9190
Epoch 30/500
575/575 [==============================] - 842s 1s/step - loss: 0.2006 - accuracy: 0.9401 - val_loss: 0.2608 - val_accuracy: 0.9177
Epoch 31/500
575/575 [==============================] - ETA: 0s - loss: 0.1966 - accuracy: 0.9398
Epoch 00031: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
575/575 [==============================] - 839s 1s/step - loss: 0.1966 - accuracy: 0.9398 - val_loss: 0.2614 - val_accuracy: 0.9185
Epoch 32/500
575/575 [==============================] - 848s 1s/step - loss: 0.1947 - accuracy: 0.9411 - val_loss: 0.2600 - val_accuracy: 0.9190
Epoch 33/500
575/575 [==============================] - ETA: 0s - loss: 0.1970 - accuracy: 0.9409
Epoch 00033: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
575/575 [==============================] - 853s 1s/step - loss: 0.1970 - accuracy: 0.9409 - val_loss: 0.2599 - val_accuracy: 0.9190
Epoch 34/500
575/575 [==============================] - ETA: 0s - loss: 0.1961 - accuracy: 0.9410Restoring model weights from the end of the best epoch.
575/575 [==============================] - 849s 1s/step - loss: 0.1961 - accuracy: 0.9410 - val_loss: 0.2598 - val_accuracy: 0.9190
Epoch 00034: early stopping
'''
Training is done; visualize the results by plotting the loss and accuracy on the training and validation sets over the epochs.
import pandas as pd

# Plot training/validation loss and accuracy over the epochs
fig, ax = plt.subplots(2, 1, figsize=(20, 8))
df = pd.DataFrame(history.history)
df[['accuracy', 'val_accuracy']].plot(ax=ax[0])
df[['loss', 'val_loss']].plot(ax=ax[1])
ax[0].set_title('Accuracy', fontsize=15)
ax[1].set_title('Loss', fontsize=15)
ax[0].grid(linestyle="-.")
ax[1].grid(linestyle="-.")
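Before evaluating, it may be worth persisting the trained model so it can be reloaded later without retraining (a sketch; the file name is arbitrary):
model.save("digit_cnn.h5")                      # save architecture + weights
# restored = models.load_model("digit_cnn.h5")  # reload later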
Evaluating on the test set gives a test loss of 0.245 and an accuracy of 0.93.
model.evaluate(test_set)
'''
outputs:
72/72 [==============================] - 24s 331ms/step - loss: 0.2450 - accuracy: 0.9304
[0.24495860934257507, 0.9304347634315491]
'''
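Finally, to run the trained model on a new recording, the training-time preprocessing has to be reproduced exactly: pad, extract the mel spectrogram, save it as a grayscale PNG (the images were fed to the network unscaled, so raw 0-255 pixel values match training), and reload it. A minimal sketch; "my_recording.wav" and "tmp.png" are hypothetical file names:
import numpy as np
from tensorflow.keras.preprocessing.image import load_img, img_to_array

wf = ak.read_Audio("my_recording.wav")                # hypothetical input recording
wf = wf.padding(22050)
features = aly.melSpec(wf, spec_h=128, spec_w=128)
plt.imsave("tmp.png", features, cmap='gray')          # reuse the exact training-time image pipeline
img = img_to_array(load_img("tmp.png", target_size=(128, 128), color_mode='grayscale'))
pred = model.predict(img[np.newaxis, ...])            # batch of one: shape (1, 128, 128, 1)
classes = sorted(labels)                              # flow_from_directory orders classes alphabetically
print(classes[int(np.argmax(pred))])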