使用卷积神经网络实现简易的语音分类的对话系统-CSDN博客

本文链接：https://blog.csdn.net/qq_38641985/article/details/121792271

一、思路

1.收集简单的语音词汇，数量越多越好，当做数据集。

2. 为每个词汇建立标签：建立一个字典，键为数字标签，值为对应的文本词语；训练时只能使用数字标签来表示类别。

3.建立卷积神经网络或其他网络（本文示例代码使用的是由全连接层组成的网络）

4.提取音频特征，对应上标签进行训练，输出训练模型

5.利用输出的模型对音频文件进行预测

6.根据预测的结果，找到对应的应答音频文件并播放，做出应答。

1.数据集收集

在这里插入图片描述

说明：这里展示的数据集很小，只用于测试网络，实际应用时需要更多的数据；而且这里的音频数据都是通过软件合成的，比较理想，适合新人学习使用。

2. 建立一个映射字典

# Maps the numeric class label used during training to the spoken phrase
# it stands for.
Dict_data = {
    0: "吃饭",
    1: "等一下",
    2: "不吃了",
}

3.建立神经网络

# Fully connected classifier: takes a flattened 25*11 feature vector, passes
# it through three ReLU hidden layers and ends in a 3-way softmax output.
model = Sequential([
    Dense(128, activation='relu', input_shape=(25*11,)),
    Dense(64, activation='relu'),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax'),
])
model.summary()

4.提取音频特征

def wav2mfcc(path, max_pad_size=11):
    """Load an audio file and return a fixed-width MFCC feature matrix.

    Args:
        path: Path of the audio file to load.
        max_pad_size: Number of time frames in the result. Shorter inputs
            are zero-padded on the right; longer ones are truncated.

    Returns:
        A NumPy array of shape ``(25, max_pad_size)``.
    """
    # Down-mix to mono on load. With ``mono=False`` a stereo file comes back
    # as a 2-D (channels, samples) array, so the slice below would drop
    # channels instead of samples and the result would no longer have the
    # expected shape. Mono files load exactly as before.
    y, _ = librosa.load(path=path, sr=None, mono=True)
    # Keep every third sample to shrink the signal.
    y = y[::3]
    # Extract 25 MFCC coefficients per frame.
    # NOTE(review): sr is fixed at 16000 regardless of the file's native rate
    # and of the decimation above; left unchanged so features stay compatible
    # with models already trained on them.
    audio_mac = librosa.feature.mfcc(y=y, n_mfcc=25, sr=16000)
    n_frames = audio_mac.shape[1]
    if n_frames < max_pad_size:
        # Zero-pad along the time axis only.
        pad_size = max_pad_size - n_frames
        audio_mac = np.pad(audio_mac, ((0, 0), (0, pad_size)), mode='constant')
    else:
        audio_mac = audio_mac[:, :max_pad_size]
    return audio_mac

5.利用输出的模型对音频文件预测

import numpy as np
import librosa
from glob import glob
from tensorflow.keras.models import Sequential,load_model
from wav import Dict_data
from playsound import  playsound

# Maps each predicted class label to the audio file played as the reply.
WAV_DICT = {
    0: "../info/eat.wav",
    1: "../info/waiting.wav",
    2: "../info/stop.wav",
}
def wav2mfcc(path, max_pad_size=11):
    """Load an audio file and return a fixed-width MFCC feature matrix.

    Args:
        path: Path of the audio file to load.
        max_pad_size: Number of time frames in the result. Shorter inputs
            are zero-padded on the right; longer ones are truncated.

    Returns:
        A NumPy array of shape ``(25, max_pad_size)``.
    """
    # Down-mix to mono on load. With ``mono=False`` a stereo file comes back
    # as a 2-D (channels, samples) array, so the slice below would drop
    # channels instead of samples and the result would no longer have the
    # expected shape. Mono files load exactly as before.
    y, _ = librosa.load(path=path, sr=None, mono=True)
    # Keep every third sample to shrink the signal.
    y = y[::3]
    # Extract 25 MFCC coefficients per frame.
    # NOTE(review): sr is fixed at 16000 regardless of the file's native rate
    # and of the decimation above; left unchanged so features stay compatible
    # with models already trained on them.
    audio_mac = librosa.feature.mfcc(y=y, n_mfcc=25, sr=16000)
    n_frames = audio_mac.shape[1]
    if n_frames < max_pad_size:
        # Zero-pad along the time axis only.
        pad_size = max_pad_size - n_frames
        audio_mac = np.pad(audio_mac, ((0, 0), (0, pad_size)), mode='constant')
    else:
        audio_mac = audio_mac[:, :max_pad_size]
    return audio_mac


def predict_audio(file):
    """Classify one audio file, print the result and play the reply audio.

    The file is converted to MFCC features, flattened to a single row of
    25*11 values and fed to the model stored in ``audio.h5``. The index of
    the highest score selects both the printed phrase (``Dict_data``) and
    the reply audio file (``WAV_DICT``).

    Args:
        file: Path of the audio file to classify.
    """
    print (file)
    features = wav2mfcc(file).reshape(-1, 25*11)
    print (features.shape)
    # The model is loaded from disk on every call.
    classifier = load_model("audio.h5")
    scores = classifier.predict(features)
    print (scores)
    best_label = scores.argmax()
    print (best_label)
    print ("预测结果为:",Dict_data[best_label])
    playsound(WAV_DICT[best_label])




def main():
    """Run a prediction on one sample file from the dataset."""
    # Other inputs that can be substituted here: "wav/2/3_.wav", "2_2.wav".
    sample_path = "../datasets/2/003.wav"
    predict_audio(file=sample_path)


# Script entry point: there is no __name__ guard, so the prediction runs
# whenever this file is executed or imported.
main()

6. 根据预测结果进行应答

# Maps each predicted class label to the audio file played as the reply.
WAV_DICT = {
    0: "../info/eat.wav",
    1: "../info/waiting.wav",
    2: "../info/stop.wav",
}

说明：当预测为0的时候需要播放对应的文件，其他的同理；需要提前找好音频文件。

完整训练代码


import numpy as np
import librosa
from glob import glob
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import Sequential,load_model
from keras.utils.np_utils import to_categorical
import pandas as pd
import matplotlib.pyplot as plt

#https://blog.csdn.net/weixin_43773093/article/details/85488271

def wav2mfcc(path, max_pad_size=11):
    """Load an audio file and return a fixed-width MFCC feature matrix.

    Args:
        path: Path of the audio file to load.
        max_pad_size: Number of time frames in the result. Shorter inputs
            are zero-padded on the right; longer ones are truncated.

    Returns:
        A NumPy array of shape ``(25, max_pad_size)``.
    """
    # Down-mix to mono on load. With ``mono=False`` a stereo file comes back
    # as a 2-D (channels, samples) array, so the slice below would drop
    # channels instead of samples and the result would no longer have the
    # expected shape. Mono files load exactly as before.
    y, _ = librosa.load(path=path, sr=None, mono=True)
    # Keep every third sample to shrink the signal.
    y = y[::3]
    # Extract 25 MFCC coefficients per frame.
    # NOTE(review): sr is fixed at 16000 regardless of the file's native rate
    # and of the decimation above; left unchanged so features stay compatible
    # with models already trained on them.
    audio_mac = librosa.feature.mfcc(y=y, n_mfcc=25, sr=16000)
    n_frames = audio_mac.shape[1]
    if n_frames < max_pad_size:
        # Zero-pad along the time axis only.
        pad_size = max_pad_size - n_frames
        audio_mac = np.pad(audio_mac, ((0, 0), (0, pad_size)), mode='constant')
    else:
        audio_mac = audio_mac[:, :max_pad_size]
    return audio_mac
def tf_datasets(path):
    """Build the training arrays from a directory tree of WAV files.

    Files are discovered with the pattern ``<path>/*/*.wav``; the name of
    each file's parent directory is used as its numeric class label.

    Args:
        path: Root directory of the dataset.

    Returns:
        A tuple ``(x_train, y_train)``. ``x_train`` has shape
        ``(n_files, 25, 11)`` and holds the MFCC features; ``y_train`` has
        shape ``(n_files,)`` and holds the labels.
    """
    import os  # local import: only the label lookup below needs it

    wav_file = glob(path+"/*/*.wav")
    wav_file_len = len(wav_file)
    x_train = np.zeros((wav_file_len, 25, 11))
    y_train = np.zeros((wav_file_len))

    for index, file in enumerate(wav_file):
        file_feature = wav2mfcc(file)
        # Take the parent directory name through os.path so it works with
        # the platform's separator. Splitting on "\\" only worked for
        # Windows-style paths and raised IndexError for "/"-separated ones.
        file_label = os.path.basename(os.path.dirname(file))
        x_train[index] = file_feature
        # NumPy converts the numeric directory name on assignment.
        y_train[index] = file_label
        print (file_feature.shape,file_label)
    return x_train, y_train

def save_plot(history):
    """Plot the metrics recorded in a training history and show the figure.

    Despite its name, this function displays the figure with ``plt.show()``
    and does not write it to disk.

    Args:
        history: Object whose ``history`` attribute is passed to
            ``pd.DataFrame`` to build the plotted table.
    """
    metrics = pd.DataFrame(history.history)
    metrics.plot(figsize=(8, 5))
    plt.grid(True)
    # Fix the visible range: y in [0, 3], x in [0, 5].
    axes = plt.gca()
    axes.set_ylim(0, 3)
    axes.set_xlim(0, 5)
    plt.show()

def main():
    """Train the dense classifier on ``../datasets`` and save it to ``audio.h5``.

    Features and integer labels come from ``tf_datasets``. Each 25x11
    feature matrix is flattened to a 275-element vector before training.
    """
    x_train, y_train = tf_datasets(path="../datasets")
    # Flatten each feature matrix into one row for the Dense input layer.
    x_train = x_train.reshape(-1, 25*11)

    print (x_train.shape,y_train.shape)

    # Three ReLU hidden layers followed by a 3-way softmax output.
    # Dropout(0.1) may be inserted after each hidden layer as a regularised
    # variant of the same architecture.
    model = Sequential()
    model.add(Dense(128, activation='relu', input_shape=(25*11,)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    model.summary()

    # The output layer already applies softmax, so the loss receives
    # probabilities rather than logits: from_logits must be False. With
    # from_logits=True softmax would effectively be applied twice.
    # The sparse loss takes the integer labels directly, so no one-hot
    # encoding of y_train is needed.
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                  optimizer=tf.keras.optimizers.RMSprop(),
                  metrics=['accuracy'])
    history = model.fit(x_train, y_train, batch_size=12, epochs=2000)
    model.save("audio.h5")
    # Uncomment to plot the recorded training metrics:
    # save_plot(history)



# Script entry point: there is no __name__ guard, so training runs whenever
# this file is executed or imported.
main()