【人工智能项目】深度学习实现50种环境声音分类

本文介绍了一个使用深度学习实现的项目,针对50种环境声音进行分类,包括数据提取、特征转换、数据增强及模型构建。通过ESC-50数据集训练,采用Mel-Spectrogram并应用多种数据增强技术提升模型性能。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

【人工智能项目】深度学习实现50种环境声音分类

在这里插入图片描述

任务说明

本次准确识别5种大类,共计50种小类的音频。每个音频文件时长5秒,格式为wav。

数据集来自freesound.org公开项目,从中手动提取。训练集共计1600个,测试集400个。
在这里插入图片描述

导包

import os
import random
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn import model_selection
from sklearn import preprocessing
import IPython.display as ipd

前期准备

# Define dataset directory layout (expects ESC-50-master/ next to this script).
base_dir = "."
esc_dir = os.path.join(base_dir, "ESC-50-master")
# Metadata CSV mapping each wav filename to its integer class target.
meta_file = os.path.join(esc_dir, "esc50.csv")
audio_dir = os.path.join(esc_dir, "audio/audio/")
# The CSV has no header row, so column names are supplied explicitly.
meta_data = pd.read_csv(meta_file,header=None,names=["filename","target"])
# Bare expression: displays the dataframe when run in a notebook cell.
meta_data

在这里插入图片描述

# Report dataset dimensions: (number of clips, number of metadata columns).
data_size = meta_data.shape
print(data_size)

(2000, 2)

加载数据

# Load one wav clip from disk at a fixed sampling rate.
def load_wave_data(audio_dir, file_name):
    """Load ``file_name`` from ``audio_dir`` resampled to 44.1 kHz.

    Returns the waveform as a 1-D float array together with the
    sampling rate librosa actually used.
    """
    path = os.path.join(audio_dir, file_name)
    wave, sample_rate = librosa.load(path, sr=44100)
    return wave, sample_rate
# Convert a waveform into a fixed-size mel feature matrix.
def calculate_melsp(x, n_fft=1024, hop_length=128):
    """Return a 128-band mel representation of waveform ``x``.

    NOTE(review): the power spectrogram is converted to dB *before* the
    mel filterbank is applied; the conventional order is mel first, then
    power_to_db. Kept as-is so features match what the model was trained on.
    """
    spectrum = librosa.stft(x, n_fft=n_fft, hop_length=hop_length)
    power = np.abs(spectrum) ** 2
    log_power = librosa.power_to_db(power)
    return librosa.feature.melspectrogram(S=log_power, n_mels=128)
# display wave in plots
def show_wave(x):
    """Plot the raw waveform (amplitude vs. sample index) and show it."""
    plt.plot(x)
    plt.show()
# display wave in heatmap
def show_melsp(melsp, fs):
    """Render a mel-spectrogram matrix as a heatmap with a colorbar."""
    librosa.display.specshow(melsp, sr=fs)
    plt.colorbar()
    plt.show()
# Example: load the first clip from the metadata, compute its mel features,
# and visualise both the waveform and the spectrogram.
x, fs = load_wave_data(audio_dir, meta_data.loc[0,"filename"])
melsp = calculate_melsp(x)
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x.shape, melsp.shape, fs))
show_wave(x)
show_melsp(melsp, fs)

wave size:(220500,)
melsp size:(128, 1723)
samping rate:44100

在这里插入图片描述
在这里插入图片描述

# Inline audio player for the example clip (Jupyter widget).
ipd.Audio(x, rate=fs)

augment audio data

# Data augmentation: superimpose Gaussian white noise on the waveform.
def add_white_noise(x, rate=0.002):
    """Return ``x`` plus zero-mean Gaussian noise scaled by ``rate``."""
    noise = np.random.randn(len(x))
    return x + rate * noise
# Apply white-noise augmentation to the example clip and inspect the result.
x_wn = add_white_noise(x)
melsp = calculate_melsp(x_wn)
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x_wn.shape, melsp.shape, fs))
show_wave(x_wn)
show_melsp(melsp, fs)

wave size:(220500,)
melsp size:(128, 1723)
samping rate:44100

在这里插入图片描述
在这里插入图片描述

# Inline audio player for the noise-augmented clip.
ipd.Audio(x_wn, rate=fs)
# Data augmentation: circularly shift the waveform along the time axis.
def shift_sound(x, rate=2):
    """Roll ``x`` forward by ``len(x)//rate`` samples, wrapping around."""
    offset = int(len(x) // rate)
    return np.roll(x, offset)
# Apply time-shift augmentation to the example clip and inspect the result.
x_ss = shift_sound(x)
melsp = calculate_melsp(x_ss)
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x_ss.shape, melsp.shape, fs))
show_wave(x_ss)
show_melsp(melsp, fs)

melsp size:(128, 1723)
samping rate:44100

在这里插入图片描述
在这里插入图片描述

# Inline audio player for the shifted clip.
ipd.Audio(x_ss, rate=fs)
# data augmentation: time-stretch the waveform, then crop/pad to original length
def stretch_sound(x, rate=1.1):
    """Time-stretch ``x`` by ``rate`` and return a clip of the original length.

    rate > 1 speeds the audio up (shorter output, zero-padded at the end);
    rate < 1 slows it down (longer output, truncated).
    """
    input_length = len(x)
    # Fix: librosa >= 0.10 makes ``rate`` keyword-only, so the original
    # positional call raises TypeError there; passing it by name works on
    # both old and new librosa releases.
    x = librosa.effects.time_stretch(x, rate=rate)
    if len(x) > input_length:
        return x[:input_length]
    # Pad with trailing zeros up to the original length.
    return np.pad(x, (0, max(0, input_length - len(x))), "constant")
# Apply time-stretch augmentation to the example clip and inspect the result.
x_st = stretch_sound(x)
melsp = calculate_melsp(x_st)
print("wave size:{0}\nmelsp size:{1}\nsamping rate:{2}".format(x_st.shape, melsp.shape, fs))
show_wave(x_st)
show_melsp(melsp, fs)

wave size:(220500,)
melsp size:(128, 1723)
samping rate:44100

在这里插入图片描述
在这里插入图片描述

# Inline audio player for the stretched clip.
ipd.Audio(x_st, rate=fs)

Split training dataset and test dataset

# Build parallel lists of filenames (inputs) and integer targets (labels).
x = list(meta_data.loc[:,"filename"])
y = list(meta_data.loc[:, "target"])

# Stratified 80/20 split keeps the class distribution equal in both sets.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, test_size=0.2, stratify=y)
print("x train:{0}\ny train:{1}\nx test:{2}\ny test:{3}".format(len(x_train),
                                                                len(y_train), 
                                                                len(x_test), 
                                                                len(y_test)))

x train:1600
y train:1600
x test:400
y test:400

Transform wav data to mel-stft array

# Fixed mel-spectrogram dimensions: 128 mel bands x 1723 time frames
# (the frame count produced by a 5 s / 44.1 kHz clip with hop_length=128).
freq = 128
time = 1723

# Compute mel features for every clip (optionally augmented) and save to npz.
def save_np_data(filename, x, y, aug=None, rates=None):
    """Convert the clips named in ``x`` to mel features and store them
    alongside their targets ``y`` in ``filename`` (npz format).

    aug/rates: optional augmentation callable plus its per-clip rate values.
    """
    features = np.zeros((len(x), freq, time))
    targets = np.zeros(len(y))
    for i, (file_name, target) in enumerate(zip(x, y)):
        wave, _ = load_wave_data(audio_dir, file_name)
        if aug is not None:
            wave = aug(x=wave, rate=rates[i])
        features[i] = calculate_melsp(wave)
        targets[i] = target
    np.savez(filename, x=features, y=targets)
# Save the raw (un-augmented) training features once; skip if already cached.
if not os.path.exists("esc_melsp_all_train_raw.npz"):
    save_np_data("esc_melsp_all_train_raw.npz", x_train,  y_train)
# Save the test-set features (never augmented).
if not os.path.exists("esc_melsp_all_test.npz"):
    save_np_data("esc_melsp_all_test.npz", x_test,  y_test)
# Training copy with white noise; per-clip rate drawn from [0.0001, 0.0049].
if not os.path.exists("esc_melsp_train_white_noise.npz"):
    rates = np.random.randint(1,50,len(x_train))/10000
    save_np_data("esc_melsp_train_white_noise.npz", x_train,  y_train, aug=add_white_noise, rates=rates)
# Training copy with circular shift of len/rate samples, rate in {2,3,4,5}.
if not os.path.exists("esc_melsp_train_shift_sound.npz"):
    rates = np.random.choice(np.arange(2,6),len(y_train))
    save_np_data("esc_melsp_train_shift_sound.npz", x_train,  y_train, aug=shift_sound, rates=rates)
# Training copy with time stretch; rate in [0.80, 1.19].
if not os.path.exists("esc_melsp_train_stretch_sound.npz"):
    rates = np.random.choice(np.arange(80,120),len(y_train))/100
    save_np_data("esc_melsp_train_stretch_sound.npz", x_train,  y_train, aug=stretch_sound, rates=rates)
# Training copy combining white noise with either a shift or a stretch.
# NOTE(review): the noise rate here is /1000 (up to 0.049), 10x stronger than
# the /10000 used above -- confirm whether that difference is intentional.
if not os.path.exists("esc_melsp_train_combination.npz"):
    np_data = np.zeros(freq*time*len(x_train)).reshape(len(x_train), freq, time)
    np_targets = np.zeros(len(y_train))
    for i in range(len(y_train)):
        # NOTE(review): this rebinds the module-level ``x`` (the filename
        # list); it is not used again afterwards, but the shadowing is fragile.
        x, fs = load_wave_data(audio_dir, x_train[i])
        x = add_white_noise(x=x, rate=np.random.randint(1,50)/1000)
        # 50/50 choice between a circular shift and a time stretch.
        if np.random.choice((True,False)):
            x = shift_sound(x=x, rate=np.random.choice(np.arange(2,6)))
        else:
            x = stretch_sound(x=x, rate=np.random.choice(np.arange(80,120))/100)
        x = calculate_melsp(x)
        np_data[i] = x
        np_targets[i] = y_train[i]
    np.savez("esc_melsp_train_combination.npz", x=np_data, y=np_targets)

Audio classification with deep learning

Preparation for deep learning

import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Activation
from keras.layers import Conv2D, GlobalAveragePooling2D
from keras.layers import BatchNormalization, Add
from keras.callbacks import EarlyStopping, ModelCheckpoint,ReduceLROnPlateau
from keras.models import load_model
import warnings
warnings.filterwarnings("ignore")
# npz feature files to train on (append the augmented sets here to enlarge
# the effective training data).
train_files = ["esc_melsp_all_train_raw.npz"]
test_file = "esc_melsp_all_test.npz"
# Sizes come from the filename lists produced by the train/test split above.
train_num = len(x_train)
test_num = len(x_test)
print(train_num)
print(test_num)

1600
400

# Pre-allocate arrays large enough to hold every training npz file end-to-end.
# NOTE: this rebinds x_train/y_train from filename/target lists to arrays.
x_train = np.zeros(freq*time*train_num*len(train_files)).reshape(train_num*len(train_files), freq, time)
y_train = np.zeros(train_num*len(train_files))
# Load each training npz into its contiguous slice of the big arrays.
for i in range(len(train_files)):
    data = np.load(train_files[i])
    x_train[i*train_num:(i+1)*train_num] = data["x"]
    y_train[i*train_num:(i+1)*train_num] = data["y"]
# Load the held-out test features and targets.
test_data = np.load(test_file)
x_test = test_data["x"]
y_test = test_data["y"]
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1600, 128, 1723)
(1600,)
(400, 128, 1723)
(400,)

# One-hot encode the integer targets into 50-dimensional vectors.
classes = 50
y_train = keras.utils.to_categorical(y_train, classes)
y_test = keras.utils.to_categorical(y_test, classes)
# Add a trailing channel axis so Conv2D sees (freq, time, 1) "images".
x_train = x_train.reshape(train_num*1, freq, time, 1)
x_test = x_test.reshape(test_num, freq, time, 1)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(1600, 128, 1723, 1)
(1600, 50)
(400, 128, 1723, 1)
(400, 50)

# Final sanity check of all array shapes before building the model.
print("x train:{0}\ny train:{1}\nx test:{2}\ny test:{3}".format(x_train.shape,
                                                                y_train.shape, 
                                                                x_test.shape, 
                                                                y_test.shape))
x train:(1600, 128, 1723, 1)
y train:(1600, 50)
x test:(400, 128, 1723, 1)
y test:(400, 50)

Define convolutional neural network

def cba(inputs, filters, kernel_size, strides):
    """Conv2D -> BatchNorm -> ReLU building block (same-padded)."""
    conv = Conv2D(filters, kernel_size=kernel_size, strides=strides, padding='same')(inputs)
    normed = BatchNormalization()(conv)
    return Activation("relu")(normed)
# define CNN
# Four parallel branches share the input; each uses separable-style
# (1,k)/(k,1) kernel pairs of a different size (8/16/32/64), giving the
# network several time/frequency receptive fields at once.
inputs = Input(shape=(x_train.shape[1:]))

x_1 = cba(inputs, filters=32, kernel_size=(1,8), strides=(1,2))
x_1 = cba(x_1, filters=32, kernel_size=(8,1), strides=(2,1))
x_1 = cba(x_1, filters=64, kernel_size=(1,8), strides=(1,2))
x_1 = cba(x_1, filters=64, kernel_size=(8,1), strides=(2,1))

x_2 = cba(inputs, filters=32, kernel_size=(1,16), strides=(1,2))
x_2 = cba(x_2, filters=32, kernel_size=(16,1), strides=(2,1))
x_2 = cba(x_2, filters=64, kernel_size=(1,16), strides=(1,2))
x_2 = cba(x_2, filters=64, kernel_size=(16,1), strides=(2,1))

x_3 = cba(inputs, filters=32, kernel_size=(1,32), strides=(1,2))
x_3 = cba(x_3, filters=32, kernel_size=(32,1), strides=(2,1))
x_3 = cba(x_3, filters=64, kernel_size=(1,32), strides=(1,2))
x_3 = cba(x_3, filters=64, kernel_size=(32,1), strides=(2,1))

x_4 = cba(inputs, filters=32, kernel_size=(1,64), strides=(1,2))
x_4 = cba(x_4, filters=32, kernel_size=(64,1), strides=(2,1))
x_4 = cba(x_4, filters=64, kernel_size=(1,64), strides=(1,2))
x_4 = cba(x_4, filters=64, kernel_size=(64,1), strides=(2,1))

# Merge the four branches element-wise (all strides match, so shapes agree).
x = Add()([x_1, x_2, x_3, x_4])

x = cba(x, filters=128, kernel_size=(1,16), strides=(1,2))
x = cba(x, filters=128, kernel_size=(16,1), strides=(2,1))

# Global pooling + softmax head over the 50 classes.
x = GlobalAveragePooling2D()(x)
x = Dense(classes)(x)
x = Activation("softmax")(x)

model = Model(inputs, x)
model.summary()

Optimization and callbacks

# Adam with AMSGrad. Fix: use the class name -- the lowercase ``adam`` alias
# was removed in Keras 2.3, while ``Adam`` exists across versions.
# NOTE(review): newer Keras renames ``lr`` -> ``learning_rate`` and drops
# ``decay``; adjust these keywords if running on Keras 3 / recent tf.keras.
opt = keras.optimizers.Adam(lr=0.0001, decay=1e-6, amsgrad=True)

# Compile with categorical cross-entropy for the 50-way softmax output.
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])
# Directory for model checkpoints.
model_dir = "./models"
if not os.path.exists(model_dir):
    os.mkdir(model_dir)

# Stop after 10 epochs without val_loss improvement, restoring best weights.
es_cb = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True,verbose=1, mode='min')
# NOTE(review): ``{val_acc}`` in the filename relies on the old metric name;
# newer Keras logs ``val_accuracy`` instead -- confirm against your version.
chkpt = os.path.join(model_dir, 'esc50_.{epoch:02d}_{val_loss:.4f}_{val_acc:.4f}.hdf5')
cp_cb = ModelCheckpoint(filepath = chkpt, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
# Multiply the learning rate by 0.6 after 3 stagnant epochs.
reduce_lr = ReduceLROnPlateau(monitor="val_loss",factor=0.6,patience=3,verbose=1,mode="min")

Train CNN model with between class dataset

# Between-class (mixup) batch generator: every emitted sample is a convex
# combination of two training examples, with weights drawn from a Beta
# distribution.
class MixupGenerator():
    def __init__(self, x_train, y_train, batch_size=16, alpha=0.2, shuffle=True):
        """Store the dataset and mixup hyper-parameters.

        alpha: concentration of the Beta(alpha, alpha) mixing distribution.
        """
        self.x_train = x_train
        self.y_train = y_train
        self.batch_size = batch_size
        self.alpha = alpha
        self.shuffle = shuffle
        self.sample_num = len(x_train)

    def __call__(self):
        """Yield mixed (x, y) batches forever (Keras fit_generator protocol)."""
        pair_span = self.batch_size * 2
        while True:
            order = self.__get_exploration_order()
            # Each batch consumes 2*batch_size indices: two source clips
            # per emitted sample.
            full_batches = len(order) // pair_span
            for start in range(0, full_batches * pair_span, pair_span):
                yield self.__data_generation(order[start:start + pair_span])

    def __get_exploration_order(self):
        """Return sample indices, shuffled when enabled."""
        order = np.arange(self.sample_num)
        if self.shuffle:
            np.random.shuffle(order)
        return order

    def __data_generation(self, batch_ids):
        """Mix the first half of ``batch_ids`` with the second half."""
        bs = self.batch_size
        first, second = batch_ids[:bs], batch_ids[bs:]
        lam = np.random.beta(self.alpha, self.alpha, bs)
        # Broadcast the per-sample weight over image dims for x, classes for y.
        x_weight = lam.reshape(bs, 1, 1, 1)
        y_weight = lam.reshape(bs, 1)
        mixed_x = self.x_train[first] * x_weight + self.x_train[second] * (1 - x_weight)
        mixed_y = self.y_train[first] * y_weight + self.y_train[second] * (1 - y_weight)
        return mixed_x, mixed_y
# train model
batch_size = 16
epochs = 1000

# The mixup generator feeds training batches; EarlyStopping keeps the
# effective epoch count far below the nominal 1000.
# NOTE(review): fit_generator is deprecated in modern Keras (use fit()).
training_generator = MixupGenerator(x_train, y_train)()
model.fit_generator(generator=training_generator,
                    steps_per_epoch=x_train.shape[0] // batch_size,
                    validation_data=(x_test, y_test),
                    epochs=epochs, 
                    verbose=1,
                    shuffle=True,
                    callbacks=[es_cb, cp_cb,reduce_lr])

在这里插入图片描述

# Evaluate once on the held-out test set; evaluate() returns [loss, accuracy].
# Fix: the original ran the identical evaluate() call twice back to back,
# doubling the (slow) full-test-set pass for no benefit.
evaluation = model.evaluate(x_test, y_test)
print(evaluation)

320/320 [==============================] - 48s 151ms/step
[1.4442142248153687, 0.675]

prediction

# Mel feature dimensions must match those used for training.
freq = 128
time = 1723
predict_audio_dir = os.path.join(esc_dir, "audio/test/")
predict_file = os.path.join(esc_dir,"test.csv")
# Fix: read the CSV from the path built just above -- the original passed the
# bare string "test.csv", silently depending on the current working directory.
predict_data = pd.read_csv(predict_file,header=None,names=["filename"])
# Bare expression: displays the dataframe in a notebook cell.
predict_data

在这里插入图片描述

# Number of clips to predict (notebook display; expected 400).
predict_data.shape[0]

400

# Filenames of the clips to run inference on.
predict = list(predict_data.loc[:,"filename"])

# Redefinition of save_np_data for unlabeled data (features only, no targets).
def save_np_data(filename, x, aug=None, rates=None):
    """Convert the clips named in ``x`` to mel features and save them to
    ``filename`` (npz format, key "x").

    aug/rates: optional augmentation callable plus per-clip rate values.
    """
    np_data = np.zeros((len(x), freq, time))
    # Fix: iterate over the argument ``x`` rather than the module-level
    # ``predict`` list the original closed over -- it only worked by accident
    # because the function happened to be called with ``predict`` itself.
    for i in range(len(x)):
        _x, fs = load_wave_data(predict_audio_dir, x[i])
        if aug is not None:
            _x = aug(x=_x, rate=rates[i])
        _x = calculate_melsp(_x)
        np_data[i] = _x
    np.savez(filename, x=np_data)
# Compute and cache the prediction features if not already present.
if not os.path.exists("esc_melsp_predict_raw.npz"):
    save_np_data("esc_melsp_predict_raw.npz", predict)
predict_file = "esc_melsp_predict_raw.npz"
# Load the cached features.
# NOTE: rebinds ``predict`` from a filename list to an NpzFile object.
predict = np.load(predict_file)
x_predict = predict["x"]
# Add the channel axis expected by the Conv2D input layer.
x_predict = x_predict.reshape(predict_data.shape[0],freq,time,1)
# Ensemble three saved checkpoints by summing their softmax outputs.
pred = None
for model_path in ["models/esc50_.14_1.3803_0.7312.hdf5","models/esc50_.18_1.2065_0.7000.hdf5","models/esc50_.20_1.1664_0.7594.hdf5"]:
    model = load_model(model_path)
    if pred is None:
        pred = model.predict(x_predict)
    else:
        pred += model.predict(x_predict)
print(pred.shape)
# Final class per clip = argmax over the summed probabilities.
res = np.argmax(pred,axis=1)
print(res[:5])
# NOTE(review): pandas is already imported at the top of the file; this
# re-import is redundant but harmless.
import pandas as pd

# Write the submission file: one row per clip, no header or index column.
df = pd.DataFrame({"img_path":predict_data["filename"], "tags":res})
df.to_csv("submit.csv",index=None,header=None)

小结

下次见!!!
在这里插入图片描述

评论 6
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

mind_programmonkey

你的鼓励是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值