This project is based on Speech-Emotion-Classification-with-PyTorch. Using a Parallel 2D CNN - Transformer Encoder model on the RAVDESS dataset, it achieves 96.78% accuracy.
1. Load the data
The RAVDESS dataset contains 1440 audio files (16-bit, 48 kHz, .wav), recorded by 24 North American actors (12 male, 12 female), each speaking two lexically matched statements (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
The class distribution is close to balanced.
Each file has a unique filename made up of a 7-part numerical identifier (e.g., 03-01-06-01-02-01-12.wav). Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised). Emotional intensity takes two values: 01 = normal, 02 = strong (except for neutral, which only occurs at normal intensity). Actor (01 to 24): odd-numbered actors are male, even-numbered actors are female.
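As a concrete illustration, the seven fields of the example filename can be decoded by hand (in RAVDESS the field order is modality, vocal channel, emotion, intensity, statement, repetition, actor; the snippet below is only for demonstration):

fields = ['modality', 'vocal_channel', 'emotion', 'intensity', 'statement', 'repetition', 'actor']
identifiers = '03-01-06-01-02-01-12.wav'.split('.')[0].split('-')
print(dict(zip(fields, identifiers)))
# {'modality': '03', 'vocal_channel': '01', 'emotion': '06', 'intensity': '01',
#  'statement': '02', 'repetition': '01', 'actor': '12'}
# -> audio-only, speech, fearful, normal intensity, "Dogs are sitting by the door",
#    1st repetition, actor 12 (even number, so female)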
import numpy as np
import pandas as pd
import os
import librosa
import librosa.display
import IPython
from IPython.display import Audio
from IPython.display import Image
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
EMOTIONS = {1:'neutral', 2:'calm', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 0:'surprise'} # surprise is remapped from 8 to 0 so the labels run contiguously from 0 to 7
DATA_PATH = '../archive/audio_speech_actors_01-24/'
SAMPLE_RATE = 48000
rows = []
# os.walk() yields a (dirpath, dirnames, filenames) tuple for every directory under DATA_PATH
for dirpath, _, filenames in os.walk(DATA_PATH):
    for filename in filenames:
        file_path = os.path.join(dirpath, filename)
        # filename pattern: 03-01-06-01-02-01-12.wav -> ['03', '01', '06', '01', '02', '01', '12']
        identifiers = filename.split('.')[0].split('-')
        emotion = int(identifiers[2])
        if emotion == 8:  # remap surprise from 8 to 0
            emotion = 0
        if int(identifiers[3]) == 1:
            emotion_intensity = 'normal'
        else:
            emotion_intensity = 'strong'
        if int(identifiers[6]) % 2 == 0:  # even-numbered actors are female
            gender = 'female'
        else:
            gender = 'male'
        # DataFrame.append was removed in pandas 2.0, so collect the rows in a
        # list and build the DataFrame once after the walk
        rows.append({"Emotion": emotion,
                     "Emotion intensity": emotion_intensity,
                     "Gender": gender,
                     "Path": file_path})
data = pd.DataFrame(rows, columns=['Emotion', 'Emotion intensity', 'Gender', 'Path'])
print("number of files is {}".format(len(data)))
data.head()
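A one-line check (illustrative) backs up the balance claim: each emotion has 192 files, except neutral with 96, since neutral has no strong-intensity recordings.

print(data['Emotion'].map(EMOTIONS).value_counts())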
Load the signals
Signals are loaded at a sample rate of 48 kHz, skipping the first 0.5 s of each file and keeping at most 3 s of audio. Signals shorter than 3 s are zero-padded, so every sample becomes a fixed-length vector of 3 × 48000 = 144000 values.
mel_spectrograms = []
signals = []
for i, file_path in enumerate(data.Path):
    # skip the first 0.5 s and keep at most 3 s of audio
    audio, sample_rate = librosa.load(file_path, duration=3, offset=0.5, sr=SAMPLE_RATE)
    # zero-pad every clip to exactly 3 s; stacking raw clips of different lengths
    # would raise "ValueError: all input arrays must have the same shape"
    signal = np.zeros(int(SAMPLE_RATE * 3))
    signal[:len(audio)] = audio
    signals.append(signal)
    print("\r Processed {}/{} files".format(i + 1, len(data)), end='')
signals = np.stack(signals, axis=0)
Split the data
The dataset is split into train, validation, and test sets with an (80, 10, 10)% ratio. The split is done per emotion class, so each set preserves the overall class distribution.
X = signals
train_ind, test_ind, val_ind = [], [], []
X_train, X_val, X_test = [], [], []
Y_train, Y_val, Y_test = [], [], []
for emotion in range(len(EMOTIONS)):
    # collect the indices of all samples with this emotion label
    emotion_ind = list(data.loc[data.Emotion == emotion, 'Emotion'].index)
    # shuffle the indices before splitting; np.random.permutation returns a new shuffled array
    emotion_ind = np.random.permutation(emotion_ind)
    m = len(emotion_ind)
    ind_train = emotion_ind[:int(0.8 * m)]
    ind_val = emotion_ind[int(0.8 * m):int(0.9 * m)]
    ind_test = emotion_ind[int(0.9 * m):]
    X_train.append(X[ind_train, :])
    Y_train.append(np.array([emotion] * len(ind_train), dtype=np.int32))
    X_val.append(X[ind_val, :])
    Y_val.append(np.array([emotion] * len(ind_val), dtype=np.int32))
    X_test.append(X[ind_test, :])
    Y_test.append(np.array([emotion] * len(ind_test), dtype=np.int32))
    train_ind.append(ind_train)
    test_ind.append(ind_test)
    val_ind.append(ind_val)
X_train = np.concatenate(X_train, 0)
X_val = np.concatenate(X_val, 0)
X_test = np.concatenate(X_test, 0)
Y_train = np.concatenate(Y_train, 0)
Y_val = np.concatenate(Y_val, 0)
Y_test = np.concatenate(Y_test, 0)
train_ind = np.concatenate(train_ind, 0)
val_ind = np.concatenate(val_ind, 0)
test_ind = np.concatenate(test_ind, 0)
print(f'X_train:{X_train.shape}, Y_train:{Y_train.shape}')
print(f'X_val:{X_val.shape}, Y_val:{Y_val.shape}')
print(f'X_test:{X_test.shape}, Y_test:{Y_test.shape}')
# check if all are unique
unique, count = np.unique(np.concatenate([train_ind,test_ind,val_ind],0), return_counts=True)
print("Number of unique indexes is {}, out of {}".format(sum(count==1), X.shape[0]))
del X
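An optional follow-up check (illustrative) that the split really is stratified: the per-class counts across the three sets should stand in roughly an 8:1:1 ratio.

for name, y in [('train', Y_train), ('val', Y_val), ('test', Y_test)]:
    labels, counts = np.unique(y, return_counts=True)
    print(name, dict(zip(labels.tolist(), counts.tolist())))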
Augment signals by adding AWGN
Data augmentation is performed by adding additive white Gaussian noise (AWGN), with a random SNR in the range [15, 30] dB, to each training signal; every original signal yields two noisy copies. This substantially improved accuracy and curbed overfitting.
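The scaling factor used below follows directly from the definition of SNR: scaling the noise by K multiplies its power by K^2, so requiring 10 * log10(P_signal / (K^2 * P_noise)) = SNR_dB and solving for K gives K = sqrt((P_signal / P_noise) * 10^(-SNR_dB / 10)), which is exactly the expression computed in addAWGN.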
def addAWGN(signal, num_bits=16, augmented_num=2, snr_low=15, snr_high=30):
    signal_len = len(signal)
    # Generate augmented_num independent white Gaussian noise realizations
    noise = np.random.normal(size=(augmented_num, signal_len))
    # Normalize signal and noise by the full scale of a num_bits PCM signal
    norm_constant = 2.0 ** (num_bits - 1)
    signal_norm = signal / norm_constant
    noise_norm = noise / norm_constant
    # Compute signal and noise power
    s_power = np.sum(signal_norm ** 2) / signal_len
    n_power = np.sum(noise_norm ** 2, axis=1) / signal_len
    # Draw a random integer SNR in [snr_low, snr_high) dB
    target_snr = np.random.randint(snr_low, snr_high)
    # Per-realization scaling factor K so that the scaled noise hits the target SNR
    K = np.sqrt((s_power / n_power) * 10 ** (-target_snr / 10))
    K = np.ones((signal_len, augmented_num)) * K
    # Return augmented_num noisy copies of the signal
    return signal + K.T * noise
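A quick sanity check (an illustrative snippet using a synthetic 440 Hz tone) verifies that the returned copies actually sit at the drawn SNR:

test_signal = np.sin(2 * np.pi * 440 * np.arange(SAMPLE_RATE) / SAMPLE_RATE)  # 1 s sine tone
noisy = addAWGN(test_signal)              # shape (2, SAMPLE_RATE)
noise_only = noisy - test_signal          # recover the scaled noise
snr_db = 10 * np.log10(np.sum(test_signal ** 2) / np.sum(noise_only[0] ** 2))
print(f'Measured SNR: {snr_db:.1f} dB')   # an integer in [15, 30), matching target_snr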
aug_signals = []
aug_labels = []
for i in range(X_train.shape[0]):
    signal = X_train[i, :]
    augmented_signals = addAWGN(signal)
    for j in range(augmented_signals.shape[0]):
        # each augmented copy inherits the label of its source training sample;
        # indexing data with i here would be wrong, since i indexes X_train
        # (shuffled and split per class), not the original file table
        aug_labels.append(Y_train[i])
        aug_signals.append(augmented_signals[j, :])
    print("\r Processed {}/{} files".format(i + 1, X_train.shape[0]), end='')
aug_signals = np.stack(aug_signals, axis=0)
X_train = np.concatenate([X_train, aug_signals], axis=0)
aug_labels = np.stack(aug_labels, axis=0)
Y_train = np.concatenate([Y_train, aug_labels])
print('')
print(f'X_train:{X_train.shape}, Y_train:{Y_train.shape}')
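Since addAWGN produces two noisy copies per sample (augmented_num=2), the training set ends up exactly three times its previous size, while the validation and test sets remain noise-free.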
Calculate mel spectrograms
A mel spectrogram is computed for each signal and used as the model input (for the 1st and 2nd models the spectrogram is split into 7 chunks).
Example of a mel spectrogram:
def getMELspectrogram(audio, sample_rate):
    mel_spec = librosa.feature.melspectrogram(y=audio,
                                              sr=sample_rate,
                                              n_fft=1024,
                                              win_length=512,
                                              window='hamming',
                                              hop_length=256,  # hop_length: number of samples between successive analysis windows
                                              n_mels=128,
                                              fmax=sample_rate / 2
                                              )
    # convert the power spectrogram to dB, referenced to its maximum value
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
    return mel_spec_db
# test the function on a single file
audio, sample_rate = librosa.load(data.loc[0, 'Path'], duration=3, offset=0.5, sr=SAMPLE_RATE)
signal = np.zeros(int(SAMPLE_RATE * 3))
signal[:len(audio)] = audio
mel_spectrogram = getMELspectrogram(signal, SAMPLE_RATE)
librosa.display.specshow(mel_spectrogram, y_axis='mel', x_axis='time')
print('MEL spectrogram shape: ', mel_spectrogram.shape)
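The time dimension follows from the hop length: with librosa's default center=True padding, a 3 s clip at 48 kHz gives 1 + floor(144000 / 256) = 563 frames, so each network input is a 128 × 563 single-channel "image".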
Create the model
import torch
import torch.nn as nn

class ParallelModel(nn.Module):
    def __init__(self, num_emotions):
        super().__init__()
        # conv branch: four Conv2d-BatchNorm-ReLU-MaxPool-Dropout stages
        self.conv2Dblock = nn.Sequential(
            # 1. conv block
            nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Dropout(p=0.3),
            # 2. conv block
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
            # 3. conv block
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3),
            # 4. conv block
            nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=4, stride=4),
            nn.Dropout(p=0.3)
        )
        # Transformer branch: downsample the spectrogram, then encode it along time
        self.transf_maxpool = nn.MaxPool2d(kernel_size=[2, 4], stride=[2, 4])
        transf_layer = nn.TransformerEncoderLayer(d_model=64, nhead=4, dim_feedforward=512, dropout=0.4, activation='relu')
        self.transf_encoder = nn.TransformerEncoder(transf_layer, num_layers=4)
        # Linear softmax layer: 256 flattened conv features + 64 transformer features = 320
        self.out_linear = nn.Linear(320, num_emotions)
        self.dropout_linear = nn.Dropout(p=0)
        self.out_softmax = nn.Softmax(dim=1)

    def forward(self, x):
        # conv embedding
        conv_embedding = self.conv2Dblock(x)  # (batch, channel, freq, time)
        conv_embedding = torch.flatten(conv_embedding, start_dim=1)  # do not flatten batch dimension
        # transformer embedding
        x_reduced = self.transf_maxpool(x)       # halve the mel axis, quarter the time axis
        x_reduced = torch.squeeze(x_reduced, 1)  # drop the channel dimension
        x_reduced = x_reduced.permute(2, 0, 1)   # encoder expects shape (time, batch, embedding)
        transf_out = self.transf_encoder(x_reduced)
        transf_embedding = torch.mean(transf_out, dim=0)  # average over time
        # concatenate the two parallel embeddings
        complete_embedding = torch.cat([conv_embedding, transf_embedding], dim=1)
        # final linear layer (the dropout here has p=0, so it is a no-op)
        output_logits = self.out_linear(complete_embedding)
        output_logits = self.dropout_linear(output_logits)
        output_softmax = self.out_softmax(output_logits)
        return output_logits, output_softmax
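A minimal smoke test (illustrative, assuming the 128 × 563 input shape derived above) confirms that the flattened conv features (64 channels × 1 × 4 = 256) and the transformer embedding (64) concatenate to the 320 features the output layer expects:

model = ParallelModel(num_emotions=len(EMOTIONS))
dummy = torch.randn(4, 1, 128, 563)  # (batch, channel, n_mels, time frames)
logits, probs = model(dummy)
print(logits.shape, probs.shape)     # both torch.Size([4, 8])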