问题描述
speech recognize是kaggle上1,2年前的一个赛题,主要描述的是如果在各种环境中识别出简单的英文单词发音,比如bed,cat,right之类的语音。提供的训练集是.wav格式的语音,解压后数据集大小在2G左右。
训练集分析
语音识别相对图片识别来说,是一个区别较大的领域。所以首先要了解训练集的基本特征,理清不同语音之间的共性。
振幅和频率
def log_specgram(audio, sample_rate, window_size=20,
step_size=10, eps=1e-10):
nperseg = int(round(window_size * sample_rate / 1e3))
noverlap = int(round(step_size * sample_rate / 1e3))
freqs, times, spec = signal.spectrogram(audio,
fs=sample_rate,
window='hann',
nperseg=nperseg,
noverlap=noverlap,
detrend=False)
return freqs, times, np.log(spec.T.astype(np.float32) + eps)
def plt_specgram(freqs, times, spectrogram):
fig = plt.figure(figsize=(14, 8))
ax1 = fig.add_subplot(211)
ax1.set_title('Raw wave of ' + filename)
ax1.set_ylabel('Amplitude')
ax1.plot(np.linspace(0, sample_rate / len(samples), sample_rate), samples)
ax2 = fig.add_subplot(212)
ax2.imshow(spectrogram.T, aspect='auto', origin='lower',
extent=[times.min(), times.max(), freqs.min(), freqs.max()])
ax2.set_yticks(freqs[::16])
ax2.set_xticks(times[::16])
ax2.set_title('Spectrogram of ' + filename)
ax2.set_ylabel('Freqs in Hz')
ax2.set_xlabel('Seconds')
plt.savefig('./output/yes_0a7c2a8d_nohash_0.png')
plt.show()
train_audio_path = './input/train/audio/'
filename = '/yes/0a7c2a8d_nohash_0.wav'
sample_rate, samples = wavfile.read(str(train_audio_path) + filename)
# print(sample_rate, samples)
# 频谱图
freqs, times, spectrogram = log_specgram(samples, sample_rate)
plt_specgram(freqs, times, spectrogram)
声音强度
def plt_spectrogram():
plt.figure(figsize=(12, 4))
librosa.display.specshow(log_S, sr=sample_rate, x_axis='time', y_axis='mel')
plt.title('Mel power spectrogram ')
plt.colorbar(format='%+02.0f dB')
plt.tight_layout()
plt.savefig('./output/spectrogram.png')
plt.show()
# 光谱图
sig = samples
sig = sig / max(abs(sig))
S = librosa.feature.melspectrogram(y=sig, sr=sample_rate, n_mels=128)
log_S = librosa.power_to_db(S, ref=np.max)
plt_spectrogram()