@TOC
RNN(GRU)自编码器
用 GRU 构建自编码器来复原语音:先把语音信号输入 RNN 编码成低维特征,再由解码器把这些特征还原成波形。
样本链接,提取码:vive
import keras
import numpy as np
import librosa
import librosa.display
import os
import os.path
import random
import time
import sys
import matplotlib.pyplot as plt
# Global matplotlib font settings; SimHei is presumably chosen so the Chinese
# plot titles used later in this file can be rendered — confirm it is installed.
font = {'family' : 'SimHei',
'size' : '15'}
plt.rc('font', **font) # step 1: apply the font family and size defined above
plt.rc('axes', unicode_minus=False) # step 2: fix the display of the minus sign on axis tick labels
Using TensorFlow backend.
# Collect the paths of the audio files
def get_wav_path(filePath):
    """Return the paths of all ``.wav`` files one directory level below *filePath*.

    The expected layout is ``filePath/<group>/<name>.wav``. Only the immediate
    sub-directories of *filePath* are scanned; there is no recursion.

    :param filePath: root directory that contains one folder per group of clips
    :return: list of ``"filePath/<group>/<name>.wav"`` strings, in the order
        reported by ``os.listdir``
    """
    wavPath = []
    for group in os.listdir(filePath):
        group_dir = filePath + "/" + group
        # A plain file next to the group folders would make os.listdir() raise.
        if not os.path.isdir(group_dir):
            continue
        for name in os.listdir(group_dir):
            # splitext() only looks at the last suffix, so names without a dot
            # no longer raise IndexError and "a.b.wav" is recognised as a wav.
            if os.path.splitext(name)[1] == ".wav":
                wavPath.append(group_dir + "/" + name)
    return wavPath
# Build fixed-length time series (3 seconds by default) from the audio files
def get_data(wav_files, second=3, sampleRate=16000, pad_short=False):
    """Load every file and return its first *second* seconds as one row.

    Clips shorter than ``second * sample rate`` samples are skipped by default.
    Pass ``pad_short=True`` to keep them, padded with trailing zeros to the
    full length.

    A one-line progress bar is rewritten on stdout after each file.

    :param wav_files: paths of the audio files to load
    :param second: clip length to keep, in seconds
    :param sampleRate: sampling rate passed to ``librosa.load``
    :param pad_short: zero-pad clips that are too short instead of dropping them
    :return: ``np.array`` built from the kept clips, one clip per row
    """
    # Collected clips
    x = []
    total = len(wav_files)
    begin_time = time.time()
    print('开始预处理文件')
    for index, wav in enumerate(wav_files):
        # Load the samples and the sampling rate that was actually used
        signal, srate = librosa.load(wav, sr=sampleRate)
        target_len = int(second * srate)
        if len(signal) >= second * srate:
            # Long enough: keep only the leading `second` seconds
            x.append(signal[:target_len])
        elif pad_short:
            # Too short: fill the tail with zeros up to the target length
            x.append(np.pad(signal, (0, target_len - len(signal)), mode='constant'))
        # Otherwise the clip is dropped; progress is still reported below so
        # the bar keeps moving and reaches 100 % even when clips are skipped.
        gaptime = time.time() - begin_time
        percent = float(index + 1) * 100 / total
        # +0.01 avoids a division by zero on very small percentages
        eta_time = gaptime * 100 / (percent + 0.01) - gaptime
        strprogress = "[" + "=" * int(percent // 2) + ">" + "-" * int(50 - percent // 2) + "]"
        str_log = ("%.2f %% %s %s/%s \t 已用时间:%ds 剩余时间:%d s" % (percent, strprogress, index + 1, total, gaptime, eta_time))
        sys.stdout.write('\r' + str_log)
    x = np.array(x)
    return x
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, Adam
from keras.models import Model
from keras.callbacks import ModelCheckpoint,TensorBoard
from keras.models import load_model
# Training data: collect the .wav paths under rnn_wavs/train and load each
# kept clip as a fixed-length sequence (get_data defaults: 3 s at 16 kHz).
filePath = "rnn_wavs/train"
wavPath = get_wav_path(filePath)
x=get_data(wavPath)
开始预处理文件
100.00 % [==================================================>] 50/50 已用时间:2s 剩余时间:0 s
print(x.shape)
print(x[0])
(50, 48000)
[-0.0090332 -0.01293945 -0.01196289 ... -0.03439331 -0.0390625
-0.03811646]
# Display the waveform plot
def show_wave(x, title='waveplot', sr=16000):
    """Plot the waveform of signal *x* on the current matplotlib axes.

    :param x: 1-D audio signal
    :param title: title placed above the plot
    :param sr: sampling rate of *x*; defaults to the 16 kHz used in this file
    """
    # Newer librosa releases provide waveshow() in place of waveplot();
    # prefer it when present and fall back to waveplot() on old installs.
    plot_wave = getattr(librosa.display, 'waveshow', None)
    if plot_wave is None:
        plot_wave = librosa.display.waveplot
    plot_wave(x, sr=sr)
    plt.title(title)
show_wave(x[0])
from keras import Model
from keras.layers import Dense,Input,CuDNNGRU
from keras.callbacks import TensorBoard
# Split each audio sequence into N vectors of seq_vector samples (100 for now);
# these vectors are what gets fed to the RNN. Think of every clip as a sentence
# made of words, where each word is a 100-dimensional vector.
seq_vector=100
# The -1 lets numpy infer the number of vectors per clip; the clip length must
# therefore be divisible by seq_vector.
x_train=x.reshape(x.shape[0],-1,seq_vector)
print(x_train.shape)
print(x_train[0])
(50, 480, 100)
[[-0.0090332 -0.01293945 -0.01196289 ... -0.01239014 -0.01223755
-0.01193237]
[-0.01205444 -0.01248169 -0.01263428 ... -0.01290894 -0.0128479
-0.01260376]
[-0.01263428 -0.01239014 -0.01248169 ... -0.01202393 -0.01217651
-0.01254272]
...
[-0.0100708 -0.00872803 -0.00650024 ... -0.01254272 -0.01089478
-0.01132202]
[-0.01348877 -0.00967407 -0.00897217 ... -0.01647949 -0.01019287
-0.00961304]
[-0.0055542 -0.00222778 0.00445557 ... -0.03439331 -0.0390625
-0.03811646]]
# The functional API needs an explicit input: sequences of any length (None)
# whose steps are seq_vector-dimensional. Despite its name, input_image holds audio.
input_image=Input((None,seq_vector))
# Encoder. CuDNNGRU lets the layer run on the GPU; a plain GRU can be used as well.
encoder=CuDNNGRU(units=512, return_sequences=True, name="gru1")(input_image)
encoder=CuDNNGRU(units=256, return_sequences=True,name="gru2")(encoder)
encoder=Dense(256,activation='tanh')(encoder)
# Bottleneck: every time step is compressed to 32 values.
encoder_out=Dense(32,activation='tanh')(encoder)
# Stand-alone encoder model, sharing the layers above, used to read out the latent features.
encoder_model = Model(inputs=input_image, outputs=encoder_out)
# Decoder: the encoder stack in reverse order.
decoder=Dense(256,activation='tanh')(encoder_out)
decoder=CuDNNGRU(units=256, return_sequences=True,name="de_gru1")(decoder)
decoder=CuDNNGRU(units=512, return_sequences=True, name="de_gru2")(decoder)
# tanh limits the reconstructed samples to the range (-1, 1).
decoder_out=Dense(seq_vector,activation='tanh')(decoder)
# Full model: input -> encoder -> decoder -> reconstruction.
autoencoder=Model(input_image,decoder_out)
autoencoder.summary()
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
input_4 (InputLayer) (None, None, 100) 0
_________________________________________________________________
gru1 (CuDNNGRU) (None, None, 512) 943104
_________________________________________________________________
gru2 (CuDNNGRU) (None, None, 256) 591360
_________________________________________________________________
dense_18 (Dense) (None, None, 256) 65792
_________________________________________________________________
dense_19 (Dense) (None, None, 32) 8224
_________________________________________________________________
dense_20 (Dense) (None, None, 256) 8448
_________________________________________________________________
de_gru1 (CuDNNGRU) (None, None, 256) 394752
_________________________________________________________________
de_gru2 (CuDNNGRU) (None, None, 512) 1182720
_________________________________________________________________
dense_21 (Dense) (None, None, 100) 51300
=================================================================
Total params: 3,245,700
Trainable params: 3,245,700
Non-trainable params: 0
_________________________________________________________________
# Train the autoencoder to reproduce its own input (x_train is both input and
# target) using the Adam optimizer and mean-squared-error loss.
autoencoder.compile(optimizer='adam', loss='mse')
# TensorBoard logs go to log_dir; ModelCheckpoint monitors the training loss
# and, with save_best_only=True, writes rnn.h5 only when that loss improves.
autoencoder.fit(x_train, x_train, epochs=50, batch_size=2,verbose=1,
callbacks=[TensorBoard(log_dir='log_RNN声音自编码器'),
ModelCheckpoint("rnn.h5",
monitor='loss', verbose=1, save_best_only=True, mode='min', period=1)])
Epoch 1/50
50/50 [==============================] - 5s 103ms/step - loss: 0.0060
Epoch 00001: loss improved from inf to 0.00604, saving model to rnn.h5
Epoch 2/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0053
Epoch 00002: loss improved from 0.00604 to 0.00532, saving model to rnn.h5
Epoch 3/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0039
Epoch 00003: loss improved from 0.00532 to 0.00385, saving model to rnn.h5
Epoch 4/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0028
Epoch 00004: loss improved from 0.00385 to 0.00277, saving model to rnn.h5
Epoch 5/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0021
Epoch 00005: loss improved from 0.00277 to 0.00211, saving model to rnn.h5
Epoch 6/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0015
Epoch 00006: loss improved from 0.00211 to 0.00148, saving model to rnn.h5
Epoch 7/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0012
Epoch 00007: loss improved from 0.00148 to 0.00117, saving model to rnn.h5
Epoch 8/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0010
Epoch 00008: loss improved from 0.00117 to 0.00102, saving model to rnn.h5
Epoch 9/50
50/50 [==============================] - 4s 85ms/step - loss: 9.1125e-04
Epoch 00009: loss improved from 0.00102 to 0.00091, saving model to rnn.h5
Epoch 10/50
50/50 [==============================] - 4s 85ms/step - loss: 8.4070e-04
Epoch 00010: loss improved from 0.00091 to 0.00084, saving model to rnn.h5
Epoch 11/50
50/50 [==============================] - 4s 85ms/step - loss: 7.7816e-04
Epoch 00011: loss improved from 0.00084 to 0.00078, saving model to rnn.h5
Epoch 12/50
50/50 [==============================] - 4s 85ms/step - loss: 6.8798e-04
Epoch 00012: loss improved from 0.00078 to 0.00069, saving model to rnn.h5
Epoch 13/50
50/50 [==============================] - 4s 85ms/step - loss: 6.0970e-04
Epoch 00013: loss improved from 0.00069 to 0.00061, saving model to rnn.h5
Epoch 14/50
50/50 [==============================] - 4s 85ms/step - loss: 5.6702e-04
Epoch 00014: loss improved from 0.00061 to 0.00057, saving model to rnn.h5
Epoch 15/50
50/50 [==============================] - 4s 85ms/step - loss: 5.2083e-04
Epoch 00015: loss improved from 0.00057 to 0.00052, saving model to rnn.h5
Epoch 16/50
50/50 [==============================] - 4s 85ms/step - loss: 4.7999e-04
Epoch 00016: loss improved from 0.00052 to 0.00048, saving model to rnn.h5
Epoch 17/50
50/50 [==============================] - 4s 85ms/step - loss: 4.6281e-04
Epoch 00017: loss improved from 0.00048 to 0.00046, saving model to rnn.h5
Epoch 18/50
50/50 [==============================] - 4s 85ms/step - loss: 4.4957e-04
Epoch 00018: loss improved from 0.00046 to 0.00045, saving model to rnn.h5
Epoch 19/50
50/50 [==============================] - 4s 85ms/step - loss: 4.5229e-04
Epoch 00019: loss did not improve
Epoch 20/50
50/50 [==============================] - 4s 85ms/step - loss: 4.4043e-04
Epoch 00020: loss improved from 0.00045 to 0.00044, saving model to rnn.h5
Epoch 21/50
50/50 [==============================] - 4s 85ms/step - loss: 4.2521e-04: 1s - loss: 4.07
Epoch 00021: loss improved from 0.00044 to 0.00043, saving model to rnn.h5
Epoch 22/50
50/50 [==============================] - 4s 85ms/step - loss: 4.1621e-04
Epoch 00022: loss improved from 0.00043 to 0.00042, saving model to rnn.h5
Epoch 23/50
50/50 [==============================] - 4s 85ms/step - loss: 4.0283e-04
Epoch 00023: loss improved from 0.00042 to 0.00040, saving model to rnn.h5
Epoch 24/50
50/50 [==============================] - 4s 85ms/step - loss: 3.9148e-04
Epoch 00024: loss improved from 0.00040 to 0.00039, saving model to rnn.h5
Epoch 25/50
50/50 [==============================] - 4s 85ms/step - loss: 3.9139e-04
Epoch 00025: loss improved from 0.00039 to 0.00039, saving model to rnn.h5
Epoch 26/50
50/50 [==============================] - 4s 85ms/step - loss: 3.7617e-04
Epoch 00026: loss improved from 0.00039 to 0.00038, saving model to rnn.h5
Epoch 27/50
50/50 [==============================] - 4s 85ms/step - loss: 3.7279e-04
Epoch 00027: loss improved from 0.00038 to 0.00037, saving model to rnn.h5
Epoch 28/50
50/50 [==============================] - 4s 85ms/step - loss: 3.5802e-04
Epoch 00028: loss improved from 0.00037 to 0.00036, saving model to rnn.h5
Epoch 29/50
50/50 [==============================] - 4s 85ms/step - loss: 3.4530e-04
Epoch 00029: loss improved from 0.00036 to 0.00035, saving model to rnn.h5
Epoch 30/50
50/50 [==============================] - 4s 85ms/step - loss: 3.5566e-04
Epoch 00030: loss did not improve
Epoch 31/50
50/50 [==============================] - 4s 85ms/step - loss: 3.4619e-04
Epoch 00031: loss did not improve
Epoch 32/50
50/50 [==============================] - 4s 85ms/step - loss: 3.3743e-04
Epoch 00032: loss improved from 0.00035 to 0.00034, saving model to rnn.h5
Epoch 33/50
50/50 [==============================] - 4s 85ms/step - loss: 3.2660e-04
Epoch 00033: loss improved from 0.00034 to 0.00033, saving model to rnn.h5
Epoch 34/50
50/50 [==============================] - 4s 85ms/step - loss: 3.1028e-04
Epoch 00034: loss improved from 0.00033 to 0.00031, saving model to rnn.h5
Epoch 35/50
50/50 [==============================] - 4s 85ms/step - loss: 2.9729e-04
Epoch 00035: loss improved from 0.00031 to 0.00030, saving model to rnn.h5
Epoch 36/50
50/50 [==============================] - 4s 85ms/step - loss: 2.9330e-04
Epoch 00036: loss improved from 0.00030 to 0.00029, saving model to rnn.h5
Epoch 37/50
50/50 [==============================] - 4s 85ms/step - loss: 2.9591e-04
Epoch 00037: loss did not improve
Epoch 38/50
50/50 [==============================] - 4s 85ms/step - loss: 2.8852e-04
Epoch 00038: loss improved from 0.00029 to 0.00029, saving model to rnn.h5
Epoch 39/50
50/50 [==============================] - 4s 85ms/step - loss: 2.7403e-04
Epoch 00039: loss improved from 0.00029 to 0.00027, saving model to rnn.h5
Epoch 40/50
50/50 [==============================] - 4s 85ms/step - loss: 2.7536e-04
Epoch 00040: loss did not improve
Epoch 41/50
50/50 [==============================] - 4s 85ms/step - loss: 2.7173e-04
Epoch 00041: loss improved from 0.00027 to 0.00027, saving model to rnn.h5
Epoch 42/50
50/50 [==============================] - 4s 85ms/step - loss: 2.6409e-04
Epoch 00042: loss improved from 0.00027 to 0.00026, saving model to rnn.h5
Epoch 43/50
50/50 [==============================] - 4s 85ms/step - loss: 2.6014e-04
Epoch 00043: loss improved from 0.00026 to 0.00026, saving model to rnn.h5
Epoch 44/50
50/50 [==============================] - 4s 85ms/step - loss: 2.5614e-04
Epoch 00044: loss improved from 0.00026 to 0.00026, saving model to rnn.h5
Epoch 45/50
50/50 [==============================] - 4s 85ms/step - loss: 2.5297e-04
Epoch 00045: loss improved from 0.00026 to 0.00025, saving model to rnn.h5
Epoch 46/50
50/50 [==============================] - 4s 85ms/step - loss: 2.5371e-04
Epoch 00046: loss did not improve
Epoch 47/50
50/50 [==============================] - 4s 85ms/step - loss: 2.5559e-04
Epoch 00047: loss did not improve
Epoch 48/50
50/50 [==============================] - 4s 85ms/step - loss: 2.6840e-04
Epoch 00048: loss did not improve
Epoch 49/50
50/50 [==============================] - 4s 85ms/step - loss: 2.7194e-04
Epoch 00049: loss did not improve
Epoch 50/50
50/50 [==============================] - 4s 85ms/step - loss: 2.6571e-04
Epoch 00050: loss did not improve
<keras.callbacks.History at 0x7380fc88>
# Test data: load the clips under rnn_wavs/test the same way as the training set.
filePath = "rnn_wavs/test"
testPath = get_wav_path(filePath)
# x_test_ori keeps the flat signals for later plotting.
x_test_ori=get_data(testPath)
# x_test is the same data cut into seq_vector-sized steps for the model.
x_test=x_test_ori.reshape(x_test_ori.shape[0],-1,seq_vector)
开始预处理文件
100.00 % [==================================================>] 4/4 已用时间:0s 剩余时间:0 s
# Run the test clips through the full autoencoder to get their reconstructions.
decoded_seq = autoencoder.predict(x_test)
print(decoded_seq.shape)
(4, 480, 100)
d_shape=decoded_seq[0].shape
print(d_shape)
(480, 100)
decoded_seq[0]
array([[-0.01014584, -0.01277211, -0.01327355, ..., -0.01386975,
-0.0145765 , -0.01456202],
[-0.01393538, -0.01367248, -0.01350442, ..., -0.01421544,
-0.01455196, -0.0140023 ],
[-0.01335981, -0.01219707, -0.01206582, ..., -0.0150555 ,
-0.0149645 , -0.01383976],
...,
[-0.09125455, -0.10162869, -0.12252352, ..., -0.01989237,
-0.01427485, 0.01067922],
[-0.00659992, -0.00485723, -0.0141265 , ..., 0.26765165,
0.37300578, 0.42874205],
[ 0.5345399 , 0.59415686, 0.611799 , ..., 0.20478293,
0.2866281 , 0.3082694 ]], dtype=float32)
# Pick one test clip, flatten its reconstructed step matrix back into a 1-D
# signal, then plot the reconstruction and the original clip.
index=0
seq=decoded_seq[index].flatten()
show_wave(seq,'rnn 复原')
show_wave(x_test_ori[index],'原始')
# Get the latent feature vectors from the encoder; they can be turned into feature images.
encoded_latent = encoder_model.predict(x_test)
print(encoded_latent.shape)
(4, 480, 32)
# Feature images
def show_latent_images(start=0, end=4):
    """Draw the latent features of clips ``start`` .. ``end - 1`` as images.

    Reads the module-level ``encoded_latent`` array. Each clip's features are
    reshaped to 160 rows, transposed and drawn with the 'binary' colour map.

    :param start: index of the first clip to draw
    :param end: index one past the last clip; also the number of grid columns
    """
    plt.figure(figsize=(20, 10))
    for clip_idx in range(start, end):
        # One cell per clip in a grid of 2 rows by `end` columns.
        plt.subplot(2, end, clip_idx + 1)
        latent_image = encoded_latent[clip_idx].reshape(160, -1).T
        plt.imshow(latent_image, cmap='binary')
    plt.show()
show_latent_images()
testPath[0]
'rnn_wavs/test/A2/A2_239.wav'
# Play the first test file (rnn_wavs/test/A2/A2_239.wav in the recorded run)
import IPython
out_wav=testPath[0]
signal, srate = librosa.load(out_wav, sr=16000)
IPython.display.Audio(out_wav)
# Keep the first 3 seconds
signal=signal[0:3*16000]
# Play the first 3 seconds
out_new='3s.wav'
# NOTE(review): librosa.output.write_wav is not available in recent librosa
# releases — confirm the installed version still provides it.
librosa.output.write_wav(out_new,signal,16000)
IPython.display.Audio(out_new)
# Play the reconstructed 3-second clip
out_new='new_3s.wav'
librosa.output.write_wav(out_new,seq,16000)
IPython.display.Audio(out_new)
好了,今天就到这里了,希望对学习理解有帮助,大神看见勿喷,仅为自己的学习理解,能力有限,请多包涵,侵删。