keras实战-入门之RNN(GRU)自编码器

最新推荐文章于 2025-03-28 16:34:47 发布

王伟王胖胖

最新推荐文章于 2025-03-28 16:34:47 发布

阅读量5k

点赞数 3

分类专栏： keras实战 深度学习 文章标签： keras 深度学习 自编码器 RNN RNN自编码器

本文链接：https://blog.csdn.net/wangwei19871103/article/details/95108720

版权

深度学习同时被 2 个专栏收录

75 篇文章

订阅专栏

keras实战

12 篇文章

订阅专栏

@TOC

RNN(GRU)自编码器

用 GRU 搭建自编码器来复原语音：先将语音信号输入 RNN 进行编码，得到压缩后的特征，再通过解码器把特征还原成语音信号。
样本链接，提取码：vive

import keras
import numpy as np
import librosa
import librosa.display
import os
import os.path
import random
import time
import sys
import matplotlib.pyplot as plt
# Matplotlib font settings: SimHei family, size 15
# (presumably so the Chinese plot titles used later in this file render — TODO confirm).
font = {'family' : 'SimHei',
        'size'   : '15'}
plt.rc('font', **font)               # Step 1: apply the font properties defined above
plt.rc('axes', unicode_minus=False)  # Step 2: fix how the minus sign is drawn on axes with negative values

Using TensorFlow backend.

# Collect the paths of the audio files.
def get_wav_path(filePath):
    """Return the paths of all ``.wav`` files one directory level below *filePath*.

    Every subdirectory of *filePath* is listed (non-recursively) and each entry
    whose extension is exactly ``.wav`` is collected. Entries of *filePath*
    that are not directories are ignored.

    :param filePath: root directory containing one subdirectory per group of clips
    :return: list of ``filePath/<subdir>/<file>.wav`` strings, in ``os.listdir`` order
    """
    wavPath = []
    for sub_dir in os.listdir(filePath):
        sub_dir_path = filePath + "/" + sub_dir
        # A stray file next to the subdirectories would make os.listdir raise.
        if not os.path.isdir(sub_dir_path):
            continue
        for file_name in os.listdir(sub_dir_path):
            # splitext looks at the LAST dot only, so "a.b.wav" is accepted,
            # "a.wav.bak" is rejected and a name without any dot cannot crash.
            if os.path.splitext(file_name)[1] == ".wav":
                wavPath.append(sub_dir_path + "/" + file_name)
    return wavPath

# Load a fixed-length time series (3 seconds by default) from every file.
def get_data(wav_files,second=3,sampleRate=16000):
    """Load each clip in *wav_files* and keep its first *second* seconds.

    Clips shorter than ``second`` seconds are skipped (they are NOT
    zero-padded), so the result can have fewer rows than ``len(wav_files)``.
    A single-line progress bar is rewritten on stdout after every file.

    :param wav_files: paths of the audio files to load
    :param second: length of audio to keep from each clip, in seconds
    :param sampleRate: sample rate passed to ``librosa.load`` as ``sr``
    :return: ``np.array`` built from the kept clips, each truncated to
        ``int(second * srate)`` samples
    """
    # Kept clips, all of identical length.
    clips = []
    total = len(wav_files)
    begin_time = time.time()
    print('开始预处理文件')
    for index, wav in enumerate(wav_files):
        # Load the samples at the requested rate; srate is the rate actually used.
        signal, srate = librosa.load(wav, sr=sampleRate)

        # Keep only clips that are long enough, truncated to the target length;
        # shorter clips are dropped.
        if len(signal) >= second * srate:
            clips.append(signal[0:int(second * srate)])

        # Refresh the progress line for every file, including skipped ones,
        # so the bar always reaches 100 %.
        gaptime = time.time() - begin_time
        percent = float(index+1) * 100 / total
        # +0.01 keeps the ETA estimate finite for tiny percentages.
        eta_time = gaptime * 100 / (percent + 0.01) - gaptime
        strprogress = "[" + "=" * int(percent // 2) + ">" + "-" * int(50 - percent // 2) + "]"

        str_log = ("%.2f %% %s %s/%s \t 已用时间:%ds 剩余时间:%d s" % (percent,strprogress,index+1,total,gaptime,eta_time))
        # '\r' returns to the start of the line so the bar is redrawn in place.
        sys.stdout.write('\r' + str_log)

    return np.array(clips)

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD, Adam
from keras.models import Model
from keras.callbacks import ModelCheckpoint,TensorBoard
from keras.models import load_model

# Collect the training wav paths under rnn_wavs/train and load them as fixed-length clips.
filePath = "rnn_wavs/train"
wavPath = get_wav_path(filePath)

x=get_data(wavPath)

开始预处理文件
100.00 % [==================================================>] 50/50 	 已用时间:2s 剩余时间:0 s

# Inspect the loaded data: the overall array shape, then the samples of the first clip.
print(x.shape)
print(x[0])

(50, 48000)
[-0.0090332  -0.01293945 -0.01196289 ... -0.03439331 -0.0390625
 -0.03811646]

# Plot a waveform.
def show_wave(x,title='waveplot',sr=16000):
    """Draw the waveform of *x* with librosa and set the plot title.

    :param x: audio time series to plot
    :param title: title given to the plot
    :param sr: sample rate of *x*, forwarded to librosa for the time axis;
        defaults to 16000
    """
    # Newer librosa releases provide ``waveshow`` in place of ``waveplot``;
    # use whichever one the installed version offers.
    draw = getattr(librosa.display, 'waveshow', None)
    if draw is None:
        draw = librosa.display.waveplot
    draw(x, sr=sr)
    plt.title(title)

# Plot the waveform of the first training clip.
show_wave(x[0])

在这里插入图片描述

from keras import Model
from keras.layers import Dense,Input,CuDNNGRU
from keras.callbacks import TensorBoard

# Split each audio sequence into N vectors; each vector is provisionally 100-dimensional
# and is what gets fed to the RNN. Think of every clip as a sentence made of many words,
# where each word is a 100-dimensional vector.
seq_vector=100
x_train=x.reshape(x.shape[0],-1,seq_vector)
print(x_train.shape)
print(x_train[0])

(50, 480, 100)
[[-0.0090332  -0.01293945 -0.01196289 ... -0.01239014 -0.01223755
  -0.01193237]
 [-0.01205444 -0.01248169 -0.01263428 ... -0.01290894 -0.0128479
  -0.01260376]
 [-0.01263428 -0.01239014 -0.01248169 ... -0.01202393 -0.01217651
  -0.01254272]
 ...
 [-0.0100708  -0.00872803 -0.00650024 ... -0.01254272 -0.01089478
  -0.01132202]
 [-0.01348877 -0.00967407 -0.00897217 ... -0.01647949 -0.01019287
  -0.00961304]
 [-0.0055542  -0.00222778  0.00445557 ... -0.03439331 -0.0390625
  -0.03811646]]

# The functional API needs an explicit input: sequences with an unspecified number of
# steps, each step a seq_vector-wide frame. (Despite the name, input_image holds audio frames.)
input_image=Input((None,seq_vector))
# Encoder. CuDNNGRU lets it run on the GPU; a plain GRU can of course be used instead.
encoder=CuDNNGRU(units=512, return_sequences=True, name="gru1")(input_image)
encoder=CuDNNGRU(units=256, return_sequences=True,name="gru2")(encoder)
encoder=Dense(256,activation='tanh')(encoder)
# Encoder output: a 32-unit tanh layer applied to the sequence output of the layers above.
encoder_out=Dense(32,activation='tanh')(encoder) 

# Model from the input to the encoder output only; built on the same layers as the autoencoder.
encoder_model = Model(inputs=input_image, outputs=encoder_out)

# Decoder: the encoder's layers in reverse order.
decoder=Dense(256,activation='tanh')(encoder_out)
decoder=CuDNNGRU(units=256, return_sequences=True,name="de_gru1")(decoder)
decoder=CuDNNGRU(units=512, return_sequences=True, name="de_gru2")(decoder)
# Final layer maps back to seq_vector units with tanh.
decoder_out=Dense(seq_vector,activation='tanh')(decoder)

# End-to-end model: input frames -> decoder output.
autoencoder=Model(input_image,decoder_out)

# Print the layer-by-layer summary.
autoencoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input_4 (InputLayer)         (None, None, 100)         0         
_________________________________________________________________
gru1 (CuDNNGRU)              (None, None, 512)         943104    
_________________________________________________________________
gru2 (CuDNNGRU)              (None, None, 256)         591360    
_________________________________________________________________
dense_18 (Dense)             (None, None, 256)         65792     
_________________________________________________________________
dense_19 (Dense)             (None, None, 32)          8224      
_________________________________________________________________
dense_20 (Dense)             (None, None, 256)         8448      
_________________________________________________________________
de_gru1 (CuDNNGRU)           (None, None, 256)         394752    
_________________________________________________________________
de_gru2 (CuDNNGRU)           (None, None, 512)         1182720   
_________________________________________________________________
dense_21 (Dense)             (None, None, 100)         51300     
=================================================================
Total params: 3,245,700
Trainable params: 3,245,700
Non-trainable params: 0
_________________________________________________________________

# Train the autoencoder with the Adam optimizer and MSE loss.
autoencoder.compile(optimizer='adam', loss='mse')
# x_train is passed as both input and target. Callbacks: TensorBoard logging, plus a
# checkpoint that saves to rnn.h5 only when the monitored 'loss' reaches a new minimum
# (no validation data is passed to fit here).
autoencoder.fit(x_train, x_train, epochs=50, batch_size=2,verbose=1,               
                callbacks=[TensorBoard(log_dir='log_RNN声音自编码器'),
                           ModelCheckpoint("rnn.h5",
                                            monitor='loss', verbose=1, save_best_only=True, mode='min', period=1)])

Epoch 1/50
50/50 [==============================] - 5s 103ms/step - loss: 0.0060

Epoch 00001: loss improved from inf to 0.00604, saving model to rnn.h5
Epoch 2/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0053

Epoch 00002: loss improved from 0.00604 to 0.00532, saving model to rnn.h5
Epoch 3/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0039

Epoch 00003: loss improved from 0.00532 to 0.00385, saving model to rnn.h5
Epoch 4/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0028

Epoch 00004: loss improved from 0.00385 to 0.00277, saving model to rnn.h5
Epoch 5/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0021

Epoch 00005: loss improved from 0.00277 to 0.00211, saving model to rnn.h5
Epoch 6/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0015

Epoch 00006: loss improved from 0.00211 to 0.00148, saving model to rnn.h5
Epoch 7/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0012

Epoch 00007: loss improved from 0.00148 to 0.00117, saving model to rnn.h5
Epoch 8/50
50/50 [==============================] - 4s 85ms/step - loss: 0.0010

Epoch 00008: loss improved from 0.00117 to 0.00102, saving model to rnn.h5
Epoch 9/50
50/50 [==============================] - 4s 85ms/step - loss: 9.1125e-04

Epoch 00009: loss improved from 0.00102 to 0.00091, saving model to rnn.h5
Epoch 10/50
50/50 [==============================] - 4s 85ms/step - loss: 8.4070e-04

Epoch 00010: loss improved from 0.00091 to 0.00084, saving model to rnn.h5
Epoch 11/50
50/50 [==============================] - 4s 85ms/step - loss: 7.7816e-04

Epoch 00011: loss improved from 0.00084 to 0.00078, saving model to rnn.h5
Epoch 12/50
50/50 [==============================] - 4s 85ms/step - loss: 6.8798e-04

Epoch 00012: loss improved from 0.00078 to 0.00069, saving model to rnn.h5
Epoch 13/50
50/50 [==============================] - 4s 85ms/step - loss: 6.0970e-04

Epoch 00013: loss improved from 0.00069 to 0.00061, saving model to rnn.h5
Epoch 14/50
50/50 [==============================] - 4s 85ms/step - loss: 5.6702e-04

Epoch 00014: loss improved from 0.00061 to 0.00057, saving model to rnn.h5
Epoch 15/50
50/50 [==============================] - 4s 85ms/step - loss: 5.2083e-04

Epoch 00015: loss improved from 0.00057 to 0.00052, saving model to rnn.h5
Epoch 16/50
50/50 [==============================] - 4s 85ms/step - loss: 4.7999e-04

Epoch 00016: loss improved from 0.00052 to 0.00048, saving model to rnn.h5
Epoch 17/50
50/50 [==============================] - 4s 85ms/step - loss: 4.6281e-04

Epoch 00017: loss improved from 0.00048 to 0.00046, saving model to rnn.h5
Epoch 18/50
50/50 [==============================] - 4s 85ms/step - loss: 4.4957e-04

Epoch 00018: loss improved from 0.00046 to 0.00045, saving model to rnn.h5
Epoch 19/50
50/50 [==============================] - 4s 85ms/step - loss: 4.5229e-04

Epoch 00019: loss did not improve
Epoch 20/50
50/50 [==============================] - 4s 85ms/step - loss: 4.4043e-04

Epoch 00020: loss improved from 0.00045 to 0.00044, saving model to rnn.h5
Epoch 21/50
50/50 [==============================] - 4s 85ms/step - loss: 4.2521e-04: 1s - loss: 4.07

Epoch 00021: loss improved from 0.00044 to 0.00043, saving model to rnn.h5
Epoch 22/50
50/50 [==============================] - 4s 85ms/step - loss: 4.1621e-04

Epoch 00022: loss improved from 0.00043 to 0.00042, saving model to rnn.h5
Epoch 23/50
50/50 [==============================] - 4s 85ms/step - loss: 4.0283e-04

Epoch 00023: loss improved from 0.00042 to 0.00040, saving model to rnn.h5
Epoch 24/50
50/50 [==============================] - 4s 85ms/step - loss: 3.9148e-04

Epoch 00024: loss improved from 0.00040 to 0.00039, saving model to rnn.h5
Epoch 25/50
50/50 [==============================] - 4s 85ms/step - loss: 3.9139e-04

Epoch 00025: loss improved from 0.00039 to 0.00039, saving model to rnn.h5
Epoch 26/50
50/50 [==============================] - 4s 85ms/step - loss: 3.7617e-04

Epoch 00026: loss improved from 0.00039 to 0.00038, saving model to rnn.h5
Epoch 27/50
50/50 [==============================] - 4s 85ms/step - loss: 3.7279e-04

Epoch 00027: loss improved from 0.00038 to 0.00037, saving model to rnn.h5
Epoch 28/50
50/50 [==============================] - 4s 85ms/step - loss: 3.5802e-04

Epoch 00028: loss improved from 0.00037 to 0.00036, saving model to rnn.h5
Epoch 29/50
50/50 [==============================] - 4s 85ms/step - loss: 3.4530e-04

Epoch 00029: loss improved from 0.00036 to 0.00035, saving model to rnn.h5
Epoch 30/50
50/50 [==============================] - 4s 85ms/step - loss: 3.5566e-04

Epoch 00030: loss did not improve
Epoch 31/50
50/50 [==============================] - 4s 85ms/step - loss: 3.4619e-04

Epoch 00031: loss did not improve
Epoch 32/50
50/50 [==============================] - 4s 85ms/step - loss: 3.3743e-04

Epoch 00032: loss improved from 0.00035 to 0.00034, saving model to rnn.h5
Epoch 33/50
50/50 [==============================] - 4s 85ms/step - loss: 3.2660e-04

Epoch 00033: loss improved from 0.00034 to 0.00033, saving model to rnn.h5
Epoch 34/50
50/50 [==============================] - 4s 85ms/step - loss: 3.1028e-04

Epoch 00034: loss improved from 0.00033 to 0.00031, saving model to rnn.h5
Epoch 35/50
50/50 [==============================] - 4s 85ms/step - loss: 2.9729e-04

Epoch 00035: loss improved from 0.00031 to 0.00030, saving model to rnn.h5
Epoch 36/50
50/50 [==============================] - 4s 85ms/step - loss: 2.9330e-04

Epoch 00036: loss improved from 0.00030 to 0.00029, saving model to rnn.h5
Epoch 37/50
50/50 [==============================] - 4s 85ms/step - loss: 2.9591e-04

Epoch 00037: loss did not improve
Epoch 38/50
50/50 [==============================] - 4s 85ms/step - loss: 2.8852e-04

Epoch 00038: loss improved from 0.00029 to 0.00029, saving model to rnn.h5
Epoch 39/50
50/50 [==============================] - 4s 85ms/step - loss: 2.7403e-04

Epoch 00039: loss improved from 0.00029 to 0.00027, saving model to rnn.h5
Epoch 40/50
50/50 [==============================] - 4s 85ms/step - loss: 2.7536e-04

Epoch 00040: loss did not improve
Epoch 41/50
50/50 [==============================] - 4s 85ms/step - loss: 2.7173e-04

Epoch 00041: loss improved from 0.00027 to 0.00027, saving model to rnn.h5
Epoch 42/50
50/50 [==============================] - 4s 85ms/step - loss: 2.6409e-04

Epoch 00042: loss improved from 0.00027 to 0.00026, saving model to rnn.h5
Epoch 43/50
50/50 [==============================] - 4s 85ms/step - loss: 2.6014e-04

Epoch 00043: loss improved from 0.00026 to 0.00026, saving model to rnn.h5
Epoch 44/50
50/50 [==============================] - 4s 85ms/step - loss: 2.5614e-04

Epoch 00044: loss improved from 0.00026 to 0.00026, saving model to rnn.h5
Epoch 45/50
50/50 [==============================] - 4s 85ms/step - loss: 2.5297e-04

Epoch 00045: loss improved from 0.00026 to 0.00025, saving model to rnn.h5
Epoch 46/50
50/50 [==============================] - 4s 85ms/step - loss: 2.5371e-04

Epoch 00046: loss did not improve
Epoch 47/50
50/50 [==============================] - 4s 85ms/step - loss: 2.5559e-04

Epoch 00047: loss did not improve
Epoch 48/50
50/50 [==============================] - 4s 85ms/step - loss: 2.6840e-04

Epoch 00048: loss did not improve
Epoch 49/50
50/50 [==============================] - 4s 85ms/step - loss: 2.7194e-04

Epoch 00049: loss did not improve
Epoch 50/50
50/50 [==============================] - 4s 85ms/step - loss: 2.6571e-04

Epoch 00050: loss did not improve





<keras.callbacks.History at 0x7380fc88>

# Load the test clips from rnn_wavs/test with the same helpers used for the training set.
filePath = "rnn_wavs/test"

testPath = get_wav_path(filePath)

# Un-reshaped clips; used further down to plot the original waveform.
x_test_ori=get_data(testPath)

# Reshape each clip into frames of seq_vector samples, matching the model input.
x_test=x_test_ori.reshape(x_test_ori.shape[0],-1,seq_vector)

开始预处理文件
100.00 % [==================================================>] 4/4 	 已用时间:0s 剩余时间:0 s

# Run the test frames through the trained autoencoder to get the reconstructions.
decoded_seq = autoencoder.predict(x_test)
print(decoded_seq.shape)

(4, 480, 100)

# Shape of a single reconstructed clip.
d_shape=decoded_seq[0].shape
print(d_shape)

(480, 100)

# Display the first reconstructed clip (bare expression, echoed as notebook cell output).
decoded_seq[0]

array([[-0.01014584, -0.01277211, -0.01327355, ..., -0.01386975,
        -0.0145765 , -0.01456202],
       [-0.01393538, -0.01367248, -0.01350442, ..., -0.01421544,
        -0.01455196, -0.0140023 ],
       [-0.01335981, -0.01219707, -0.01206582, ..., -0.0150555 ,
        -0.0149645 , -0.01383976],
       ...,
       [-0.09125455, -0.10162869, -0.12252352, ..., -0.01989237,
        -0.01427485,  0.01067922],
       [-0.00659992, -0.00485723, -0.0141265 , ...,  0.26765165,
         0.37300578,  0.42874205],
       [ 0.5345399 ,  0.59415686,  0.611799  , ...,  0.20478293,
         0.2866281 ,  0.3082694 ]], dtype=float32)

# Flatten the frames of the selected reconstruction back into a single waveform and plot it.
index=0
seq=decoded_seq[index].flatten()
show_wave(seq,'rnn 复原')

在这里插入图片描述

# Plot the original clip with the same index for comparison.
show_wave(x_test_ori[index],'原始')

在这里插入图片描述

# Get the feature vectors from the encoder alone; they can be turned into feature images.
encoded_latent = encoder_model.predict(x_test)
print(encoded_latent.shape)

(4, 480, 32)

# Feature images.
def show_latent_images(start=0,end=4):
    """Show the latent codes ``encoded_latent[start:end]`` as images in one figure.

    Reads the module-level ``encoded_latent`` array. Each code is reshaped to
    160 rows (so its element count must be divisible by 160), transposed, and
    drawn with the 'binary' colormap in slot ``i + 1`` of a 2-row by
    ``end``-column subplot grid on a 20x10 figure.

    :param start: index of the first latent code to draw
    :param end: one past the last index to draw; also the grid's column count
    """
    plt.figure(figsize=(20, 10))
    slot = start
    while slot < end:
        plt.subplot(2, end, slot + 1)
        feature_map = encoded_latent[slot].reshape(160, -1)
        plt.imshow(feature_map.T, cmap='binary')
        slot += 1
    plt.show()

# Draw the latent codes with the default range (indices 0 to 3).
show_latent_images()

在这里插入图片描述

# Path of the first test file (bare expression, echoed as notebook cell output).
testPath[0]

'rnn_wavs/test/A2/A2_239.wav'

# Play the audio file rnn_wavs/test/A2/A2_239.wav
import IPython
out_wav=testPath[0]
# Also load its samples at 16 kHz into `signal`.
signal, srate = librosa.load(out_wav, sr=16000)
# Audio widget for the file on disk.
IPython.display.Audio(out_wav)

在这里插入图片描述

# Keep the first 3 seconds: 3 * 16000 samples at the 16 kHz rate the file was loaded with.
signal=signal[0:3*16000]

# Play the first 3 seconds: write the trimmed samples to 3s.wav at 16 kHz, then play that file.
# NOTE(review): newer librosa releases no longer ship librosa.output — confirm the installed
# version still provides write_wav.
out_new='3s.wav'
librosa.output.write_wav(out_new,signal,16000)
IPython.display.Audio(out_new)

在这里插入图片描述

# Play the reconstructed 3-second audio: write `seq` to new_3s.wav at 16 kHz, then play that file.
# NOTE(review): newer librosa releases no longer ship librosa.output — confirm the installed
# version still provides write_wav.
out_new='new_3s.wav'
librosa.output.write_wav(out_new,seq,16000)
IPython.display.Audio(out_new)

在这里插入图片描述
好了，今天就到这里了，希望对学习理解有帮助，大神看见勿喷，仅为自己的学习理解，能力有限，请多包涵，侵删。