# 识别特定语音信号（real-time audio classification from the microphone）
#
# 修改自己的 pb 文件位置（graph_file）和 args.list 的大小即可使用。

# -*- coding: utf-8 -*-
"""
Created on Sat Nov 10 16:59:03 2018

@author: chen
"""

import librosa
import tensorflow as tf
import pyaudio
import sys
import time
import array
import numpy as np
import queue
from collections import deque
from easydict import EasyDict

# Runtime configuration for the real-time classifier.
args = EasyDict()
args.duration = 1                    # seconds of audio per classification window
args.rate = 44100                    # microphone sampling rate (Hz)
args.hop_length = 80                 # hop length for the mel spectrogram
args.n_mels = 40                     # number of mel frequency bands
args.rt_oversamples = 10             # mic chunks consumed per second
args.samples = args.rate * args.duration
# Network input shape: (n_mels, time_frames, 1).
args.list = (args.n_mels, 1 + int(np.floor(args.samples / args.hop_length)), 1)
args.rt_process_count = 1            # windows predicted per processing pass
args.model = 'alexnet'
args.rt_chunk_samples = args.rate // args.rt_oversamples
args.mels_onestep_samples = args.rt_chunk_samples * args.rt_process_count
# Samples needed before a conversion window can be cut.
# (Was assigned twice in the original; the duplicate assignment is removed.)
args.mels_convert_samples = args.samples + args.mels_onestep_samples
args.fmax = args.rate // 2           # Nyquist frequency
args.n_fft = args.n_mels * 20
args.labels = ['dog_bark', 'children_playing', 'car_horn', 'air_conditioner',
               'street_music', 'gun_shot', 'siren', 'engine_idling', 'jackhammer',
               'drilling']

# Path to the frozen TensorFlow graph (.pb) -- point this at your own model.
graph_file="E:\\ML\\UrbanSound8K\\code\\UrbanSound8K\\others sussfer\\first\\99.pb"

def audio_to_melspectrogram(args, audio):
    """Convert a 1-D waveform to a log-scaled (dB) mel spectrogram, float32.

    Args:
        args: config object providing rate, n_mels, hop_length, n_fft, fmax.
        audio: 1-D array of audio samples (expected roughly in [-1, 1]).

    Returns:
        2-D float32 array of shape (n_mels, time_frames).
    """
    # Pass `y` by keyword: librosa >= 0.10 no longer accepts it positionally.
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=args.rate,
                                                 n_mels=args.n_mels,
                                                 hop_length=args.hop_length,
                                                 n_fft=args.n_fft,
                                                 fmin=20,
                                                 fmax=args.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram


def callback(in_data, frame_count, time_info, status):
    """PyAudio stream callback: push incoming mic frames onto `raw_frames`.

    The stream is opened with format=paInt16, so the raw bytes must be
    decoded as 16-bit signed integers ('h').  The original used 'b'
    (8-bit signed), which doubled the sample count and corrupted the
    waveform before the /32767 scaling in main_process.
    """
    wave = array.array('h', in_data)
    raw_frames.put(wave, True)  # block if the queue is full
    return (None, pyaudio.paContinue)

def on_predicted(ensembled_pred):
    """Print the winning class label and its ensembled confidence."""
    best = np.argmax(ensembled_pred)
    print(args.labels[best], ensembled_pred[best])
def samplewise_normalize_audio_X(X):
    """Normalize every sample of X in place.

    Each X[i] is shifted so its minimum is 0, then divided by its peak
    absolute value plus 1 (the +1 also guards against division by zero).
    """
    for idx in range(len(X)):
        X[idx] = X[idx] - np.min(X[idx])
        X[idx] = X[idx] / (np.max(np.abs(X[idx])) + 1.0)
        
def geometric_mean_preds(_preds):
    """Return the element-wise geometric mean over the first axis of _preds.

    The input array is not modified.
    """
    count = _preds.shape[0]
    product = _preds[0].copy()
    for row in _preds[1:]:
        product = np.multiply(product, row)
    return np.power(product, 1 / count)

# Shared state between the PyAudio callback thread and the main loop.
raw_frames = queue.Queue(maxsize=100)  # raw sample chunks from the mic callback
raw_audio_buffer = []                  # accumulated samples awaiting conversion
pred_queue = deque(maxlen=10)          # most recent raw predictions, for ensembling
def main_process(model, on_predicted):
    """Drain buffered mic audio, build mel-spectrogram windows, and predict.

    Pulls samples from the global `raw_frames` queue into `raw_audio_buffer`;
    once at least `args.mels_convert_samples` samples are buffered, converts
    one window, runs the model, and reports a geometric-mean ensemble of the
    last few predictions via `on_predicted`.  Returns early (no prediction)
    if not enough audio has accumulated yet.
    """
    
    global raw_audio_buffer
    # Accumulate queued chunks until one full conversion window is available.
    while not raw_frames.empty():
        raw_audio_buffer.extend(raw_frames.get())
        if len(raw_audio_buffer) >= args.mels_convert_samples: break
    if len(raw_audio_buffer) < args.mels_convert_samples: return
   
    # Scale raw samples to roughly [-1, 1], assuming a 16-bit range (32767).
    audio_to_convert = np.array(raw_audio_buffer[:args.mels_convert_samples]) / 32767
    # Slide the buffer forward by one step so successive windows overlap.
    raw_audio_buffer = raw_audio_buffer[args.mels_onestep_samples:]
    mels = audio_to_melspectrogram(args, audio_to_convert)
    
    # Cut `rt_process_count` fixed-width spectrogram slices for the network;
    # args.list[1] is the time-frame width expected by the model input.
    X = []
    for i in range(args.rt_process_count):
        cur = int(i * args.list[1] / args.rt_oversamples)
        X.append(mels[:, cur:cur+args.list[1], np.newaxis])
    X = np.array(X)
    samplewise_normalize_audio_X(X)
    raw_preds = model.predict(X)
    for raw_pred in raw_preds:
        # pred_queue keeps only the last 10 predictions (deque maxlen).
        pred_queue.append(raw_pred)
        ensembled_pred = geometric_mean_preds(np.array([pred for pred in pred_queue]))
        on_predicted(ensembled_pred)

def load_graph(model_file):
    """Deserialize a frozen GraphDef from *model_file* into a new tf.Graph."""
    graph_def = tf.GraphDef()
    with open(model_file, "rb") as pb_file:
        graph_def.ParseFromString(pb_file.read())
    graph = tf.Graph()
    with graph.as_default():
        tf.import_graph_def(graph_def)
    return graph

def my_exit(model):
    """Close the model's TensorFlow session and terminate the process.

    Uses sys.exit (raises SystemExit with code 0) rather than the
    interactive-only `exit()` helper, which the `site` module may not
    provide in every runtime environment.
    """
    model.close()
    sys.exit(0)
class KerasTFGraph:
    """Thin inference wrapper around a frozen Keras/TensorFlow graph.

    Looks up the input, Keras learning-phase, and output operations by
    name and runs predictions through a dedicated tf.Session.
    """

    def __init__(self, model_pb_filename, input_name,
                 keras_learning_phase_name, output_name):
        self.graph = load_graph(model_pb_filename)
        self.layer_in = self.graph.get_operation_by_name(input_name)
        # Renamed from the misspelled `leayer_klp`; attribute is internal.
        self.layer_klp = self.graph.get_operation_by_name(keras_learning_phase_name)
        self.layer_out = self.graph.get_operation_by_name(output_name)
        self.sess = tf.Session(graph=self.graph)

    def predict(self, X):
        """Run a forward pass on batch X; learning phase 0 = inference mode."""
        preds = self.sess.run(self.layer_out.outputs[0],
                              {self.layer_in.outputs[0]: X,
                               self.layer_klp.outputs[0]: 0})
        return preds

    def close(self):
        """Release the underlying TensorFlow session."""
        self.sess.close()
def get_model(graph_file):
    """Build a KerasTFGraph for the architecture selected by args.model.

    If *graph_file* is an empty string, falls back to
    args.runtime_model_file (presumably set elsewhere -- verify it exists
    before relying on the fallback).
    """
    model_node = {
        'alexnet': ['import/conv2d_1_input',
                    'import/batch_normalization_1/keras_learning_phase',
                    'import/output0']
        
    }
    input_name, klp_name, output_name = model_node[args.model]
    pb_path = graph_file if graph_file != '' else args.runtime_model_file
    return KerasTFGraph(
        pb_path,
        input_name=input_name,
        keras_learning_phase_name=klp_name,
        output_name=output_name)

def run_predictor():
    """Open the microphone stream and run the prediction loop until it ends.

    The PyAudio callback fills `raw_frames` on a background thread while
    this loop polls `main_process` once per second.  On stream end the
    audio resources are released and the process exits via `my_exit`.
    """
    model = get_model(graph_file)
    
    # Mono 16-bit capture at the configured sample rate.
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    audio = pyaudio.PyAudio()
    stream = audio.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=args.rate,
                input=True,
                
                frames_per_buffer=1024,
                start=False,
                stream_callback=callback 
            )
    
    stream.start_stream()
    while stream.is_active():
        main_process(model, on_predicted)
        time.sleep(1)
    stream.stop_stream()
    stream.close()
    
    audio.terminate()
    # Closes the TF session and terminates the process.
    my_exit(model)


run_predictor()

# --- blog-page scrape residue, kept as comments so the file parses ---
# 阅读更多
# 没有更多推荐了,返回首页