Recognizing Specific Audio Signals

Point graph_file at your own .pb model file and adjust the size of args.list, and the script below is ready to use.
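
For context, the .pb file is a frozen TensorFlow graph exported from a trained Keras model. A minimal TF1-style sketch of that export step (the model object and output path are placeholders, not from the original post):

import tensorflow as tf
from tensorflow.python.framework import graph_util
from keras import backend as K

def freeze_keras_model(model, pb_path):
    # Bake trained variables into constants so the graph loads standalone.
    sess = K.get_session()
    frozen = graph_util.convert_variables_to_constants(
        sess, sess.graph.as_graph_def(), [model.output.op.name])
    with tf.gfile.GFile(pb_path, 'wb') as f:
        f.write(frozen.SerializeToString())

The full real-time recognition script: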

# -*- coding: utf-8 -*-
"""
Created on Sat Nov 10 16:59:03 2018

@author: chen
"""

import librosa
import tensorflow as tf
import pyaudio
import sys
import time
import array
import numpy as np
import queue
from collections import deque
from easydict import EasyDict

args = EasyDict()
args.duration = 1                 # seconds of audio per model input
args.rate = 44100                 # sampling rate (Hz)
args.hop_length = 80
args.n_mels = 40
args.rt_oversamples = 10          # prediction steps per second
args.samples = args.rate * args.duration
# Model input shape (n_mels, time_frames, channels); with these defaults:
# (40, 1 + floor(44100 / 80), 1) = (40, 552, 1).
args.list = (args.n_mels, 1 + int(np.floor(args.samples / args.hop_length)), 1)
args.rt_process_count = 1
args.model = 'alexnet'
args.rt_chunk_samples = args.rate // args.rt_oversamples
args.mels_onestep_samples = args.rt_chunk_samples * args.rt_process_count
args.mels_convert_samples = args.samples + args.mels_onestep_samples
args.fmax = args.rate // 2
args.n_fft = args.n_mels * 20
args.labels = ['dog_bark', 'children_playing', 'car_horn', 'air_conditioner',
               'street_music', 'gun_shot', 'siren', 'engine_idling', 'jackhammer',
               'drilling']

graph_file = "E:\\ML\\UrbanSound8K\\code\\UrbanSound8K\\others sussfer\\first\\99.pb"  # replace with your own .pb path

def audio_to_melspectrogram(args, audio):
    # Mono waveform -> log-scaled mel spectrogram, shape (n_mels, frames).
    spectrogram = librosa.feature.melspectrogram(y=audio,
                                                 sr=args.rate,
                                                 n_mels=args.n_mels,
                                                 hop_length=args.hop_length,
                                                 n_fft=args.n_fft,
                                                 fmin=20,
                                                 fmax=args.fmax)
    spectrogram = librosa.power_to_db(spectrogram)
    spectrogram = spectrogram.astype(np.float32)
    return spectrogram
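
# Sanity check with the defaults above: one analysis window of
# args.mels_convert_samples = 48510 samples (~1.1 s) maps to a (40, 607)
# spectrogram, from which main_process() below cuts (40, 552, 1) crops
# matching args.list.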


def callback(in_data, frame_count, time_info, status):
    # paInt16 delivers 16-bit signed samples, so unpack with 'h', not 'b'.
    wave = array.array('h', in_data)
    raw_frames.put(wave, True)
    return (None, pyaudio.paContinue)

def on_predicted(ensembled_pred):
    result = np.argmax(ensembled_pred)
    print(args.labels[result], ensembled_pred[result])

def samplewise_normalize_audio_X(X):
    # Shift each crop to be non-negative, then scale down; the +1.0 also
    # guards against division by zero on a silent (constant) frame.
    for i in range(len(X)):
        X[i] -= np.min(X[i])
        X[i] /= (np.max(np.abs(X[i])) + 1.0)

def geometric_mean_preds(_preds):
    # Element-wise geometric mean across the prediction queue (first axis).
    preds = _preds.copy()
    for i in range(1, preds.shape[0]):
        preds[0] = np.multiply(preds[0], preds[i])
    return np.power(preds[0], 1 / preds.shape[0])
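
# Worked example: for two stacked predictions [0.2, 0.8] and [0.4, 0.6],
# the result is [sqrt(0.2*0.4), sqrt(0.8*0.6)] ≈ [0.283, 0.693] -- per-class
# geometric means over the rolling window held in pred_queue below.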

raw_frames = queue.Queue(maxsize=100)  # PCM chunks from the PyAudio callback
raw_audio_buffer = []                  # rolling sample buffer
pred_queue = deque(maxlen=10)          # recent predictions for ensembling

def main_process(model, on_predicted):
    global raw_audio_buffer
    # Drain the capture queue until one full analysis window is buffered.
    while not raw_frames.empty():
        raw_audio_buffer.extend(raw_frames.get())
        if len(raw_audio_buffer) >= args.mels_convert_samples: break
    if len(raw_audio_buffer) < args.mels_convert_samples: return

    # Scale 16-bit PCM to [-1, 1], then slide the buffer one step forward.
    audio_to_convert = np.array(raw_audio_buffer[:args.mels_convert_samples]) / 32767
    raw_audio_buffer = raw_audio_buffer[args.mels_onestep_samples:]
    mels = audio_to_melspectrogram(args, audio_to_convert)

    # Cut fixed-size crops matching the model input shape (args.list).
    X = []
    for i in range(args.rt_process_count):
        cur = int(i * args.list[1] / args.rt_oversamples)
        X.append(mels[:, cur:cur + args.list[1], np.newaxis])
    X = np.array(X)
    samplewise_normalize_audio_X(X)
    raw_preds = model.predict(X)
    for raw_pred in raw_preds:
        pred_queue.append(raw_pred)
        ensembled_pred = geometric_mean_preds(np.array(list(pred_queue)))
        on_predicted(ensembled_pred)

def load_graph(model_file):
    graph = tf.Graph()
    graph_def = tf.GraphDef()

    with open(model_file, "rb") as f:
        graph_def.ParseFromString(f.read())
    with graph.as_default():
        tf.import_graph_def(graph_def)
    return graph
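
# Note: tf.GraphDef and tf.Session are TF1 APIs. On TensorFlow 2.x the same
# code runs unchanged with `import tensorflow.compat.v1 as tf`.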

def my_exit(model):
    model.close()
    sys.exit(0)

class KerasTFGraph:
    def __init__(self, model_pb_filename, input_name,
                 keras_learning_phase_name, output_name):
        self.graph = load_graph(model_pb_filename)
        self.layer_in = self.graph.get_operation_by_name(input_name)
        self.layer_klp = self.graph.get_operation_by_name(keras_learning_phase_name)
        self.layer_out = self.graph.get_operation_by_name(output_name)
        self.sess = tf.Session(graph=self.graph)

    def predict(self, X):
        # learning_phase=0 keeps BatchNorm/Dropout in inference mode.
        preds = self.sess.run(self.layer_out.outputs[0],
                              {self.layer_in.outputs[0]: X,
                               self.layer_klp.outputs[0]: 0})
        return preds

    def close(self):
        self.sess.close()

def get_model(graph_file):
    # Input / learning-phase / output node names inside the frozen graph;
    # adjust these if your model was exported with different layer names.
    model_node = {
        'alexnet': ['import/conv2d_1_input',
                    'import/batch_normalization_1/keras_learning_phase',
                    'import/output0']
    }
    return KerasTFGraph(
        graph_file,
        input_name=model_node[args.model][0],
        keras_learning_phase_name=model_node[args.model][1],
        output_name=model_node[args.model][2])
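
# If your .pb was exported with different layer names, the node table above
# will not match. A quick way to inspect the available names (a debugging
# aid, not part of the original flow):
#   g = load_graph(graph_file)
#   print([op.name for op in g.get_operations()])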

def run_predictor():
    model = get_model(graph_file)

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    audio = pyaudio.PyAudio()
    stream = audio.open(format=FORMAT,
                        channels=CHANNELS,
                        rate=args.rate,
                        input=True,
                        frames_per_buffer=1024,
                        start=False,
                        stream_callback=callback)

    stream.start_stream()
    while stream.is_active():
        main_process(model, on_predicted)
        time.sleep(1)
    stream.stop_stream()
    stream.close()
    audio.terminate()
    my_exit(model)


if __name__ == '__main__':
    run_predictor()
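
As a quick offline sanity check (a sketch not in the original post; sample.wav is a placeholder path), the same functions can classify a clip loaded from disk instead of the microphone:

model = get_model(graph_file)
wav, _ = librosa.load('sample.wav', sr=args.rate, duration=args.duration)
wav = np.pad(wav, (0, max(0, args.samples - len(wav))))  # zero-pad short clips
mels = audio_to_melspectrogram(args, wav)
X = np.array([mels[:, :args.list[1], np.newaxis]])
samplewise_normalize_audio_X(X)
on_predicted(model.predict(X)[0])
model.close()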
