pAura，python音频录制分析和训练分类

绯虹剑心

已于 2022-10-17 10:45:15 修改

阅读量786

点赞数

分类专栏：人工智能文章标签： python 机器学习音频

于 2022-10-13 17:23:07 首次发布

本文链接：https://blog.csdn.net/weixin_43422012/article/details/127301942

版权

人工智能专栏收录该内容

12 篇文章 1 订阅

订阅专栏

pAura

python音频录制和分析
https://github.com/tyiannak/paura

安装依赖

系统：ubuntu-18.04.5-desktop-amd64.iso

# 安装pythonlib
pip install pyAudioAnalysis
pip install numpy
# 如果速度较慢，可以加镜像 -i 参数，如
pip install scipy -i https://pypi.tuna.tsinghua.edu.cn/simple
pip install opencv-python matplotlib tqdm eyed3 pydub imblearn plotly pyaudio -i https://pypi.tuna.tsinghua.edu.cn/simple   
# Install portaudio
sudo apt-get install libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0
# install opencv for python
sudo apt-get install python-opencv
# required only for paura_lite.py
sudo apt-get install gnuplot

录制音频并分类

python paura.py --blocksize 1.0 --spectrogram --chromagram --record_segments --record_all

deep_audio_features

训练卷积神经网络作为音频分类器
https://github.com/tyiannak/deep_audio_features

采集数据

可以用拾音器和pAura采集数据

训练

# Train a deep_audio_features classifier for the two classes used in this post.
from deep_audio_features.bin import basic_training as bt
# presumably "computer" and "keyboard" are folders of WAV files (one folder
# per class) and "energy" is the name of the saved model -- the test snippet
# in this post loads "pkl/energy.pt"; TODO confirm against the
# deep_audio_features documentation.
bt.train_model(["computer","keyboard"], "energy")

测试

# Classify a single WAV file with the trained model and print the result.
from deep_audio_features.bin import basic_test as btest
# test_model returns two values; d[0] is used below as an index into
# class_names.  presumably d holds predicted class indices and p the
# corresponding scores -- TODO confirm against deep_audio_features.
d, p = btest.test_model("pkl/energy.pt", '1.wav', layers_dropped=0, test_segmentation=False)
print("==> d",d)
print("==> p",p)
# Same class order as the list passed to train_model in the training snippet.
class_names = ["computer","keyboard"]
print("==> class",class_names[d[0]])

pAura 和 deep_audio_features 组合使用

主要修改 record_audio 函数：保留 pAura 的录音功能，并把 pAura 原有的声音分类算法替换为 deep_audio_features 的算法

import sys
import time
import numpy
import scipy
import cv2
import argparse
import scipy.io.wavfile as wavfile
from pyAudioAnalysis import ShortTermFeatures as sF
from pyAudioAnalysis import MidTermFeatures as mF
from pyAudioAnalysis import audioTrainTest as aT
import scipy.signal
import itertools
import operator
import datetime
import signal
import pyaudio
import os
import struct
from deep_audio_features.bin import basic_test as btest

# Module-level state shared between record_audio, signal_handler and the
# __main__ block.  (`global` statements are only meaningful inside a
# function, so none are needed here.)  The output-name prefix `outstr` is
# assigned by record_audio when a recording starts.

# Sampling rate in Hz; the __main__ block overwrites it with --samplingrate
# and signal_handler passes it to wavfile.write.
fs = 8000
# PyAudio sample format used when opening the input stream.
FORMAT = pyaudio.paInt16
# Samples that signal_handler writes to "<outstr>.wav" on Ctrl + C.
all_data = []
# Not referenced by the code visible in this file; presumably pixel sizes
# left over from pAura's plotting windows -- TODO confirm before removing.
plot_h = 150
plot_w = 720
status_h = 150

def signal_handler(signal, frame):
    """
    SIGINT (Ctrl + C) handler: dump the recorded buffer to a WAV file and
    terminate the process.

    The module-level all_data buffer is written to "<outstr>.wav" at the
    module-level sampling rate fs, but only when it holds more than one
    sample.  The process then exits with status 0.

    Both parameters are supplied by the signal module and are not used.
    Note that the first one shadows the signal module inside this function.
    """
    has_recording = len(all_data) > 1
    if has_recording:
        samples = numpy.int16(all_data)
        wavfile.write(outstr + ".wav", fs, samples)
    sys.exit(0)


# Route Ctrl + C (SIGINT) through signal_handler.  This runs at import time,
# so merely importing this module installs the handler.
signal.signal(signal.SIGINT, signal_handler)



"""
Utility functions
"""


def most_common(L):
    """
    Return the element that occurs most often in L.

    When several elements share the highest count, the one whose first
    occurrence comes earliest in L wins.  Elements must be mutually
    orderable because they are sorted; an empty L makes max() raise
    ValueError.
    """
    # Sorting (value, position) pairs puts equal values next to each other,
    # which is what groupby needs.
    ordered = sorted((value, pos) for pos, value in enumerate(L))

    def _rank(group):
        # Must consume the group's iterator right away: groupby invalidates
        # it as soon as the outer iterator advances.
        _, members = group
        positions = [pos for _, pos in members]
        # Higher count first; for equal counts the smaller first index wins,
        # hence the negation.
        return len(positions), -min(positions, default=len(L))

    winner, _ = max(itertools.groupby(ordered, key=operator.itemgetter(0)),
                    key=_rank)
    return winner


def plotCV(function, width, height, max_val):
    """
    Draw a sequence of values as vertical lines on a new image.

    Arguments:
        function: the values to plot; it is divided by max_val, so it needs
                  to support element-wise division (e.g. a numpy array)
        width:    image width in pixels; at most `width` values are drawn
        height:   image height in pixels
        max_val:  value that is scaled to the full image height

    Returns a (height, width, 3) numpy array of zeros with one line per
    value, drawn in colour (255, 0, 255) from the vertical middle of the
    image to row `height - scaled_value`.
    """
    n_values = len(function)
    if n_values > width:
        # NOTE(review): this window ends at index -1 (exclusive), so the
        # newest value is left out -- confirm whether that is intended.
        window = function[n_values - width - 1:-1]
    else:
        window = function
    scaled = height * (window / max_val)

    canvas = numpy.zeros((height, width, 3))
    levels = numpy.int32(numpy.around(scaled))
    middle_row = int(height / 2)

    for column, level in enumerate(levels):
        start = (column, middle_row)
        end = (column, height - level)
        cv2.line(canvas, start, end, (255, 0, 255))

    return canvas


"""
Core functionality:
"""


def record_audio(block_size, fs=8000, show_spec=False, show_chroma=False,
                 log_sounds=False, logs_all=False,
                 model_path="pkl/energy.pt",
                 class_names=("computer", "keyboard")):
    """
    Record audio block by block and classify every block with a
    deep_audio_features model.

    Each recorded block is written to a WAV file, that file is passed to
    btest.test_model, and the resulting class is printed.  The function
    loops until the process is terminated (e.g. by signal_handler on
    Ctrl + C).

    Arguments:
        block_size:  length of one recording block in seconds
        fs:          sampling rate in Hz
        show_spec:   accepted for interface compatibility; not used here
        show_chroma: accepted for interface compatibility; not used here
        log_sounds:  if True, keep every block as a WAV file inside the
                     "<timestamp>_segments" folder; otherwise a single
                     temporary WAV file is reused and removed on exit
        logs_all:    if True, append every block to the module-level
                     all_data buffer so that signal_handler can write the
                     complete recording
        model_path:  model file handed to btest.test_model
        class_names: class labels, indexed by the first element of the
                     first value returned by btest.test_model
    """
    import tempfile

    global all_data
    global outstr

    # initialize recording process
    mid_buf_size = int(fs * block_size)
    pa = pyaudio.PyAudio()
    stream = pa.open(format=FORMAT, channels=1, rate=fs,
                     input=True, frames_per_buffer=mid_buf_size)

    # initialize counters etc
    all_data = []
    time_start = time.time()
    outstr = datetime.datetime.now().strftime("%Y_%m_%d_%I:%M%p")
    out_folder = outstr + "_segments"

    scratch_file = None
    if log_sounds:
        os.makedirs(out_folder, exist_ok=True)
    else:
        # The classifier reads from a file, so a WAV file is needed even
        # when segments are not being kept.
        handle, scratch_file = tempfile.mkstemp(suffix=".wav")
        os.close(handle)

    try:
        while True:
            try:
                block = stream.read(mid_buf_size)
                # 16-bit samples: two bytes each
                n_samples = len(block) // 2
                samples = struct.unpack("%dh" % n_samples, block)
                x = numpy.array(samples, dtype=numpy.int16)

                # time since recording started:
                e_time = (time.time() - time_start)

                if log_sounds:
                    out_file = os.path.join(
                        out_folder,
                        "{0:.2f}_".format(e_time).zfill(8) + ".wav")
                else:
                    out_file = scratch_file
                wavfile.write(out_file, fs, x)

                if logs_all:
                    # signal_handler only writes the full recording when
                    # this buffer has been filled
                    all_data.extend(samples)

                print("==> out_file", out_file)
                d, p = btest.test_model(model_path, out_file,
                                        layers_dropped=0,
                                        test_segmentation=False)
                print("==> d", d)
                print("==> p", p)
                print("==> class", class_names[d[0]])
                print("{0:.2f}\t{1:s}".format(e_time,
                                              class_names[d[0]]))
            except IOError:
                # best effort: report the failed block and keep recording
                print("Error recording")
    finally:
        # Runs when the loop is left through an exception, including the
        # SystemExit raised by signal_handler.
        stream.stop_stream()
        stream.close()
        pa.terminate()
        if scratch_file is not None and os.path.exists(scratch_file):
            os.remove(scratch_file)


def parse_arguments(argv=None):
    """
    Parse the command-line options of the recorder.

    Arguments:
        argv: list of argument strings to parse; when None (the default)
              argparse reads them from sys.argv

    Returns the argparse namespace with the attributes blocksize,
    samplingrate, chromagram, spectrogram, record_segments and record_all.
    """
    record_analyze = argparse.ArgumentParser(description="Real time "
                                                         "audio analysis")
    record_analyze.add_argument("-bs", "--blocksize",
                                type=float, choices=[0.25, 0.5, 0.75, 1, 2],
                                default=1, help="Recording block size")
    record_analyze.add_argument("-fs", "--samplingrate", type=int,
                                choices=[4000, 8000, 16000, 32000, 44100],
                                default=8000, help="Recording sampling rate")
    record_analyze.add_argument("--chromagram", action="store_true",
                                help="Show chromagram")
    record_analyze.add_argument("--spectrogram", action="store_true",
                                help="Show spectrogram")
    record_analyze.add_argument("--record_segments", action="store_true",
                                help="Record detected sounds to wavs")
    record_analyze.add_argument("--record_all", action="store_true",
                                help="Record the whole recording to a single"
                                     " audio file")
    return record_analyze.parse_args(argv)


if __name__ == "__main__":
    # Script entry point: read the options, then record until interrupted.
    args = parse_arguments()
    # Rebinds the module-level fs, which signal_handler uses as the sampling
    # rate when it writes the final WAV file.
    fs = args.samplingrate
    if fs != 8000:
        print("Warning! Segment classifiers have been trained on 8KHz "
              "samples. Therefore results will be not optimal. ")
    record_audio(block_size=args.blocksize,
                 fs=fs,
                 show_spec=args.spectrogram,
                 show_chroma=args.chromagram,
                 log_sounds=args.record_segments,
                 logs_all=args.record_all)