"""语音音频处理相关代码 (speech audio processing service: Flask + PaddleSpeech streaming ASR)."""

import argparse
import functools
import os,shutil
import sys,json
import glob
import time,uuid

from flask import request, Flask, render_template
from flask_cors import CORS
from werkzeug.serving import run_simple
from paddlespeech.cli.log import logger
from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler

import asyncio
import soundfile as sf
from pydub import AudioSegment
import numpy as np


def check_wav_and_convert_to_16khz(input_wav, output_wav):
    """Ensure a .wav file is sampled at 16 kHz.

    Reads ``input_wav``; when its sample rate already is 16 kHz the data is
    copied unchanged to ``output_wav``, otherwise the audio is resampled to
    16 kHz via pydub and exported to ``output_wav``.

    Args:
        input_wav (str): path to the source .wav file.
        output_wav (str): path the (possibly converted) .wav is written to.

    Returns:
        float: duration of the written audio in seconds.
    """
    # Read samples and sample rate from the source file.
    data, samplerate = sf.read(input_wav)
    source_duration = len(data) / samplerate

    if samplerate == 16000:
        # Already at the target rate — just persist a copy at the output path.
        print(f"采样率已为 16kHz,无需转换。")
        sf.write(output_wav, data, samplerate)
        return source_duration

    print(f"采样率 {samplerate} Hz 不是 16kHz,正在转换...")

    # Re-load with pydub and resample to 16 kHz.
    segment = AudioSegment.from_wav(input_wav)
    segment = segment.set_frame_rate(16000)

    # Export the resampled audio as a new wav file.
    segment.export(output_wav, format="wav")
    print(f"音频已保存到 {output_wav},采样率为 16kHz")
    return segment.duration_seconds



# Convert an .m4a audio file into a 16 kHz-sample-rate .wav file.
def convert_m4a_to_wav_16khz(input_file, output_file):
    """Transcode ``input_file`` (.m4a) to ``output_file`` (.wav) at 16 kHz.

    Args:
        input_file (str): path to the source .m4a file.
        output_file (str): path the 16 kHz .wav is written to.

    Returns:
        float: duration of the converted audio in seconds.
    """
    # Decode the .m4a container and force a 16 kHz frame rate in one chain.
    segment = AudioSegment.from_file(input_file, format="m4a").set_frame_rate(16000)

    # Write the result out as .wav.
    segment.export(output_file, format="wav")
    print(f"Converted {input_file} to {output_file} with 16kHz sampling rate")

    return segment.duration_seconds





app = Flask(__name__, template_folder="templates", static_folder="static", static_url_path="/")
# Allow cross-origin requests (browser clients served from other hosts/ports).
CORS(app)


server_ip="192.168.10.198"  # speech-recognition (ASR) server IP
server_port=8080            # speech-recognition (ASR) server port

punc_server_ip="192.168.10.198"     # punctuation-restoration model service IP
punc_server_port=8087            # punctuation-restoration model service port
endpoint="/paddlespeech/asr/streaming"   # ASR websocket endpoint

# Shared websocket ASR client used by every request handler below.
handler = ASRWsAudioHandler(
                        server_ip,
                        server_port,
                        endpoint=endpoint,
                        punc_server_ip=punc_server_ip,
                        punc_server_port=punc_server_port)

loop = asyncio.get_event_loop()    # Convenience accessor for the current event loop; creates a new one if none exists. NOTE(review): deprecated outside a running loop since Python 3.10 — consider asyncio.new_event_loop().


# Speech-recognition endpoint: accepts an uploaded audio file (.wav or .m4a),
# normalizes it to a 16 kHz wav, streams it to the ASR websocket service and
# returns the transcript as JSON text.
@app.route("/recognition", methods=['POST'])
def recognition():
    """Recognize one uploaded audio file.

    Expects a multipart form field named ``audio``. Returns JSON text:
    ``{"code": 0, "msg": "success", "result": <transcript>}`` on success,
    or ``{"error": <n>, "msg": <reason>}`` on failure.
    """
    f = request.files['audio']
    if f:

        audio_savedir = "./audio_savedir"
        if not os.path.exists(audio_savedir):
            os.makedirs(audio_savedir)

        # Lower-cased extension of the upload (e.g. ".wav", ".m4a").
        audio_format = (os.path.splitext(f.filename)[1]).lower()

        # Random name avoids collisions between concurrent uploads.
        new_filename = str(uuid.uuid4())
        new_filename_all = new_filename + os.path.splitext(f.filename)[1]  # keep original extension
        new_filename_all_convert = "new_" + new_filename + ".wav"          # converted 16 kHz wav

        # Per-upload working directory.
        os.makedirs(os.path.join(audio_savedir, new_filename))

        # Temporary paths: raw upload and its 16 kHz wav conversion.
        file_path = os.path.join(audio_savedir, new_filename, new_filename_all)
        file_path_convert = os.path.join(audio_savedir, new_filename, new_filename_all_convert)
        f.save(file_path)

        start_time = time.time()
        try:
            # Format check / preprocessing: produce a 16 kHz wav for the ASR service.
            if audio_format == ".wav":
                check_wav_and_convert_to_16khz(file_path, file_path_convert)
            elif audio_format == ".m4a":
                convert_m4a_to_wav_16khz(file_path, file_path_convert)
            else:
                # Bug fix: an unsupported extension previously fell through and
                # crashed on the missing converted file; report it explicitly.
                return json.dumps({"error": 2, "msg": f"unsupported audio format: {audio_format}"},
                                  ensure_ascii=False)

            result = loop.run_until_complete(handler.run(file_path_convert))
            end_time = time.time()
            result = result["result"]
            logger.info(f"\n\n检测用时{round((end_time -start_time ))}秒,asr websocket client finished : {result}")
            # json.dumps yields valid JSON even when the transcript contains
            # quote characters (str(dict).replace("'", '"') broke on such text).
            return json.dumps({"code": 0, "msg": "success", "result": result}, ensure_ascii=False)

        except Exception as e:
            logger.error(f"asr websocket client error : {e}")
            return json.dumps({"error": 1, "msg": str(e)}, ensure_ascii=False)
        finally:
            # Clean up the per-upload directory. Deletion is intentionally left
            # disabled (useful for debugging); re-enable when ready.
            if os.path.exists(os.path.join(audio_savedir, new_filename)):
                # shutil.rmtree(os.path.join(audio_savedir, new_filename))
                pass

    return json.dumps({"error": 3, "msg": "audio is None!"}, ensure_ascii=False)



# Helper for long-audio recognition: run one (chunk of an) audio file through
# the ASR websocket service and return the transcript as JSON text.
def recongition_long_audio_TOOL(file_path_convert):
    """Recognize a single 16 kHz wav file via the shared websocket handler.

    Args:
        file_path_convert (str): path to a 16 kHz wav file.

    Returns:
        str: JSON text ``{"code": 0, "msg": "success", "result": <transcript>}``.
    """
    # Bug fix: elapsed time was computed as (end_time - 0), which logged the
    # Unix epoch timestamp instead of the actual processing duration.
    start_time = time.time()
    result = loop.run_until_complete(handler.run(file_path_convert))
    end_time = time.time()
    result = result["result"]
    logger.info(f"\n\n检测用时{round((end_time - start_time))}秒,asr websocket client finished : {result}")
    # json.dumps yields valid JSON even when the transcript contains quotes
    # (the old str(dict).replace("'", '"') approach corrupted such text and
    # could break the caller's json.loads).
    return json.dumps({"code": 0, "msg": "success", "result": result}, ensure_ascii=False)


# Split an audio file into fixed-length chunk files on disk.
def read_wave_(wavfile_path, TIME_segmentation):
    """Split a 16 kHz wav file into TIME_segmentation-second chunk files.

    Each chunk is written next to the source file as ``<name>_000000.wav``,
    ``<name>_000001.wav``, ...; the final chunk is zero-padded to the full
    chunk length. (The original docstring claimed this function *yields* pcm
    data — it does not; it writes files as a side effect and returns None.)

    Args:
        wavfile_path (str): path to a 16 kHz, int16 wav file.
        TIME_segmentation (int): chunk length in seconds.

    Raises:
        ValueError: if the file is not sampled at 16 kHz.
    """
    samples, sample_rate = sf.read(wavfile_path, dtype='int16')  # audio data + sample rate
    x_len = len(samples)  # total number of samples
    if sample_rate != 16000:
        # Downstream ASR expects 16 kHz input; a bare assert would be stripped
        # under `python -O`, so raise explicitly.
        raise ValueError(f"expected 16 kHz audio, got {sample_rate} Hz")

    # Samples per chunk: TIME_segmentation seconds at the file's sample rate
    # (sample_rate / 1000 converts samples-per-second to samples-per-ms).
    chunk_size = int(TIME_segmentation * 1000 * sample_rate / 1000)

    # Zero-pad so the total length is an exact multiple of chunk_size.
    padding_len_x = (-x_len) % chunk_size
    padding = np.zeros((padding_len_x,), dtype=samples.dtype)
    padded_x = np.concatenate([samples, padding], axis=0)

    # Integer division: number of TIME_segmentation-second chunks.
    num_chunk = (x_len + padding_len_x) // chunk_size

    directory = os.path.dirname(wavfile_path)                     # directory of the source file
    file_name_tuple = os.path.basename(wavfile_path).split(".")   # (stem, extension)
    for i in range(num_chunk):
        start = i * chunk_size
        x_chunk = padded_x[start:start + chunk_size]

        # e.g. foo.wav -> foo_000003.wav; zero-padding keeps lexical order == numeric order.
        file_name = file_name_tuple[0] + "_" + str(i).zfill(6) + "." + file_name_tuple[1]
        output_file = os.path.join(directory, file_name)
        sf.write(output_file, x_chunk, sample_rate)
        

# Long-audio recognition endpoint: audio longer than TIME_segmentation seconds
# is split into chunk files that are recognized sequentially and re-joined.
@app.route("/recognition_long_audio", methods=['POST'])
def recognition_long_audio():
    """Recognize one uploaded audio file of arbitrary length.

    Expects a multipart form field named ``audio``. Returns JSON text:
    ``{"code": 0, "msg": "success", "result": <transcript>}`` on success,
    or ``{"error": <n>, "msg": <reason>}`` on failure.
    """
    f = request.files['audio']
    if f:

        audio_savedir = "./audio_savedir"
        if not os.path.exists(audio_savedir):
            os.makedirs(audio_savedir)

        # Lower-cased extension of the upload (e.g. ".wav", ".m4a").
        audio_format = (os.path.splitext(f.filename)[1]).lower()

        # Random name avoids collisions between concurrent uploads.
        new_filename = str(uuid.uuid4())
        new_filename_all = new_filename + os.path.splitext(f.filename)[1]  # keep original extension
        new_filename_all_convert = "new_" + new_filename + ".wav"          # converted 16 kHz wav

        # Per-upload working directory.
        os.makedirs(os.path.join(audio_savedir, new_filename))

        # Temporary paths: raw upload and its 16 kHz wav conversion.
        file_path = os.path.join(audio_savedir, new_filename, new_filename_all)
        file_path_convert = os.path.join(audio_savedir, new_filename, new_filename_all_convert)
        f.save(file_path)

        # Format check / preprocessing: produce a 16 kHz wav + its duration.
        audio_time = 0
        if audio_format == ".wav":
            audio_time = check_wav_and_convert_to_16khz(file_path, file_path_convert)
        elif audio_format == ".m4a":
            audio_time = convert_m4a_to_wav_16khz(file_path, file_path_convert)
        else:
            # Bug fix: an unsupported extension previously fell through with no
            # converted file and failed later; report it explicitly.
            return json.dumps({"error": 2, "msg": f"unsupported audio format: {audio_format}"},
                              ensure_ascii=False)

        TIME_segmentation = 40        # chunking threshold / chunk length, seconds
        if audio_time > TIME_segmentation:
            # Split the converted wav into chunk files next to it.
            read_wave_(file_path_convert, TIME_segmentation)

            directory = os.path.dirname(file_path_convert)
            file_name_tuple = os.path.basename(file_path_convert).split(".")

            # Collect the generated chunk files and sort them numerically by
            # their zero-padded index suffix.
            pattern = os.path.join(directory, file_name_tuple[0] + "_*.wav")
            matching_files = glob.glob(pattern)
            sorted_files = sorted(matching_files, key=lambda x: int(os.path.splitext(os.path.basename(x))[0].split('_')[-1]))

            info_list = []  # per-chunk transcripts, in order
            try:
                for index, file in enumerate(sorted_files):
                    info_temp = recongition_long_audio_TOOL(file)  # recognize one chunk
                    message = json.loads(info_temp)["result"]
                    # Drop the trailing punctuation of every chunk but the last
                    # so the joined transcript reads as one continuous text.
                    if index != (len(sorted_files) - 1) and len(sorted_files) != 1:
                        message = message[:-1]
                    info_list.append(message)
                # Bug fix: the original returned the undefined name `result`
                # here (NameError); join the chunk transcripts instead.
                return json.dumps({"code": 0, "msg": "success", "result": "".join(info_list)},
                                  ensure_ascii=False)
            except Exception as e:
                logger.error(f"asr websocket client error : {e}")
                return json.dumps({"error": 1, "msg": str(e)}, ensure_ascii=False)
            finally:
                # Clean up the per-upload directory. Deletion is intentionally
                # left disabled (useful for debugging); re-enable when ready.
                if os.path.exists(os.path.join(audio_savedir, new_filename)):
                    # shutil.rmtree(os.path.join(audio_savedir, new_filename))
                    pass

        else:
            try:
                # Short audio: recognize the whole converted file in one call.
                return recongition_long_audio_TOOL(file_path_convert)
            except Exception as e:
                logger.error(f"asr websocket client error : {e}")
                return json.dumps({"error": 1, "msg": str(e)}, ensure_ascii=False)
            finally:
                # Same deliberately-disabled cleanup as above.
                if os.path.exists(os.path.join(audio_savedir, new_filename)):
                    # shutil.rmtree(os.path.join(audio_savedir, new_filename))
                    pass

    return json.dumps({"error": 3, "msg": "audio is None!"}, ensure_ascii=False)



@app.route('/')
def home():
    """Serve the demo front page."""
    page = render_template("index.html")
    return page


if __name__ == '__main__':
    

    # To serve over HTTPS, pass ssl_context=('cert.pem', 'key.pem') as below.
    # app.run(host="0.0.0.0", port=8088,ssl_context=('cert.pem', 'key.pem'))
    app.run(host="0.0.0.0", port=8088)
