emotion2vec模型部署-CSDN博客

本文链接：https://blog.csdn.net/2301_80441062/article/details/147946793

前言：本项目用于识别说话人的情绪：先对语音本身做情绪分析，再把语音转成文字，最后综合两者给出情绪分析结果。本文详细介绍了在本地部署 emotion2vec 模型的实现，并分享一些我在专业学习中学到的方法。

一.大致理论流程

1.数据预处理

这里使用的是一个简单易用的 Python 音频处理库pydub。将音频转换为 16kHz 单声道 WAV。

这部分可以交给 AI 生成，给它的指令是：“这是我的代码，现在用 pydub 库帮我修改代码，让它能把各种格式的音频转换为 16kHz 单声道 WAV。”

2.实时录音

我这里用的是基于 PortAudio 库，支持多种操作系统（包括 Windows、macOS 和 Linux），能够处理包含音频信号的 NumPy 数组的sounddevice库。

3.构建mysql数据库

博主也是刚刚接触后端的东西，这部分也只是让ai写了个能用的。

注意自己要先下载mysql，网上教程很多的。

4.识别特定人的身份

用的是MFCC梅尔频率倒谱系数

5.语音转文字

我先用的whisper库，后面觉得效果不是很好，了解到whisper库主要专注于语音到文本的转换，情感识别不是它主要功能。所以换了SenseVoice Small模型。

6.情感分析输出

emotion2vec_plus_base模型识别音频输出

7.用户界面

这里只用了 tkinter 和 matplotlib 库来简单构建界面，用 librosa、numpy、prettytable、threading 库来呈现情感结果。

8.管理文件包

因为代码都被写在一个py文件中被老师说了，所以这个项目了解了一下用setuptools构建框架。

二.代码部分

1.项目结构

EmotionAnalysis/
│
├── EmotionAnalysis/ # 项目代码目录
│ ├── __init__.py
│ ├── main.py # 主程序入口
│ ├── utils.py # 辅助工具模块
│ └── models.py # 模型定义模块
│
├── tests/ # 测试代码目录
│ ├── __init__.py
│ └── test_main.py
│
├── setup.py # setuptools 配置文件
├── README.md # 项目说明文件
├── requirements.txt # 依赖项文件
└── data/ # 数据文件目录

2.创建项目文件和目录结构

在终端中运行以下命令：

# Create the project root, the inner Python package, the tests and the data dir.
mkdir -p EmotionAnalysis/EmotionAnalysis
mkdir -p EmotionAnalysis/tests
mkdir -p EmotionAnalysis/data
# Package modules belong to the INNER EmotionAnalysis/ directory so that
# `find_packages()` and `from EmotionAnalysis.utils import ...` can locate them.
touch EmotionAnalysis/EmotionAnalysis/__init__.py
touch EmotionAnalysis/EmotionAnalysis/main.py
touch EmotionAnalysis/EmotionAnalysis/utils.py
touch EmotionAnalysis/EmotionAnalysis/models.py
touch EmotionAnalysis/tests/__init__.py
touch EmotionAnalysis/tests/test_main.py
# Packaging and documentation files live in the project root.
touch EmotionAnalysis/setup.py
touch EmotionAnalysis/README.md
touch EmotionAnalysis/requirements.txt

3.配置 MySQL 用户和数据库

注意：（在终端运行）

登录到 MySQL：

mysql -u root -p

输入密码后，进入 MySQL 命令行。

创建emotion_analysis_db:

-- Idempotent, and utf8mb4 so that the Chinese transcripts/analyses are stored intact.
CREATE DATABASE IF NOT EXISTS emotion_analysis_db CHARACTER SET utf8mb4;

4.编写 `main.py` 文件

import json
import os
import threading
import tkinter as tk
from tkinter import filedialog, messagebox

import librosa
import matplotlib.pyplot as plt
import sounddevice as sd
import soundfile
import wavio
from prettytable import PrettyTable

from EmotionAnalysis.models import emotion_model, sense_voice_model, generate_text
# convert_to_wav is defined in utils.py (not models.py) in this project.
from EmotionAnalysis.utils import connect_to_database, create_table, insert_analysis_result, convert_to_wav
# NOTE(review): plot_confusion_matrix is not defined in the utils.py listing of
# this article; confirm it exists, otherwise this import raises ImportError.
from EmotionAnalysis.utils import plot_confusion_matrix

class EmotionApp(tk.Tk):
    """Tkinter front end for speech emotion recognition plus LLM analysis.

    The window lets the user pick an audio file or record from the
    microphone, runs speech recognition and emotion recognition on it in a
    background thread, asks the large language model for a psychological
    analysis, and shows the combined result. Every widget update is
    marshalled back to the Tk main thread.
    """

    # Scores below this fraction are reported as negligible in the prompt and
    # left out of the pie chart.
    SCORE_THRESHOLD = 0.01
    # Microphone capture settings used in "record" mode.
    RECORD_SECONDS = 5
    SAMPLE_RATE = 16000

    def __init__(self):
        super().__init__()
        self.title("情绪识别与心理分析")
        self.geometry("900x900")

        # State shared between the worker thread and the UI callbacks.
        self.wav_file = None
        self.asr_text = ""
        self.last_emotion_data = None

        # Models are loaded once at import time by EmotionAnalysis.models.
        self.emotion_model = emotion_model
        self.sense_voice_model = sense_voice_model

        self.create_widgets()

        # The database is optional: connect_to_database() returns None on
        # failure, and the application must still start in that case.
        self.conn = connect_to_database()
        if self.conn is not None:
            create_table(self.conn)

    def destroy(self):
        """Close the database connection (if any) before tearing down the window."""
        conn = self.__dict__.get("conn")
        if conn is not None:
            try:
                conn.close()
            except Exception as exc:  # best effort: never block shutdown
                print(f"关闭数据库连接失败: {exc}")
            self.conn = None
        super().destroy()

    def create_widgets(self):
        """Build the mode selector, file picker, action buttons and output area."""
        # Mode selection: existing file vs. live recording.
        mode_frame = tk.LabelFrame(self, text="选择模式")
        mode_frame.pack(padx=10, pady=5, fill="x")

        self.mode_var = tk.StringVar(value="file")
        tk.Radiobutton(mode_frame, text="使用已有音频文件", variable=self.mode_var, value="file").pack(anchor='w')
        tk.Radiobutton(mode_frame, text="实时录音", variable=self.mode_var, value="record").pack(anchor='w')

        # File path entry with a browse button.
        file_frame = tk.Frame(self)
        file_frame.pack(padx=10, pady=5, fill="x")

        self.file_entry = tk.Entry(file_frame, width=50)
        self.file_entry.pack(side="left", expand=True, fill="x")
        tk.Button(file_frame, text="浏览", command=self.select_file).pack(side="left", padx=5)

        # Start button.
        tk.Button(self, text="开始分析", command=self.start_analysis).pack(pady=10)

        # Result output area.
        self.result_text = tk.Text(self, height=40, width=80)
        self.result_text.pack(padx=10, pady=5)

        # Chart buttons stay disabled until an analysis has produced data.
        self.plot_button = tk.Button(self, text="显示波形图", command=self.plot_waveform, state=tk.DISABLED)
        self.plot_button.pack(pady=5)

        self.pie_button = tk.Button(self, text="显示情绪分布图", command=self.plot_emotion_pie_ui, state=tk.DISABLED)
        self.pie_button.pack(pady=5)

    def select_file(self):
        """Open a file dialog and copy the chosen path into the entry box."""
        file_path = filedialog.askopenfilename(
            filetypes=[
                ("Audio Files", "*.wav *.mp3 *.flac *.ogg *.m4a"),
                ("WAV files", "*.wav"),
                ("MP3 files", "*.mp3"),
                ("FLAC files", "*.flac"),
                ("OGG files", "*.ogg"),
                ("M4A files", "*.m4a"),
            ]
        )
        if file_path:
            self.file_entry.delete(0, tk.END)
            self.file_entry.insert(0, file_path)

    def start_analysis(self):
        """Reset the output widgets and launch the analysis in a worker thread."""
        self.result_text.delete(1.0, tk.END)
        self.plot_button.config(state=tk.DISABLED)
        self.pie_button.config(state=tk.DISABLED)

        # Read the Tk variables here, on the main thread, and hand plain
        # Python values to the worker.
        mode = self.mode_var.get()
        file_path = self.file_entry.get()

        # daemon=True so a long-running analysis cannot keep the process
        # alive after the window has been closed.
        threading.Thread(
            target=self.perform_analysis,
            args=(mode, file_path),
            daemon=True,
        ).start()

    def perform_analysis(self, mode=None, file_path=None):
        """Run the full pipeline: audio -> text -> emotion -> LLM analysis.

        Intended to run in a background thread. Errors are reported to the
        user through message boxes scheduled on the main thread.

        Args:
            mode: "file" or "record". When None the value is read from the
                mode selector (kept for callers that pass no arguments).
            file_path: Path typed/selected by the user. When None it is read
                from the entry widget.
        """
        try:
            if mode is None:
                mode = self.mode_var.get()
            if file_path is None:
                file_path = self.file_entry.get()

            wav_file = self._prepare_audio(mode, file_path)
            if wav_file is None:
                return
            self.wav_file = wav_file

            # Speech recognition with the SenseVoiceSmall model.
            try:
                speech, _ = soundfile.read(wav_file)
                asr_res = self.sense_voice_model.generate(input=speech)
                self.asr_text = asr_res[0]['text'] if asr_res else "未识别到有效文本"
            except Exception as e:
                self.asr_text = "语音识别失败"
                # Build the message now: `e` is unbound once the except block ends.
                self._show_error(f"语音识别失败: {str(e)}")
                return

            # Emotion recognition (run exactly once per analysis).
            try:
                res = self.emotion_model.generate(
                    wav_file,
                    output_dir="./outputs",
                    granularity="utterance",
                    extract_embedding=False,
                )
            except Exception as e:
                self._show_error(f"情绪识别失败: {str(e)}")
                return
            if not res:
                self._show_error("未能获取情绪识别结果")
                return
            result_data = res[0]

            formatted_result = self.format_results(result_data)
            emotion_labels = result_data['labels']
            emotion_scores = result_data['scores']

            emotion_details = self._format_emotion_details(
                emotion_labels, emotion_scores, self.SCORE_THRESHOLD
            )
            prompt = self._build_prompt(self.asr_text, emotion_details)

            # Ask the large language model for the psychological analysis.
            try:
                psychological_analysis = generate_text(prompt)
            except Exception as e:
                psychological_analysis = f"心理分析失败: {str(e)}"

            full_result = (
                f"{formatted_result}\n\n语音内容：{self.asr_text}"
                f"\n\n心理分析:\n{psychological_analysis}"
            )
            self._on_main_thread(
                self.update_ui_with_results, full_result, emotion_labels, emotion_scores
            )
            self._on_main_thread(
                self._save_result,
                wav_file,
                self.asr_text,
                emotion_labels,
                emotion_scores,
                psychological_analysis,
            )
        finally:
            # Close chart windows left over from a previous run. Matplotlib's
            # GUI objects belong to the main thread, so schedule it there.
            self._on_main_thread(plt.close, 'all')

    def _prepare_audio(self, mode, file_path):
        """Return the path of a WAV file to analyse, or None after reporting an error."""
        if mode == "file":
            if not file_path or not os.path.exists(file_path):
                self._show_error("请输入有效的音频文件路径")
                return None
            if file_path.lower().endswith(".wav"):
                return file_path
            # Anything that is not already WAV goes through the converter.
            self.status("正在转换音频格式...")
            converted_file = convert_to_wav(file_path)
            if not converted_file:
                self._show_error("音频格式转换失败")
                return None
            return converted_file

        if mode == "record":
            self.status("开始录音...")
            audio = sd.rec(
                int(self.RECORD_SECONDS * self.SAMPLE_RATE),
                samplerate=self.SAMPLE_RATE,
                channels=1,
            )
            sd.wait()
            self.status("录音结束")
            recording_path = "temp_recording.wav"
            wavio.write(recording_path, audio, self.SAMPLE_RATE, sampwidth=2)
            return recording_path

        # Unknown mode: fall back to whatever file was analysed last, if any.
        return self.wav_file

    def _on_main_thread(self, callback, *args):
        """Schedule ``callback(*args)`` on the Tk event loop from any thread."""
        try:
            self.after(0, callback, *args)
        except (RuntimeError, tk.TclError):
            # The window was destroyed while the worker was still running;
            # there is no UI left to update.
            pass

    def _show_error(self, message):
        """Show an error dialog on the main thread."""
        self._on_main_thread(messagebox.showerror, "错误", message)

    def _save_result(self, audio_file, asr_text, labels, scores, analysis):
        """Persist one analysis to MySQL; a no-op when no connection exists."""
        if self.conn is None:
            return
        try:
            insert_analysis_result(
                self.conn,
                audio_file,
                asr_text,
                # The table stores labels/scores as TEXT, so serialise the lists.
                json.dumps(list(labels), ensure_ascii=False),
                json.dumps([float(score) for score in scores]),
                analysis,
            )
        except Exception as exc:
            # Saving is best effort; the result is already on screen.
            print(f"保存分析结果失败: {exc}")

    @staticmethod
    def _format_emotion_details(labels, scores, threshold=0.01):
        """Render ``label(percent)`` pairs joined by the Chinese enumeration comma.

        Scores below ``threshold`` are shown as "below threshold"; the bound
        printed is derived from ``threshold`` so the text always matches it.
        """
        below = f"(<{threshold * 100:.1f}%)"
        return "、".join(
            f"{label}({score * 100:.1f}%)" if score >= threshold else f"{label}{below}"
            for label, score in zip(labels, scores)
        )

    @staticmethod
    def _build_prompt(asr_text, emotion_details):
        """Compose the instruction sent to the large language model."""
        return (
            f"用户语音内容为：{asr_text}\n"
            f"当前的情绪状态包含以下成分：{emotion_details}。\n"
            "请根据这些信息，进行综合的心理分析与评估。\n"
            "要求如下：\n"
            "1. 分析语音内容所表达的核心含义或意图。请关注语音中的关键词和语句结构，推断用户可能想要表达的主要思想或需求。\n"
            "2. 解释每种情绪可能对应的心理状态或诱因。结合情绪标签和语音内容，分析每种情绪的可能来源和背后的心理动机。\n"
            "3. 结合语义和情绪判断整体心理状态。综合语音内容和情绪分析结果，评估用户当前的整体心理状态，包括情绪稳定性、心理压力水平等。\n"
            "4. 提供具体的建议帮助用户改善心理状态。根据分析结果，给出针对性的建议，帮助用户缓解负面情绪或增强积极情绪。建议应具体、可行，避免过于笼统。\n"
            "5. 语言应保持温和、支持性，避免主观臆断。分析结果应基于数据和事实，避免过度解读或主观臆断，确保语言温和、支持性。\n"
            "请以中文输出分析结果，结构清晰，便于用户理解。\n"
            "注意事项：\n"
            "1. 请确保分析结果基于当前输入的语音和情绪数据，避免引用或提及历史分析结果。\n"
            "2. 请确保分析结果客观、中立，避免带有个人情感色彩。\n"
            "3. 请确保分析结果具有可操作性，用户可以根据建议采取行动。\n"
        )

    def update_ui_with_results(self, full_result, emotion_labels, emotion_scores):
        """Display the final text and enable the chart buttons (main thread)."""
        self.result_text.delete(1.0, tk.END)
        self.result_text.insert(tk.END, full_result)
        self.last_emotion_data = (emotion_labels, emotion_scores)
        self.plot_button.config(state=tk.NORMAL)
        self.pie_button.config(state=tk.NORMAL)

    def format_results(self, result):
        """Return a PrettyTable listing every predicted label and its score."""
        table = PrettyTable()
        table.field_names = ["预测种类", "概率"]
        for label, score in zip(result['labels'], result['scores']):
            table.add_row([label, f"{score:.6f}"])
        return table

    def plot_waveform(self):
        """Plot the waveform of the most recently analysed audio file."""
        if not self.wav_file:
            return

        try:
            # sr=None keeps the file's native sample rate.
            data, sr = librosa.load(self.wav_file, sr=None)
        except Exception as e:
            messagebox.showerror("错误", f"无法加载音频文件: {e}")
            return

        plt.figure(figsize=(10, 4))
        plt.plot(data)
        plt.title("音频波形图")
        plt.xlabel("样本点")
        plt.ylabel("振幅")
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    def plot_emotion_pie_ui(self):
        """Button handler: draw the pie chart for the last analysis, if any."""
        # The attribute always exists (set to None in __init__), so test its
        # value rather than its presence.
        if self.last_emotion_data is None:
            messagebox.showerror("错误", "没有可用的情绪数据")
            return
        labels, scores = self.last_emotion_data
        self.plot_emotion_pie(labels, scores)

    def plot_emotion_pie(self, labels, scores):
        """Draw a pie chart of all emotions whose score reaches the threshold."""
        filtered_data = [
            (label, score)
            for label, score in zip(labels, scores)
            if score >= self.SCORE_THRESHOLD
        ]

        if not filtered_data:
            messagebox.showwarning("警告", "没有足够强度的情绪数据来生成饼图")
            return

        labels_filtered, scores_filtered = zip(*filtered_data)

        plt.figure(figsize=(6, 6))
        plt.pie(scores_filtered, labels=labels_filtered, autopct='%1.1f%%', startangle=140)
        plt.title('情绪分布')
        plt.axis('equal')
        plt.tight_layout()
        plt.show()

    def status(self, msg):
        """Report progress; currently just prints to stdout."""
        print(msg)

def main():
    """Create the application window and run the Tk event loop.

    Exposed as a function so that a ``module:main`` console-script entry
    point can call it.
    """
    app = EmotionApp()
    app.mainloop()


if __name__ == "__main__":
    main()

5.编写 `utils.py` 文件

import mysql.connector
from mysql.connector import Error
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report
import librosa
import soundfile
import sounddevice as sd
import wavio
from prettytable import PrettyTable

import os

# Database connection settings. Each value can be overridden through an
# environment variable so that credentials need not be committed to source
# control; the defaults keep the original local-development values.
db_config = {
    'host': os.environ.get('EMOTION_DB_HOST', 'localhost'),
    'user': os.environ.get('EMOTION_DB_USER', 'root'),
    'password': os.environ.get('EMOTION_DB_PASSWORD', '273314'),
    'database': os.environ.get('EMOTION_DB_NAME', 'emotion_analysis_db'),
}

def connect_to_database():
    """Open a MySQL connection using ``db_config``.

    Returns:
        The live connection, or None when connecting raised a connector
        ``Error`` or the connection does not report itself as connected.
    """
    try:
        handle = mysql.connector.connect(**db_config)
        alive = handle.is_connected()
    except Error as exc:
        print(f"连接 MySQL 时出错: {exc}")
        return None

    if not alive:
        return None

    print("成功连接到 MySQL 数据库")
    return handle

def create_table(conn):
    """Create the ``emotion_analysis`` table if it does not exist yet.

    Args:
        conn: An open database connection exposing ``cursor()`` and
            ``commit()``.

    Raises:
        Whatever the driver raises from ``execute``/``commit``; the cursor is
        closed in every case.
    """
    create_table_query = """
    CREATE TABLE IF NOT EXISTS emotion_analysis (
        id INT AUTO_INCREMENT PRIMARY KEY,
        audio_file VARCHAR(255) NOT NULL,
        asr_text TEXT NOT NULL,
        emotion_labels TEXT NOT NULL,
        emotion_scores TEXT NOT NULL,
        psychological_analysis TEXT NOT NULL,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
    )
    """
    cursor = conn.cursor()
    try:
        cursor.execute(create_table_query)
        conn.commit()
    finally:
        # Release the cursor even when the statement fails.
        cursor.close()

def insert_analysis_result(conn, audio_file, asr_text, emotion_labels, emotion_scores, psychological_analysis):
    """Insert one analysis record into the ``emotion_analysis`` table.

    Args:
        conn: An open database connection exposing ``cursor()`` and ``commit()``.
        audio_file: Path of the analysed audio file.
        asr_text: Recognised speech text.
        emotion_labels: Emotion labels, either already a string or a
            list/tuple that is stored as a JSON array.
        emotion_scores: Emotion scores, either already a string or a
            list/tuple that is stored as a JSON array.
        psychological_analysis: Analysis text produced by the language model.

    Raises:
        Whatever the driver raises from ``execute``/``commit``; the cursor is
        closed in every case.
    """
    import json  # local import keeps this function self-contained

    def _as_text(value):
        # The columns are TEXT and a list cannot be bound as a single SQL
        # parameter, so serialise anything that is not already a string.
        # default=float converts non-builtin numeric scalars (presumably
        # NumPy floats from the model - confirm against the caller).
        if isinstance(value, str):
            return value
        return json.dumps(value, ensure_ascii=False, default=float)

    insert_query = """
    INSERT INTO emotion_analysis (audio_file, asr_text, emotion_labels, emotion_scores, psychological_analysis)
    VALUES (%s, %s, %s, %s, %s)
    """
    params = (
        audio_file,
        asr_text,
        _as_text(emotion_labels),
        _as_text(emotion_scores),
        psychological_analysis,
    )
    cursor = conn.cursor()
    try:
        cursor.execute(insert_query, params)
        conn.commit()
    finally:
        # Release the cursor even when the statement fails.
        cursor.close()


def convert_to_wav(input_path, output_path="temp_audio.wav"):
    """Convert an audio file of any supported format to 16 kHz mono WAV.

    Args:
        input_path: Path of the source audio file.
        output_path: Where the converted WAV file is written.

    Returns:
        ``output_path`` on success, or None when the conversion failed for
        any reason (the failure is printed).
    """
    try:
        # Imported here because this module's import block does not bring
        # AudioSegment into scope; without it every call failed with NameError.
        from pydub import AudioSegment

        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(16000).set_channels(1)
        audio.export(output_path, format="wav")
        return output_path
    except Exception as e:
        print(f"音频转换失败: {e}")
        return None

6. 编写 `models.py` 文件

注意：代码里的 api_key 和 base_url 需要自己去阿里云百炼平台申请后填入。

from funasr import AutoModel

# Load both models once, at import time, so every importer shares the same
# instances: emotion2vec+ base for emotion recognition and SenseVoiceSmall
# for speech-to-text.
emotion_model = AutoModel(model="iic/emotion2vec_plus_base")
sense_voice_model = AutoModel(model="iic/SenseVoiceSmall")

def generate_text(prompt):
    """Ask the DeepSeek model (through DashScope) for a psychological analysis.

    The API key and base URL are read from the ``DASHSCOPE_API_KEY`` and
    ``DASHSCOPE_BASE_URL`` environment variables when set; otherwise the
    placeholder values below are used and must be replaced by hand.

    Args:
        prompt: The user message sent to the model.

    Returns:
        The model's reply text, or a string starting with "心理分析失败: "
        when the request could not be completed. This function does not raise.
    """
    try:
        # Imported here because this module's import block does not bring
        # these names into scope; without OpenAI every call failed with
        # NameError and only ever returned the failure message.
        import os
        from openai import OpenAI

        client = OpenAI(
            api_key=os.environ.get("DASHSCOPE_API_KEY", 'your_api_key'),
            base_url=os.environ.get("DASHSCOPE_BASE_URL", "your_base_url"),
        )
        response = client.chat.completions.create(
            model="deepseek-v3",
            messages=[
                {"role": "system", "content": "你是一个专业的心理咨询师，擅长情绪分析与心理疏导"},
                {"role": "user", "content": prompt},
            ],
            stream=False,
        )
        return response.choices[0].message.content
    except Exception as e:
        print("调用大模型失败:", str(e))
        return f"心理分析失败: {str(e)}"

7.编写 `setup.py` 文件

from pathlib import Path

from setuptools import setup, find_packages

# Read the long description with an explicit encoding (the platform default
# is not UTF-8 everywhere) and without leaving a file handle open. A missing
# README must not break installation.
_readme = Path(__file__).with_name("README.md")
_long_description = _readme.read_text(encoding="utf-8") if _readme.exists() else ""

setup(
    name="EmotionAnalysis",  # project name
    version="0.1.0",         # project version
    author="Your Name",      # author name
    author_email="your.email@example.com",  # author e-mail
    description="A brief description of the project",  # short description
    long_description=_long_description,  # detailed description
    long_description_content_type="text/markdown",  # description format
    url="https://github.com/yourusername/EmotionAnalysis",  # project home page
    packages=find_packages(),  # discover packages automatically
    install_requires=[  # runtime dependencies
        "numpy",
        "matplotlib",
        "librosa",
        "pydub",
        "openai",
        "mysql-connector-python",
        "prettytable",
        "scikit-learn",
        "sounddevice",
        "wavio",
        "funasr",     # models.py: `from funasr import AutoModel`
        "soundfile",  # main.py / utils.py: `soundfile.read`
    ],
    classifiers=[  # trove classifiers
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',  # minimum Python version
    entry_points={  # console scripts
        'console_scripts': [
            # NOTE: requires EmotionAnalysis/main.py to define a callable `main`.
            'emotion_analysis=EmotionAnalysis.main:main',
        ],
    },
)