环境准备
首先准备一台显存>=12G的服务器,我这里选用的是4090 24G云服务器
接着先拉取官方代码,然后创建一个虚拟环境,再安装其对应的依赖库
git clone https://github.com/stepfun-ai/Step-Audio.git
conda create -n stepaudio python=3.10
conda activate stepaudio
cd Step-Audio
pip install -r requirements.txt
接着下载部署所需的模型。由于我们只部署Step-Audio-TTS-3B模型,所以不需要下载全部模型,仅需下载Step-Audio-Tokenizer和Step-Audio-TTS-3B这两个模型即可。
模型下载方面,我们可以从ModelScope上进行下载:首先安装其下载器,然后前往对应的模型页,复制下载命令并执行即可。
pip install modelscope
modelscope download --model stepfun-ai/Step-Audio-TTS-3B --local_dir ./model/Step-Audio-TTS-3B
modelscope download --model stepfun-ai/Step-Audio-Tokenizer --local_dir ./model/Step-Audio-Tokenizer
脚本部署
这里已经准备好了部署脚本,只需将其中的密钥和ip、端口改为自己的,即可直接部署、快速使用。注意:脚本默认从/data/coding/model读取模型,如果模型下载在其他目录(例如上面的./model),请通过环境变量MODEL_PATH指定模型所在目录。
import os
import uuid
import time
import asyncio
from enum import Enum
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, status, Depends, Request
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel, Field, validator, ValidationError, field_validator
from typing import Optional, Dict, List
from loguru import logger
import torchaudio
import aiohttp
import base64
from tts import StepAudioTTS
from tokenizer import StepAudioTokenizer
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
from fastapi.responses import JSONResponse
from fastapi.exceptions import RequestValidationError
# Bearer-token authentication scheme. With auto_error=False the dependency
# resolves to None when no bearer credentials are supplied, instead of
# FastAPI rejecting the request on its own.
security = HTTPBearer(auto_error=False)
# Accepted API keys. A comma-separated list is read from the
# STEP_AUDIO_API_KEYS environment variable so the secret does not have to
# live in the source file. When the variable is unset, the placeholder below
# is used and must be replaced with a real key before deployment.
valid_api_keys = {
    key.strip()
    for key in os.getenv("STEP_AUDIO_API_KEYS", "你的密钥").split(",")
    if key.strip()
}
class TaskStatus(str, Enum):
    """Enumeration of the states a task can be in.

    The ``str`` mix-in makes every member an instance of ``str`` that
    compares equal to its value (e.g. ``TaskStatus.FAILED == "Failed"``).
    """

    IN_QUEUE = "InQueue"
    IN_PROGRESS = "InProgress"
    SUCCEEDED = "Succeed"
    FAILED = "Failed"
    CANCELLED = "Cancelled"
class TaskData(BaseModel):
    """Record kept for a single task.

    Only ``audio_type`` and ``params`` are required; every other field has
    a default.
    """

    # New records start in the IN_QUEUE state.
    status: TaskStatus = TaskStatus.IN_QUEUE
    audio_type: str  # "common", "music", "clone"
    params: dict
    # Unix time in whole seconds, taken when the record is created.
    created_at: int = Field(default_factory=lambda: int(time.time()))
    # The remaining fields default to None.
    # NOTE(review): presumably filled in by the task processor as the task
    # progresses; that code is not visible here, so confirm.
    started_at: Optional[int] = None
    completed_at: Optional[int] = None
    download_url: Optional[str] = None
    reason: Optional[str] = None
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: initialise models and task state, then clean up.

    Startup (before ``yield``):
      * builds the tokenizer and the TTS engine from the directory named by
        the ``MODEL_PATH`` environment variable (default
        ``/data/coding/model``) and stores them on ``app.state``;
      * creates the task table, pending queue, lock and semaphore on
        ``app.state``;
      * launches ``task_processor()`` as a background asyncio task.

    Shutdown (``finally``): drops the model references and asks PyTorch to
    release its cached CUDA memory. The ``finally`` clause also runs when
    startup itself fails.

    Args:
        app: The FastAPI application whose ``state`` is populated.
    """
    try:
        # Load the models.
        model_path = os.getenv("MODEL_PATH", "/data/coding/model")
        app.state.encoder = StepAudioTokenizer(
            os.path.join(model_path, "Step-Audio-Tokenizer")
        )
        app.state.tts_engine = StepAudioTTS(
            os.path.join(model_path, "Step-Audio-TTS-3B"), app.state.encoder
        )
        # Set up the task bookkeeping structures.
        app.state.tasks: Dict[str, TaskData] = {}
        app.state.pending_queue: List[str] = []
        app.state.task_lock = asyncio.Lock()
        app.state.semaphore = asyncio.Semaphore(5)  # concurrency limit
        # Placeholder: replace with the real "ip:port" of this deployment.
        app.state.base_url = "ip+端口"
        # Start the background processor. The event loop only keeps a weak
        # reference to tasks, so hold a strong one on app.state to keep the
        # task from being garbage-collected while it is still running.
        app.state.processor_task = asyncio.create_task(task_processor())
        print("✅ 应用初始化完成")
        yield
    finally:
        # Release resources.
        app.state.encoder = None
        app.state.tts_engine = None
        # The file imports torchaudio but never torch itself, so import it
        # here; otherwise this line raises NameError during shutdown.
        import torch

        torch.cuda.empty_cache()
app = FastAPI(lifespan=lifespan)
# StaticFiles checks that its directory exists when it is constructed and
# raises if it does not, so make sure "static" is there before mounting it.
os.makedirs("static", exist_ok=True)
app.mount("/static", StaticFiles(directory="static"), name="static")
# 请求模型
class CommonTTSRequest(BaseModel):
text:str = Field(..., min_length=1, description="需要合成的文本内容")
speaker: str = Field(default="Tingting", description="仅支持 Tingting")
emotion: Optional[str] = Field(None, description="可选值: 高兴1, 高兴2, 生气1, 生气2, 悲伤1, 撒娇1")
language: Optional[str] = Field(None, description="可选值: 中文, 英文, 韩语, 日语, 四川话, 粤语, 广东话")
speed: Optional[str] = Field(None, description="可选值: 慢速1, 慢速2, 快速1, 快速2")
@field_validator("speaker")
def validate_speaker(cls, v):
if v not in ["Tingting"]:
raise ValueError(f"不支持的 speaker: {v}")
return v
@field_validator("emotion")