import requests
import time
import os
import subprocess
from pydub import AudioSegment
from question365 import _20250226_questions,_20250226_question_folder
from io import BytesIO
# Endpoint of the local CosyVoice FastAPI service (the server script below).
COSYVOICE_API = "http://127.0.0.1:8000/generate_audio/"
# api_url = "http://127.0.0.1:8000/generate_audio/"
# Voice configuration: reference wav for zero-shot cloning plus the exact
# transcript of that wav (required by CosyVoice zero-shot inference).
VOICE_PROMPT_MAPPING = {
"zh-CN-datudou": "/home/kt/myproject/CosyVoice/asset/datudou.wav",
"zh-CN-datudou-prompt":"现在所有的选手都听得见。我想对他们说:这是你们的舞台,期待他们释放真实力,让热爱绽放,为梦想起航。"
}
# Retry policy for the TTS API calls.
MAX_RETRIES = 100
RETRY_DELAY = 5 # base wait between retries, in seconds
# Batch inputs imported from question365: output folder name and title list.
story_name = _20250226_question_folder
now_questions= _20250226_questions
def clean_text(text):
    """Normalize story text for TTS.

    Strips markdown artifacts (literal "---" separators and "*" markers),
    then collapses the text onto a single line by dropping blank lines and
    trimming whitespace around each remaining line.

    Args:
        text: Raw story text, possibly containing markdown and blank lines.

    Returns:
        A single-line string with artifacts and line breaks removed.
    """
    stripped = text.replace("---", "").replace("*", "")
    # Keep only non-empty lines, trimmed, and glue them together with no
    # separator (line breaks confuse the TTS prosody).
    return "".join(line.strip() for line in stripped.splitlines() if line.strip())
def generate_story(prompt, story_name, title, save_dir="stories"):
    """Generate a children's story via the local Ollama API and save it to disk.

    Args:
        prompt: Prompt text describing the requested story.
        story_name: Batch sub-folder name created under save_dir.
        title: Story title; used as the output file name (without extension).
        save_dir: Root directory for generated stories.

    Returns:
        Path of the saved .txt file on success, None when the API call fails.
    """
    # Ensure the per-batch output directory exists.
    now_dir = f"{save_dir}/{story_name}"
    os.makedirs(now_dir, exist_ok=True)
    # Blocking, non-streaming call to the local Ollama generate endpoint.
    response = requests.post(
        "http://localhost:11434/api/generate",
        json={
            "model": "science-story:latest",
            "prompt": f"[儿童故事需求]\n{prompt}",
            "stream": False,
            "options": {
                "temperature": 0.3,   # low temperature for consistent stories
                "num_predict": 1800   # cap on generated tokens
            }
        }
    )
    if response.status_code == 200:
        story_text = response.json()["response"]
        filename = f"{now_dir}/{title}.txt"
        with open(filename, "w", encoding="utf-8") as f:
            f.write(story_text)
        # Bug fix: the original printed a literal "(unknown)" placeholder
        # here instead of the saved file path.
        print(f"✅ 成功生成故事:{filename}")
        return filename
    else:
        print(f"❌ 生成失败,状态码:{response.status_code}")
        return None
# def convert_text_to_audio(text, audio_file):
# for attempt in range(MAX_RETRIES):
# temp_audio_file = f"{audio_file}_temp.mp3"
# # 定义请求体
# data = {
# "text": f"{text}",
# "prompt_text": "现在所有的选手都听得见。我想对他们说:这是你们的舞台,期待他们释放真实力,让热爱绽放,为梦想起航。",
# "prompt_wav_path": "./asset/20250226.wav",
# "output_dir": "./output"
# }
# # 发送 POST 请求
# response = requests.post(api_url, json=data,timeout=30)
# if response.status_code == 200 and os.path.exists(temp_audio_file) and os.path.getsize(temp_audio_file) > 0:
# audio_segment = AudioSegment.from_mp3(temp_audio_file)
# os.remove(temp_audio_file) # 删除临时文件
# return audio_segment
# else:
# print(f"❌ 第 {attempt + 1} 次尝试生成临时文件 {temp_audio_file} 失败")
# # print(f"❌ 错误信息: {result.stderr}")
# if attempt < MAX_RETRIES - 1:
# print(f"等待 {RETRY_DELAY} 秒后重试...")
# time.sleep(RETRY_DELAY)
#
# print(f"❌ 达到最大重试次数,无法生成音频文件")
#
# return None
# def convert_text_to_audio(text, audio_file, voice="zh-CN-datudou"):
# for attempt in range(MAX_RETRIES):
# temp_audio_file = f"{audio_file}_temp_{attempt}.mp3"
# try:
# data = {
# "text": text,
# "prompt_wav_path": VOICE_PROMPT_MAPPING[voice],
# "prompt_text":VOICE_PROMPT_MAPPING["zh-CN-datudou-prompt"],
# "output_dir": "./temp_api_output"
# }
#
# print(f"🚀 本地发送的文本,{data['text']}")
#
#
# response = requests.post(COSYVOICE_API, json=data, timeout=30)
#
# if response.status_code == 200 and response.content:
# # 直接写入内存中的MP3数据
# with open(temp_audio_file, "wb") as f:
# f.write(response.content)
#
# # 验证音频有效性
# audio_segment = AudioSegment.from_mp3(temp_audio_file)
# os.remove(temp_audio_file)
# return audio_segment
#
# except Exception as e:
# print(f"❌ 第 {attempt + 1} 次尝试失败: {str(e)}")
# if os.path.exists(temp_audio_file):
# os.remove(temp_audio_file)
#
# if attempt < MAX_RETRIES - 1:
# backoff = RETRY_DELAY * (2 ** attempt)
# print(f"等待 {backoff} 秒后重试...")
# time.sleep(backoff)
#
# print(f"❌ 达到最大重试次数,无法生成音频")
# return None
def convert_text_to_audio(text, audio_file, voice="zh-CN-datudou"):
    """Synthesize `text` to speech via the streaming CosyVoice API.

    Streams MP3 bytes from the API into memory, writes them to a temporary
    file so pydub can validate/decode them, and returns the decoded audio.
    Retries with capped exponential backoff on any failure.

    Args:
        text: Raw story text; cleaned once before sending.
        audio_file: Base path used to derive temporary file names.
        voice: Key into VOICE_PROMPT_MAPPING selecting the reference wav.

    Returns:
        A pydub AudioSegment on success, None after MAX_RETRIES failures.
    """
    # Hoisted out of the loop: the cleaned text is identical every attempt.
    text = clean_text(text)
    for attempt in range(MAX_RETRIES):
        temp_audio_file = f"{audio_file}_temp_{attempt}.mp3"
        temp_buffer = BytesIO()  # in-memory staging for the streamed MP3
        try:
            data = {
                "text": text,
                "prompt_wav_path": VOICE_PROMPT_MAPPING[voice],
                "prompt_text": VOICE_PROMPT_MAPPING["zh-CN-datudou-prompt"],
                "stream": True  # ask the server for chunked audio
            }
            # Streaming request: 5 s connect timeout, no read timeout —
            # the server controls how long synthesis takes.
            with requests.post(
                COSYVOICE_API,
                json=data,
                stream=True,
                timeout=(5, None)
            ) as response:
                if response.status_code != 200:
                    raise Exception(f"API错误: {response.text}")
                # Receive the audio stream chunk by chunk.
                for chunk in response.iter_content(chunk_size=4096):
                    if chunk:
                        temp_buffer.write(chunk)
                        print("▌", end="", flush=True)  # progress tick
            # Persist to a temp file so pydub can validate and decode it.
            temp_buffer.seek(0)
            with open(temp_audio_file, "wb") as f:
                f.write(temp_buffer.getbuffer())
            audio_segment = AudioSegment.from_mp3(temp_audio_file)
            os.remove(temp_audio_file)
            return audio_segment
        # Bug fix: the original only caught RequestException, so the plain
        # `raise Exception(...)` on a non-200 status — and any pydub decode
        # error — escaped the retry loop and killed the whole batch.
        except Exception as e:
            print(f"❌ 第 {attempt + 1} 次尝试失败: {str(e)}")
            # Clean up the partially written temp file, if any.
            if os.path.exists(temp_audio_file):
                os.remove(temp_audio_file)
            if attempt < MAX_RETRIES - 1:
                # Bug fix: the original backoff (5 * 2**attempt) was uncapped;
                # with MAX_RETRIES=100 it reaches astronomical waits. Cap at 60 s.
                backoff = min(RETRY_DELAY * (2 ** attempt), 60)
                print(f"⏳ 等待 {backoff} 秒后重试...")
                time.sleep(backoff)
        finally:
            # Single close point; the original also closed in the except
            # branch, a harmless but redundant double close.
            temp_buffer.close()
    print(f"🛑 达到最大重试次数,无法生成音频")
    return None
def convert_to_audio_with_effects(text_file):
    """Convert a story text file into one MP3, inserting sound effects.

    The text may contain effect cues such as "[轻快音乐]". Plain text runs
    are synthesized with convert_text_to_audio; recognized cues are replaced
    by pre-recorded clips; unrecognized cues are skipped silently.

    Args:
        text_file: Path to the UTF-8 .txt story file.

    Returns:
        Path of the exported .mp3 on success, None when no audio segment
        was produced.
    """
    # Cue name -> pre-recorded effect clip (replaces the old if/elif chain).
    effect_files = {
        "轻快音乐": "light_music.mp3",
        "水滴声": "drop_sound.mp3",
        "翻页声": "page_turn_sound.mp3",
    }
    audio_file = text_file.replace(".txt", ".mp3")
    with open(text_file, "r", encoding="utf-8") as f:
        text = f.read()
    audio_segments = []

    def append_tts(fragment):
        # Helper: synthesize a non-empty text fragment and collect the result.
        fragment = fragment.strip()
        if fragment:
            segment = convert_text_to_audio(fragment, audio_file)
            if segment:
                audio_segments.append(segment)

    index = 0
    while index < len(text):
        start_index = text.find("[", index)
        if start_index == -1:
            # No more cues: synthesize the tail and stop.
            append_tts(text[index:])
            break
        # Synthesize the narration preceding the cue.
        append_tts(text[index:start_index])
        end_index = text.find("]", start_index)
        if end_index == -1:
            # Unterminated cue. Bug fix: the original re-synthesized
            # text[index:] here, duplicating the pre-bracket narration that
            # was already sent above; only the tail from "[" remains.
            append_tts(text[start_index:])
            break
        effect_name = text[start_index + 1:end_index]
        if effect_name in effect_files:
            audio_segments.append(AudioSegment.from_mp3(effect_files[effect_name]))
        index = end_index + 1

    if audio_segments:
        # Concatenate all segments in order and export as a single MP3.
        final_audio = sum(audio_segments)
        final_audio.export(audio_file, format="mp3")
        return audio_file
    else:
        print("❌ 没有生成有效的音频片段")
        return None
# 使用示例
if __name__ == "__main__":
    import datetime

    # Batch driver: generate one story per question title, then voice it.
    print(f"开始时间:{datetime.datetime.now()}")
    batch_start = time.time()  # wall-clock start of the whole batch
    for title in now_questions:
        item_start = time.time()  # per-story timer
        story_file = generate_story(f"创作解释'{title}'的故事", story_name, title)
        if story_file:
            audio_file = convert_to_audio_with_effects(story_file)
            if audio_file:
                print(f"\n✅ 成功生成音频:{audio_file}")
        item_elapsed = time.time() - item_start
        print(f"{title}所需时间:{item_elapsed:.2f} 秒\n")
    batch_elapsed = time.time() - batch_start
    print(f"本次任务:{batch_elapsed:.2f} 秒")
以上为调用了自定义的以qwen2.5 32B为低座的模型,先用ollama run 命令启动,其实不启动也行。以上代码会自动启动。
然后又使用cosyvoice的api生成语音,然后保存。生成速度很慢,大约以 340:140(音频时长:生成耗时,秒)的比例生成。使用的是 p102-100 10G 的卡 5 张。CosyVoice 的 API 要先启动,否则上面的代码无法正常工作。
生成100个儿童故事,要10个小时。也不知道能不能赚回电费钱。
原来是6张的,又一张坏了。唉。没钱。
下面的代码是以fastapi结合cosyvoice的代码。
上面的代码以流式的方式访问下面的代码,如果不用这种方式,不太好控制。感觉流式很好。
import sys
sys.path.append('third_party/Matcha-TTS')
from cosyvoice.cli.cosyvoice import CosyVoice, CosyVoice2
from cosyvoice.utils.file_utils import load_wav
import torchaudio
import onnxruntime as ort
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import os
import torch
import io
from io import BytesIO
from fastapi.responses import StreamingResponse
from pydub import AudioSegment
# Initialize the FastAPI application.
app = FastAPI()
# # 设置使用 GPU
# ort.set_default_logger_severity(3) # 减少日志输出
# providers = ort.get_available_providers()
# print("Available providers:", providers) # 打印可用的 providers
#
# # 确保 CUDAExecutionProvider 可用
# if 'CUDAExecutionProvider' not in providers:
#     raise RuntimeError("CUDAExecutionProvider is not available. Please ensure you have installed onnxruntime-gpu.")
# GPU resource configuration.
# NOTE(review): this pins the process to GPU index 4 (the fifth device) —
# the original comment claimed "first GPU", which did not match the value.
os.environ["CUDA_VISIBLE_DEVICES"] = "4"
# torch.cuda.set_per_process_memory_fraction(0.8) # 限制显存使用80%
# Load the CosyVoice2 model once at startup (shared by all requests).
cosyvoice = CosyVoice2(
    '/home/kt/myproject/CosyVoice/pretrained_models/CosyVoice2-0.5B',
    load_jit=False,
    load_trt=False,
    fp16=False
)
# 定义请求体模型
class TTSRequest(BaseModel):
    """Request body for the /generate_audio/ endpoint."""
    # Text to synthesize (required).
    text: str
    # Transcript of the reference wav used for zero-shot voice cloning.
    # Bug fix: the original line ended with a stray trailing comma, which
    # made this default a one-element tuple instead of a str.
    prompt_text: str = "现在所有的选手都听得见。我想对他们说:这是你们的舞台,期待他们释放真实力,让热爱绽放,为梦想起航。"
    # Path to the reference (prompt) wav file.
    prompt_wav_path: str = "./asset/20250226.wav"
    # Output directory (accepted for compatibility; not used by the
    # streaming endpoint below).
    output_dir: str = "./output"
# 定义生成音频的 API
@app.post("/generate_audio/")
async def generate_audio(request: TTSRequest):
    """Stream zero-shot TTS audio for `request.text` as chunked MP3.

    Loads the reference wav, runs CosyVoice2 zero-shot inference in
    streaming mode, converts each wav chunk to MP3 in memory, and yields
    the bytes to the client through a StreamingResponse.

    Raises:
        HTTPException: status 500 with the underlying error message on
            any failure while setting up the stream.
    """
    try:
        # Load the reference (prompt) audio resampled to 16 kHz.
        prompt_speech_16k = load_wav(request.prompt_wav_path, 16000)
        # Generator yielding MP3-encoded chunks as inference progresses.
        def generate_audio_stream():
            # NOTE(review): audio_buffer is created but never used below.
            audio_buffer = BytesIO()
            for i, chunk in enumerate(cosyvoice.inference_zero_shot(
                request.text,
                request.prompt_text,
                prompt_speech_16k,
                stream=True,  # enable chunked streaming inference
                speed=0.6,    # speech-rate factor — presumably <1 slows it down; confirm against CosyVoice docs
            )):
                # Serialize the tensor chunk to an in-memory wav.
                temp_wav = BytesIO()
                torchaudio.save(temp_wav, chunk['tts_speech'], cosyvoice.sample_rate, format='wav')
                temp_wav.seek(0)
                # Transcode wav -> MP3 (64 kbps) in memory.
                segment = AudioSegment.from_wav(temp_wav)
                mp3_data = BytesIO()
                segment.export(mp3_data, format="mp3", bitrate="64k")
                yield mp3_data.getvalue()
                print(f"Sent chunk {i + 1}")  # debug log
        return StreamingResponse(
            generate_audio_stream(),
            media_type="audio/mpeg",
            headers={
                "Content-Disposition": "attachment; filename=stream_audio.mp3",
                "X-Audio-Stream": "true"
            }
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
# 启动 FastAPI 服务
if __name__ == "__main__":
    # Serve the FastAPI app on all interfaces, port 8000 (matches COSYVOICE_API).
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
记录加使用吧。上面其实还有很多可以优化的地方。后期听听别人的有声书,看看能完善到什么程度。我感觉应该是故事类的听众要多一些。