最近在研究如何将图片和文本批量合成为带字幕的口播视频。
实现主要基于 Python 的 moviepy 库，入口脚本如下：
from generator import audio, pics, subs, video
def main():
    """Run the picture + text to subtitled-video pipeline end to end.

    Both the text input and the picture input are hard-coded to
    ``'example'``. The steps below are executed strictly in order.
    """
    text_name = pics_name = 'example'

    # Step 1: pre-process the picture resolution.
    pics.adjust(pics_name)

    # Step 2: convert the text to speech.
    audio.text_to_audio(text_name)

    # Step 3: turn the speech into a video.
    video.audio_to_video(text_name, pics_name)

    # Step 4: generate the subtitles.
    subs.download_subs(text_name)

    # Step 5: produce the video with subtitles attached.
    video.attach_subs(text_name)
# Run the pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
以下是核心的“图片 + 文本”转视频逻辑：
import json
from mutagen.mp3 import MP3
from moviepy import editor
from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip
import generator.pics as pics
def audio_to_video(text_input, pics_input):
audio_filepath = './output/audios/' + text_input.replace(' ', '_') + '.mp3'
video_filepath = './outpu