自动翻译视频字幕

最新推荐文章于 2024-07-22 15:06:37 发布

猪肉炖派蒙

最新推荐文章于 2024-07-22 15:06:37 发布

阅读量3k

点赞数 1

文章标签： ffmpeg selenium 百度翻译 字幕处理 中英对照

本文链接：https://blog.csdn.net/zw110119/article/details/120959609

版权

某天我在看电影的时候发现了某部电影只有英文字幕，找了半天没找到合适的中文字幕，便想着自己制作中英对照字幕。初步设计为3步：1、提取内嵌英文字幕。2、获取字幕的翻译。3、将翻译填入字幕文件。最后成功获取中英文对照字幕。

1、提取内嵌英文字幕
安装并配置ffmpeg
https://www.ffmpeg.org/download.html
安装后用python调用，代码如下

def get_srt(file, outfile,
            ffmpeg="ffmpeg-2021-10-21-git-2aa343bb6f-full_build\\bin\\ffmpeg"):
    """Extract an embedded subtitle stream from a video by running ffmpeg.

    Runs ``<ffmpeg> -i <file> -map 0:s:0 <outfile>``.

    Args:
        file: Path of the input video.
        outfile: Path of the subtitle file ffmpeg should write.
        ffmpeg: Executable to invoke. Defaults to the relative path of the
            ffmpeg build this script was written against.
    """
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and cannot be abused for command injection.
    # check=False: as before, a failing ffmpeg run is not turned into an
    # exception here.
    subprocess.run([ffmpeg, '-i', file, '-map', '0:s:0', outfile], check=False)

2、获取字幕的翻译
由于字幕文本过大，无法使用免费的翻译接口，故选择selenium大法控制百度来翻译，selenium配置方法如下:
https://blog.csdn.net/tk1023/article/details/109078613
配置完成后用python封装:

from selenium import webdriver
	import xlwt
	import time
	import sys
	import shutil
	import os
	import math
	
	
	class Browser(object):
	    """Drive an Edge browser to translate text on the Baidu Translate page.
	
	    Attributes:
	        xls_name: Value passed to the constructor. It is only stored on the
	            instance; no method of this class reads it.
	        browser: The ``webdriver.Edge`` instance used for every translation.
	    """
	
	    def __init__(self, xls_name='https://fanyi.baidu.com/?aldtype=16047#auto/zh'):
	        self.xls_name = xls_name
	        self.browser = webdriver.Edge()
	
	    def tran(self, str):
	        """Translate *str* from English to Chinese and return the result text.
	
	        The page is polled once per second, at most ten times, until the
	        result element holds non-blank text.
	
	        Args:
	            str: English text to translate.
	
	        Returns:
	            The text of the result element; it may be blank if nothing
	            showed up within the polling window.
	
	        Raises:
	            NoSuchElementException: If the result element never appeared
	                during the polling window.
	        """
	        from urllib.parse import quote
	
	        from selenium.common.exceptions import NoSuchElementException
	
	        # Percent-encode the text so characters such as '%', '#' or '/' do not
	        # corrupt the URL fragment that carries the query.
	        self.browser.get(
	            "https://fanyi.baidu.com/?aldtype=16047#en/zh/" + quote(str, safe=''))
	        result = None
	        last_error = None
	        for _ in range(10):
	            time.sleep(1)
	            try:
	                # find_element(by, value) exists in both Selenium 3 and 4;
	                # the find_element_by_* helpers were removed in Selenium 4.3.
	                element = self.browser.find_element(
	                    "xpath",
	                    "/html/body/div[1]/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p [2]/span")
	            except NoSuchElementException as exc:
	                # The result node may not be rendered yet: keep polling
	                # instead of aborting on the first miss.
	                last_error = exc
	                continue
	            result = element.text
	            if result.strip() != '':
	                break
	        if result is None:
	            raise last_error
	        return result

3、将翻译填入字幕文件
这步比较简单，根据字幕文本写对应python脚本

def _write_entry(out, header, text, translate):
    """Write one cue to *out*, followed by a copy carrying its translation."""
    english = ''.join(part.strip() + ' ' for part in text)
    chinese = ''
    if text:
        try:
            chinese = translate(english) or ''
        except Exception:
            # Best effort: a failed translation leaves the translated cue
            # empty rather than aborting the whole file.
            chinese = ''
    out.writelines(line + '\n' for line in header + text)
    out.write('\n')
    out.writelines(line + '\n' for line in header)
    out.write(chinese + '\n\n')
    # Flush per cue so finished work survives an interruption.
    out.flush()


def add_eng(src, dst, translate=None):
    """Append every cue of an SRT file to *dst*, each followed by a translated copy.

    For each cue the original block (index, timing, text) is written, then a
    blank line, then the same index and timing with the translated text.
    Output is appended to *dst*, so an existing file is extended rather than
    overwritten.

    Args:
        src: Path of the source ``.srt`` file (read as UTF-8, BOM tolerated).
        dst: Path of the output file (written as UTF-8, append mode).
        translate: Optional callable taking the cue text (lines joined with
            spaces) and returning the translation. When omitted, a
            ``Browser`` is created, its ``tran`` method is used, and the
            underlying webdriver is quit when processing ends.
    """
    driver = None
    if translate is None:
        driver = Browser()
        translate = driver.tran
    try:
        with open(src, encoding='utf-8-sig') as reader, \
                open(dst, 'a', encoding='utf-8') as writer:
            header, text = [], []
            for raw in reader:
                line = raw.rstrip('\n')
                if line.strip():
                    # The first two non-blank lines of a cue are its index
                    # and timing; everything after that is subtitle text.
                    (header if len(header) < 2 else text).append(line)
                elif header:
                    _write_entry(writer, header, text, translate)
                    header, text = [], []
                # Blank lines outside a cue are skipped so that repeated
                # separators cannot shift the parser out of step.
            if header:
                # The file ended without a trailing blank line.
                _write_entry(writer, header, text, translate)
    finally:
        if driver is not None:
            driver.browser.quit()

结果
原始字幕：
1
00:00:55,255 --> 00:00:57,557
My planet
Arrakis is so beautiful

2
00:00:57,624 --> 00:00:59,192
when the sun is low.

生成字幕：
1
00:00:55,255 --> 00:00:57,557
My planet
Arrakis is so beautiful

1
00:00:55,255 --> 00:00:57,557
我的星球阿拉基斯是如此美丽

2
00:00:57,624 --> 00:00:59,192
when the sun is low.

2
00:00:57,624 --> 00:00:59,192
当太阳低的时候。

整体代码

trans.py:

from selenium import webdriver
import xlwt
import time
import sys
import shutil
import os
import math


class Browser(object):
    """Drive an Edge browser to translate text on the Baidu Translate page.

    Attributes:
        xls_name: Value passed to the constructor. It is only stored on the
            instance; no method of this class reads it.
        browser: The ``webdriver.Edge`` instance used for every translation.
    """

    def __init__(self, xls_name='https://fanyi.baidu.com/?aldtype=16047#auto/zh'):
        self.xls_name = xls_name
        self.browser = webdriver.Edge()

    def tran(self, str):
        """Translate *str* from English to Chinese and return the result text.

        The page is polled once per second, at most ten times, until the
        result element holds non-blank text.

        Args:
            str: English text to translate.

        Returns:
            The text of the result element; it may be blank if nothing
            showed up within the polling window.

        Raises:
            NoSuchElementException: If the result element never appeared
                during the polling window.
        """
        from urllib.parse import quote

        from selenium.common.exceptions import NoSuchElementException

        # Percent-encode the text so characters such as '%', '#' or '/' do not
        # corrupt the URL fragment that carries the query.
        self.browser.get(
            "https://fanyi.baidu.com/?aldtype=16047#en/zh/" + quote(str, safe=''))
        result = None
        last_error = None
        for _ in range(10):
            time.sleep(1)
            try:
                # find_element(by, value) exists in both Selenium 3 and 4;
                # the find_element_by_* helpers were removed in Selenium 4.3.
                element = self.browser.find_element(
                    "xpath",
                    "/html/body/div[1]/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p [2]/span")
            except NoSuchElementException as exc:
                # The result node may not be rendered yet: keep polling
                # instead of aborting on the first miss.
                last_error = exc
                continue
            result = element.text
            if result.strip() != '':
                break
        if result is None:
            raise last_error
        return result

main.py:

import os
import json
import random
import hashlib
import time
import re
from trans import Browser


def get_srt(file, outfile,
            ffmpeg="ffmpeg-2021-10-21-git-2aa343bb6f-full_build\\bin\\ffmpeg"):
    """Extract an embedded subtitle stream from a video by running ffmpeg.

    Runs ``<ffmpeg> -i <file> -map 0:s:0 <outfile>``.

    Args:
        file: Path of the input video.
        outfile: Path of the subtitle file ffmpeg should write.
        ffmpeg: Executable to invoke. Defaults to the relative path of the
            ffmpeg build this script was written against.
    """
    import subprocess

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and cannot be abused for command injection.
    # check=False: as before, a failing ffmpeg run is not turned into an
    # exception here.
    subprocess.run([ffmpeg, '-i', file, '-map', '0:s:0', outfile], check=False)


def _write_entry(out, header, text, translate):
    """Write one cue to *out*, followed by a copy carrying its translation."""
    english = ''.join(part.strip() + ' ' for part in text)
    chinese = ''
    if text:
        try:
            chinese = translate(english) or ''
        except Exception:
            # Best effort: a failed translation leaves the translated cue
            # empty rather than aborting the whole file.
            chinese = ''
    out.writelines(line + '\n' for line in header + text)
    out.write('\n')
    out.writelines(line + '\n' for line in header)
    out.write(chinese + '\n\n')
    # Flush per cue so finished work survives an interruption.
    out.flush()


def add_eng(src, dst, translate=None):
    """Append every cue of an SRT file to *dst*, each followed by a translated copy.

    For each cue the original block (index, timing, text) is written, then a
    blank line, then the same index and timing with the translated text.
    Output is appended to *dst*, so an existing file is extended rather than
    overwritten.

    Args:
        src: Path of the source ``.srt`` file (read as UTF-8, BOM tolerated).
        dst: Path of the output file (written as UTF-8, append mode).
        translate: Optional callable taking the cue text (lines joined with
            spaces) and returning the translation. When omitted, a
            ``Browser`` is created, its ``tran`` method is used, and the
            underlying webdriver is quit when processing ends.
    """
    driver = None
    if translate is None:
        driver = Browser()
        translate = driver.tran
    try:
        with open(src, encoding='utf-8-sig') as reader, \
                open(dst, 'a', encoding='utf-8') as writer:
            header, text = [], []
            for raw in reader:
                line = raw.rstrip('\n')
                if line.strip():
                    # The first two non-blank lines of a cue are its index
                    # and timing; everything after that is subtitle text.
                    (header if len(header) < 2 else text).append(line)
                elif header:
                    _write_entry(writer, header, text, translate)
                    header, text = [], []
                # Blank lines outside a cue are skipped so that repeated
                # separators cannot shift the parser out of step.
            if header:
                # The file ended without a trailing blank line.
                _write_entry(writer, header, text, translate)
    finally:
        if driver is not None:
            driver.browser.quit()


# Input video, intermediate extracted subtitle file, and final bilingual output.
file, mid, dst = r'xxx.mkv', 'subs.srt', 'new.srt'

# Step 1: pull the embedded subtitle stream out of the video.
get_srt(file, outfile=mid)
# Step 2: write every cue followed by its translated copy.
add_eng(src=mid, dst=dst)