某天看电影时,我发现这部电影只有内嵌的英文字幕,找了半天也没找到合适的中文字幕,便想着自己制作中英对照字幕。初步设计为3步:1、提取内嵌英文字幕。2、获取字幕的翻译。3、将翻译填入字幕文件。最后成功得到了中英文对照字幕。
1、提取内嵌英文字幕
安装并配置ffmpeg
https://www.ffmpeg.org/download.html
安装后用python调用,代码如下
def get_srt(file, outfile,
            ffmpeg="ffmpeg-2021-10-21-git-2aa343bb6f-full_build\\bin\\ffmpeg"):
    """Extract the first subtitle stream of a video by running ffmpeg.

    Args:
        file: Path of the input video.
        outfile: Path of the subtitle file ffmpeg should write.
        ffmpeg: Executable to run. Defaults to the ffmpeg build unpacked
            next to the script.

    Returns:
        The exit code of the ffmpeg process.

    Raises:
        OSError: If the ffmpeg executable cannot be started.
    """
    import subprocess  # local import so the function is self-contained

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; the previous string-built command split them.
    cmd = [ffmpeg, '-i', file, '-map', '0:s:0', outfile]
    return subprocess.run(cmd).returncode
2、获取字幕的翻译
由于字幕文本量较大,超出了免费翻译接口的限制,故选择用 selenium 控制浏览器访问百度翻译来完成翻译。selenium 的配置方法如下:
https://blog.csdn.net/tk1023/article/details/109078613
配置完成后用python封装:
from selenium import webdriver
import xlwt
import time
import sys
import shutil
import os
import math
class Browser(object):
    """Translate English text to Chinese by driving Baidu Translate in Edge."""

    def __init__(self, xls_name='https://fanyi.baidu.com/?aldtype=16047#auto/zh'):
        """Start an Edge WebDriver session.

        Args:
            xls_name: Stored on the instance; not read by any method of
                this class.
        """
        self.xls_name = xls_name
        self.browser = webdriver.Edge()

    def tran(self, str, retries=10, interval=1):
        """Return the translation of ``str``, or '' if none shows up.

        Args:
            str: English text to translate.
            retries: Maximum number of times the result element is polled.
            interval: Seconds to sleep before each poll.

        Returns:
            The text of the result element, or '' when the input is blank
            or the element never appears within ``retries`` polls.
        """
        from urllib.parse import quote

        text = str
        if not text.strip():
            # Nothing to translate; skip the page load and polling.
            return ''

        # Percent-encode the text so characters such as '%', '#' and '/'
        # survive inside the URL fragment.
        self.browser.get("https://fanyi.baidu.com/?aldtype=16047#en/zh/" + quote(text, safe=''))  # open the translation page

        result = ''
        for _ in range(retries):
            time.sleep(interval)
            # find_elements returns [] while the result node is not rendered
            # yet, so polling continues instead of raising on the first try.
            found = self.browser.find_elements(
                "xpath",
                "/html/body/div[1]/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p [2]/span")
            if not found:
                continue
            result = found[0].text  # extract the translation result
            if result.strip() != '':
                break
        return result

    def close(self):
        """Quit the WebDriver session and close the browser window."""
        self.browser.quit()
3、将翻译填入字幕文件
这步比较简单:按照 SRT 字幕的结构(序号、时间轴、文本、空行)逐条解析,并在每条英文字幕之后写入带有相同序号和时间轴的译文。对应的 python 脚本如下:
def _write_cue(out, header, texts, translation):
    """Append one cue to *out*: the original cue, then the translated copy."""
    out.writelines(line + '\n' for line in header + texts)
    out.write('\n')
    out.writelines(line + '\n' for line in header)
    out.write(translation)
    out.write('\n\n')
    # Flush per cue so finished work survives an interrupted run.
    out.flush()


def add_eng(src, dst, translate=None):
    """Write a bilingual copy of the subtitle file ``src`` to ``dst``.

    Each cue (index line, timing line, text lines, blank line) is written
    unchanged and is followed by a second cue with the same index and
    timing whose text is the translation of the original text.

    Args:
        src: Path of the subtitle file to read (UTF-8, BOM tolerated).
        dst: Path of the output file. Output is appended, so an existing
            file is extended rather than overwritten.
        translate: Optional callable mapping the joined cue text to its
            translation. When omitted, a ``Browser`` is created for the
            run and its ``tran`` method is used.
    """
    browser = None
    if translate is None:
        browser = Browser()
        translate = browser.tran

    def emit(out, header, texts):
        joined = ' '.join(t.strip() for t in texts)
        try:
            translation = translate(joined)
        except Exception:
            # Best effort: a failed lookup leaves the translated cue empty
            # instead of aborting the whole file.
            translation = ''
        _write_cue(out, header, texts, translation)

    try:
        # Explicit encodings: the locale default cannot be relied on to
        # decode the subtitles or to encode the Chinese translations.
        with open(src, encoding='utf-8-sig') as f, \
                open(dst, 'a', encoding='utf-8') as out:
            header, texts = [], []
            for raw in f:
                line = raw.rstrip('\n')
                if len(header) < 2:
                    if not header and not line.strip():
                        # Extra blank lines between cues must not be taken
                        # for the next cue's index line.
                        continue
                    header.append(line)
                elif line.strip():
                    texts.append(line)
                else:
                    emit(out, header, texts)
                    header, texts = [], []
            if len(header) == 2:
                # The file ended without a trailing blank line; the last
                # cue still has to be written.
                emit(out, header, texts)
    finally:
        if browser is not None:
            browser.browser.quit()
结果
原始字幕:
1
00:00:55,255 --> 00:00:57,557
My planet
Arrakis is so beautiful
2
00:00:57,624 --> 00:00:59,192
when the sun is low.
生成字幕:
1
00:00:55,255 --> 00:00:57,557
My planet
Arrakis is so beautiful
1
00:00:55,255 --> 00:00:57,557
我的星球阿拉基斯是如此美丽
2
00:00:57,624 --> 00:00:59,192
when the sun is low.
2
00:00:57,624 --> 00:00:59,192
当太阳低的时候。
整体代码
trans.py:
from selenium import webdriver
import xlwt
import time
import sys
import shutil
import os
import math
class Browser(object):
    """Translate English text to Chinese by driving Baidu Translate in Edge."""

    def __init__(self, xls_name='https://fanyi.baidu.com/?aldtype=16047#auto/zh'):
        """Start an Edge WebDriver session.

        Args:
            xls_name: Stored on the instance; not read by any method of
                this class.
        """
        self.xls_name = xls_name
        self.browser = webdriver.Edge()

    def tran(self, str, retries=10, interval=1):
        """Return the translation of ``str``, or '' if none shows up.

        Args:
            str: English text to translate.
            retries: Maximum number of times the result element is polled.
            interval: Seconds to sleep before each poll.

        Returns:
            The text of the result element, or '' when the input is blank
            or the element never appears within ``retries`` polls.
        """
        from urllib.parse import quote

        text = str
        if not text.strip():
            # Nothing to translate; skip the page load and polling.
            return ''

        # Percent-encode the text so characters such as '%', '#' and '/'
        # survive inside the URL fragment.
        self.browser.get("https://fanyi.baidu.com/?aldtype=16047#en/zh/" + quote(text, safe=''))  # open the translation page

        result = ''
        for _ in range(retries):
            time.sleep(interval)
            # find_elements returns [] while the result node is not rendered
            # yet, so polling continues instead of raising on the first try.
            found = self.browser.find_elements(
                "xpath",
                "/html/body/div[1]/div[3]/div/div/div[1]/div[2]/div[1]/div[2]/div/div/div[1]/p [2]/span")
            if not found:
                continue
            result = found[0].text  # extract the translation result
            if result.strip() != '':
                break
        return result

    def close(self):
        """Quit the WebDriver session and close the browser window."""
        self.browser.quit()
main.py:
import os
import json
import random
import hashlib
import time
import re
from trans import Browser
def get_srt(file, outfile,
            ffmpeg="ffmpeg-2021-10-21-git-2aa343bb6f-full_build\\bin\\ffmpeg"):
    """Extract the first subtitle stream of a video by running ffmpeg.

    Args:
        file: Path of the input video.
        outfile: Path of the subtitle file ffmpeg should write.
        ffmpeg: Executable to run. Defaults to the ffmpeg build unpacked
            next to the script.

    Returns:
        The exit code of the ffmpeg process.

    Raises:
        OSError: If the ffmpeg executable cannot be started.
    """
    import subprocess  # local import so the function is self-contained

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; the previous string-built command split them.
    cmd = [ffmpeg, '-i', file, '-map', '0:s:0', outfile]
    return subprocess.run(cmd).returncode
def _write_cue(out, header, texts, translation):
    """Append one cue to *out*: the original cue, then the translated copy."""
    out.writelines(line + '\n' for line in header + texts)
    out.write('\n')
    out.writelines(line + '\n' for line in header)
    out.write(translation)
    out.write('\n\n')
    # Flush per cue so finished work survives an interrupted run.
    out.flush()


def add_eng(src, dst, translate=None):
    """Write a bilingual copy of the subtitle file ``src`` to ``dst``.

    Each cue (index line, timing line, text lines, blank line) is written
    unchanged and is followed by a second cue with the same index and
    timing whose text is the translation of the original text.

    Args:
        src: Path of the subtitle file to read (UTF-8, BOM tolerated).
        dst: Path of the output file. Output is appended, so an existing
            file is extended rather than overwritten.
        translate: Optional callable mapping the joined cue text to its
            translation. When omitted, a ``Browser`` is created for the
            run and its ``tran`` method is used.
    """
    browser = None
    if translate is None:
        browser = Browser()
        translate = browser.tran

    def emit(out, header, texts):
        joined = ' '.join(t.strip() for t in texts)
        try:
            translation = translate(joined)
        except Exception:
            # Best effort: a failed lookup leaves the translated cue empty
            # instead of aborting the whole file.
            translation = ''
        _write_cue(out, header, texts, translation)

    try:
        # Explicit encodings: the locale default cannot be relied on to
        # decode the subtitles or to encode the Chinese translations.
        with open(src, encoding='utf-8-sig') as f, \
                open(dst, 'a', encoding='utf-8') as out:
            header, texts = [], []
            for raw in f:
                line = raw.rstrip('\n')
                if len(header) < 2:
                    if not header and not line.strip():
                        # Extra blank lines between cues must not be taken
                        # for the next cue's index line.
                        continue
                    header.append(line)
                elif line.strip():
                    texts.append(line)
                else:
                    emit(out, header, texts)
                    header, texts = [], []
            if len(header) == 2:
                # The file ended without a trailing blank line; the last
                # cue still has to be written.
                emit(out, header, texts)
    finally:
        if browser is not None:
            browser.browser.quit()
# Pipeline paths: input video, extracted subtitle file, bilingual output.
file, mid, dst = r'xxx.mkv', 'subs.srt', 'new.srt'

# Step 1: extract the embedded subtitle stream from the video.
get_srt(file, mid)
# Step 2: append a translated copy after every cue.
add_eng(mid, dst)