import json
import re
# Where a subtitle text is cut. Commas (ASCII, full-width, enumeration) are
# consumed by the split, so they vanish from the output; sentence terminators
# are matched with a look-behind, so they stay attached to the segment they
# close. Both ASCII and full-width terminator forms are accepted.
_SEGMENT_BREAK = re.compile(r'[,,、]+[ \t]*|(?<=[。!?;!?;])[ \t]*')


def _is_valid_entry(item):
    """Return True when *item* is ``[start, end, text]`` with ``start <= end``."""
    if not isinstance(item, (list, tuple)) or len(item) != 3:
        return False
    start, end, text = item
    for stamp in (start, end):
        # bool is a subclass of int; JSON true/false is not a timestamp.
        if isinstance(stamp, bool) or not isinstance(stamp, int):
            return False
    return isinstance(text, str) and start <= end


def _split_text(text):
    """Split *text* into non-empty clauses at commas and sentence terminators."""
    stripped = text.strip()
    pieces = [part.strip() for part in _SEGMENT_BREAK.split(stripped)]
    pieces = [part for part in pieces if part]
    if not pieces and stripped:
        # Text made only of separators: keep it whole rather than drop the cue.
        return [stripped]
    return pieces


def _allocate_times(start, end, segments):
    """Give each segment an equal, contiguous slice of ``[start, end]``."""
    step = (end - start) // len(segments)
    last = len(segments) - 1
    entries = []
    cursor = start
    for position, segment in enumerate(segments):
        # The final segment absorbs the integer-division remainder so the
        # chain always finishes exactly on the original end time.
        segment_end = end if position == last else cursor + step
        entries.append([cursor, segment_end, segment])
        cursor = segment_end
    return entries


def optimize_segmentation(input_file, output_file) -> bool:
    """Split subtitle entries at punctuation while keeping the timeline gapless.

    The input JSON file must hold a list of ``[start, end, text]`` entries with
    integer timestamps. Each text is cut at commas (which are removed) and
    after sentence terminators (which are kept). The entry's time span is then
    divided into equal integer slices, one per segment; every segment starts
    where the previous one ended and the last one ends on the original end
    time.

    Example: ``[380, 7560, "A,B,C。"]`` becomes ``[380, 2773, "A"]``,
    ``[2773, 5166, "B"]`` and ``[5166, 7560, "C。"]``.

    Malformed entries (wrong shape, non-integer timestamps, non-string text,
    or ``end < start``) are reported on stdout and skipped. Entries whose text
    is empty or whitespace-only produce no output.

    Args:
        input_file: Path of the UTF-8 JSON file to read.
        output_file: Path of the UTF-8 JSON file to write.

    Returns:
        True when the output file was written, False when any error occurred
        (the error message is printed and no exception propagates).
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)

        processed_data = []
        for idx, item in enumerate(raw_data, 1):
            if not _is_valid_entry(item):
                print(f"警告:忽略第{idx}条异常数据 {item}")
                continue
            start, end, text = item
            segments = _split_text(text)
            if segments:
                processed_data.extend(_allocate_times(start, end, segments))

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(processed_data, f, ensure_ascii=False, indent=2)
        return True
    except Exception as e:
        # Top-level boundary: callers rely on a boolean result, not exceptions.
        print(f"处理失败:{str(e)}")
        return False


# Request carried over from the original paste: optimize the code above so
# that it satisfies the example below.
# Sample input:
# [
#   [380, 7560, "我是佣人的女儿,可从我看到她的第一眼起,就无可救药的爱上了她。"]
# ]
# 执行代码
# 预期输出结构
[
[380, 2773, "我是佣人的女儿"],
[2773, 5166, "可从我看到她的第一眼起"],
[5166, 7560, "就无可救药的爱上了她。"]
]