import fitz# 导入fitz库,用于处理PDF文档
import re# 导入re库,用于正则表达式匹配
import os
import re
from Baidu_Text_transAPI_2 import baidu_api_fanyi
from docx.shared import Cm, RGBColor
import docx
from docx.shared import Pt
from docx.oxml.ns import qn
from PIL import Image
import imagehash
import numpy as np
from PIL import Image
import docx
from openpyxl import load_workbook
import pandas as pd
from datetime import datetime
from openpyxl import Workbook
import global_num
from fydf_ui import Ui_Form
import sys
import os
from PyQt5.QtWidgets import QWidget, QFileDialog,QApplication,QMessageBox
from Baidu_Text_transAPI_2 import baidu_api_fanyi
# Stage 1: extract the raw text of the PDF, persist it, then normalise the
# line breaks so that every numbered heading / paragraph sits on one line.
# The cleaned result is written to ``output.txt``.

# Location of the source PDF (machine-specific; change before running).
PDF_PATH = r'C:\Users\11255\Documents\方案5.27\SAE AMS 2437D-2019.pdf'

with fitz.open(PDF_PATH) as doc:  # open the PDF with fitz
    num_pages = doc.page_count  # total number of pages in the document
    # Concatenate the text of every page in page order. ``str.join`` avoids
    # re-copying the growing string on every page.
    text = ''.join(doc.load_page(i).get_text() for i in range(num_pages))

# Keep an untouched dump of the extracted text.
with open("全部未处理的PDF文本.txt", "w", encoding="utf-8") as f:
    f.write(text)

# The text is read back from the dump (undecodable bytes are ignored) and
# that copy is what the clean-up rules below operate on.
with open("全部未处理的PDF文本.txt", 'r', encoding='utf-8', errors='ignore') as file:
    text = file.read()

# NOTE: the substitutions below are order-dependent; do not reorder them.

# Level-1 heading number: one digit + "." followed by a line break
# -> drop the line break so the number joins the following line.
text = re.sub(r'(\d\.)\n', r'\1', text)

# Level-2 heading number: digit "." digit (+ optional whitespace) followed by
# a line break -> drop that line break.
text = re.sub(r'(\d\.\d\s*)\n', r'\1', text)
print("处理结束!!!")

# A line that starts with a lowercase letter is treated as a continuation:
# remove the line break in front of it.
text = re.sub(r'\n([a-z].*)', r'\1', text)

# Remove any whitespace (including line breaks) in front of a punctuation
# character from the set ! . , ? ; : =
text = re.sub(r'\s*([!.,?;:=].*)', r'\1', text)

# Remove any whitespace (including line breaks) that follows "," or "=".
text = re.sub(r'([,=])\s*', r'\1', text)

# Remove a single space that separates two runs of digits.
text = re.sub(r'(\d+) (\d+)', r'\1\2', text)

# Strip everything from "Downloaded from" up to and including the
# "Page X of Y" marker; DOTALL lets the match span several lines.
text = re.sub(r'Downloaded from.+?Page (\d+) of (\d+)', r'', text, flags=re.DOTALL)

# Drop blank (whitespace-only) lines and re-assemble the text.
lines = text.split('\n')
cleaned_lines = [line for line in lines if line.strip()]
text = '\n'.join(cleaned_lines)

# Write the cleaned text; the file is created if it does not exist.
with open('output.txt', 'w', encoding='utf-8') as file:
    file.write(text)
def check_text_pattern(text):
    """Classify a line by the section-number prefix it starts with.

    Only single-digit components are recognised, and the number must be
    followed by exactly one space:

    * ``"情况1"`` - ``N. ``        (e.g. ``"1. Scope"``)
    * ``"情况2"`` - ``N.N ``       (e.g. ``"1.1 Purpose"``)
    * ``"情况3"`` - ``N.N.N ``     (e.g. ``"3.2.1 Title"``)
    * ``"情况4"`` - ``N.N.N.N ``   (e.g. ``"3.2.1.4 Title"``)
    * ``"情况5"`` - anything else (ordinary body text)

    :param text: a single line of text.
    :return: one of the five labels listed above.
    """
    # (prefix regex, label) pairs, checked in this order; the first hit wins.
    numbered_prefixes = (
        (r'^\d\. ', "情况1"),
        (r'^\d\.\d ', "情况2"),
        (r'^\d\.\d\.\d ', "情况3"),
        (r'^\d\.\d\.\d\.\d ', "情况4"),
    )
    for prefix_regex, label in numbered_prefixes:
        if re.match(prefix_regex, text) is not None:
            return label
    # No numbered prefix recognised: treat the line as body text.
    return "情况5"
# Stage 2: turn the cleaned text into a bilingual Word document. Every line
# is written twice - first as-is, then as the result of ``baidu_api_fanyi`` -
# and numbered lines become Word headings of the matching level.

# Label returned by check_text_pattern -> (Word heading level, regex of the
# section-number prefix that is stripped before the text is translated).
_HEADING_RULES = {
    '情况1': (1, r'^\d\. '),
    '情况2': (2, r'^\d\.\d '),
    '情况3': (3, r'^\d\.\d\.\d '),
    '情况4': (4, r'^\d\.\d\.\d\.\d '),
}


def _apply_paragraph_layout(paragraph, first_line_indent, keep_together, keep_with_next):
    """Apply the compact layout shared by every paragraph of the document.

    :param paragraph: python-docx paragraph (or heading) to format.
    :param first_line_indent: first-line indent as a docx length.
    :param keep_together: keep all lines of the paragraph on one page.
    :param keep_with_next: keep the paragraph on the same page as the next one.
    """
    fmt = paragraph.paragraph_format
    fmt.left_indent = Cm(0)
    fmt.right_indent = Cm(0)
    fmt.first_line_indent = first_line_indent
    # A float line_spacing means a multiple of the line height (1.0 = single
    # spacing); a length value would mean an absolute distance instead.
    fmt.line_spacing = 1.0
    fmt.space_before = Pt(0)
    fmt.space_after = Pt(0)
    # Pagination behaviour.
    fmt.keep_together = keep_together
    fmt.keep_with_next = keep_with_next
    fmt.page_break_before = False
    fmt.widow_control = True


def _add_text_run(paragraph, text):
    """Append ``text`` to ``paragraph`` as a 10 pt run and return the run.

    Latin text uses Times New Roman; the East-Asian font is set to 宋体.
    """
    run = paragraph.add_run(text)
    run.font.size = Pt(10)
    # font.name must be assigned before the rFonts element is touched below.
    run.font.name = 'Times New Roman'
    run.font.element.rPr.rFonts.set(qn('w:eastAsia'), "宋体")
    return run


def _add_heading(document, level, text, is_translation):
    """Add a bold, black heading of the given level containing ``text``.

    Source headings have no first-line indent and are kept together / with
    the next paragraph; translated headings get a 0.38 cm first-line indent
    and no keep-together constraints.
    """
    heading = document.add_heading(level=level)
    _apply_paragraph_layout(
        heading,
        first_line_indent=Cm(0.38) if is_translation else Cm(0),
        keep_together=not is_translation,
        keep_with_next=not is_translation,
    )
    run = _add_text_run(heading, text)
    run.font.bold = True
    run.font.color.rgb = RGBColor(0, 0, 0)
    if level == 4:
        # Italics are switched off explicitly for level-4 headings only
        # (presumably the built-in "Heading 4" style is italic - confirm).
        run.font.italic = False
    return heading


def _add_body_paragraph(document, text, is_translation):
    """Add an ordinary paragraph containing ``text``.

    The source paragraph is explicitly non-bold and black; the translated
    paragraph keeps the style defaults for weight and colour and is kept
    together on one page.
    """
    paragraph = document.add_paragraph()
    _apply_paragraph_layout(
        paragraph,
        first_line_indent=Cm(0),
        keep_together=is_translation,
        keep_with_next=False,
    )
    run = _add_text_run(paragraph, text)
    if not is_translation:
        run.font.bold = False
        run.font.color.rgb = RGBColor(0, 0, 0)
    return paragraph


with open('output.txt', 'r', encoding='utf-8', errors='ignore') as file:
    lines = file.readlines()  # read every line into a list

# Create the Word document.
doc = docx.Document()
for line in lines:
    line = line.replace('\n', '')
    # Classify the line once instead of once per branch.
    rule = _HEADING_RULES.get(check_text_pattern(line))
    if rule is None:
        # Body text: source paragraph followed by its translation.
        _add_body_paragraph(doc, line, is_translation=False)
        _add_body_paragraph(doc, baidu_api_fanyi(line), is_translation=True)
        continue
    level, prefix_pattern = rule
    # The heading keeps its section number; the number is removed only from
    # the text that is sent to the translator.
    new_text = re.sub(prefix_pattern, '', line)
    _add_heading(doc, level, line, is_translation=False)
    _add_heading(doc, level, baidu_api_fanyi(new_text), is_translation=True)

# Save the document.
doc.save('example.docx')
将 PDF 翻译并导出为 Word 双语文档,保留标题分级,方便阅读;暂未处理 PDF 中的表格。可用于快速阅读英文文档。
最新推荐文章于 2024-11-12 17:43:42 发布