import PyPDF4
import pikepdf
import fitz
#对pdf文件进行简单的解密
def jiemi(pdfpath):
    """Write an unencrypted copy of a PDF that opens with an empty password.

    The file is probed with PyPDF4. When it reports encryption, pikepdf
    re-opens it with the empty password and saves a copy whose name is
    ``pdfpath`` with ``_new`` inserted before its last four characters
    (i.e. before a ``.pdf`` suffix).

    Args:
        pdfpath: Path to the source PDF. The naming logic assumes a
            four-character extension such as ``.pdf``.

    Returns:
        The path of the decrypted copy when the source is encrypted,
        otherwise ``pdfpath`` itself, so the returned path always refers
        to a file that exists.
    """
    # Read-only access is enough for the probe, and the context manager
    # releases the handle (the previous "rb+" open was never closed).
    with open(pdfpath, "rb") as fp:
        encrypted = PyPDF4.pdf.PdfFileReader(fp).isEncrypted

    if not encrypted:
        # Nothing to decrypt, so no copy is written.
        return pdfpath

    new_pdfpath = pdfpath[:-4] + '_new' + pdfpath[-4:]
    with pikepdf.open(pdfpath, password='') as pdf:
        pdf.save(new_pdfpath)
    return new_pdfpath
#将每一页转化为图片并保存
def pdf_image(pdf_name, zoom=3.0):
    """Render every page of a PDF to an image file and return the paths.

    Args:
        pdf_name: Path to the PDF. Its last four characters are stripped
            to build the base name of the output files.
        zoom: Scale factor applied to both axes when rasterising a page.
            Defaults to 3.0, the value that used to be hard-coded.

    Returns:
        A list of the written image paths in page order. Each path has the
        form ``<base>_<n>.jpg`` where ``n`` starts at 1.
    """
    img_paths = []
    pdf = fitz.Document(pdf_name)
    try:
        # The transform does not depend on the page, so build it once.
        trans = fitz.Matrix(zoom, zoom)
        for pg in range(pdf.pageCount):
            page = pdf[pg]
            pm = page.getPixmap(matrix=trans, alpha=False)
            # NOTE(review): the file is written with writePNG, so its
            # content is PNG even though the name ends in ".jpg". The name
            # is kept so existing consumers of the returned paths still work.
            img_path = pdf_name[:-4] + '_' + str(pg + 1) + '.jpg'
            pm.writePNG(img_path)
            img_paths.append(img_path)
    finally:
        # Release the document even when rendering a page fails.
        pdf.close()
    return img_paths
# If you get the error:
#     ModuleNotFoundError: No module named 'frontend'
# see: https://blog.csdn.net/xc_zhou/article/details/102596542
import os
import fitz
import json
# os.chdir("data")
# print(os.listdir())
from collections import namedtuple
# Record type with six fields: four coordinates (x0, y0, x1, y1), a span
# and a page number. Presumably one positioned text span per instance.
TextUnit = namedtuple('TextUnit', 'x0 y0 x1 y1 span page_no')

# Directory that holds this script; the "data" input folder lives under it.
base_path = os.path.split(os.path.abspath(__file__))[0]
data_dir_path = os.path.join(base_path, "data")

# The working directory is switched to the data folder at import time.
os.chdir(data_dir_path)

# Maps each entry of the data folder, with its extension removed, to the
# entry's absolute path.
tasks = {
    os.path.splitext(entry)[0]: os.path.join(data_dir_path, entry)
    for entry in os.listdir(data_dir_path)
}
# 转 txt
def to_txt():
    """Dump the text of every page of every task document to UTF-8 files.

    For each entry of the module-level ``tasks`` mapping, a directory
    ``<base_path>/txt2/<index>-<name>`` is created if needed. One file
    ``<name>-<NNN>.txt`` is written into it per page, where ``NNN`` is the
    zero-based page number padded to at least three digits.

    Progress is printed as ``<index>-<total>`` before each document.
    Output files are addressed by absolute path, so the process working
    directory is not changed by this function.
    """
    tasks_len = len(tasks)
    for index, fname in enumerate(tasks):
        print("{}-{}".format(index, tasks_len))
        out_dir = os.path.join(base_path, "txt2", str(index) + "-" + fname)
        # exist_ok lets the conversion be re-run, and makedirs also creates
        # a missing "txt2" parent.
        os.makedirs(out_dir, exist_ok=True)
        doc = fitz.open(tasks[fname])
        try:
            for page_no, page in enumerate(doc):
                out_name = "{}-{:03d}.txt".format(fname, page_no)
                # Binary mode keeps the "\n" separators byte-exact on every
                # platform.
                with open(os.path.join(out_dir, out_name), "wb") as out:
                    out.write(_page_text(page).encode("utf8"))
        finally:
            doc.close()


def _page_text(page):
    """Return the concatenated span texts of one page.

    A newline is inserted before a block when some text has already been
    collected and the most recently seen span ended with a space.
    """
    text_list = []
    new_line = True
    json_obj = json.loads(page.getText('json'))
    for block in json_obj.get('blocks', []):
        if new_line and text_list:
            text_list.append("\n")
        for line in block.get('lines', []):
            for span in line.get('spans', []):
                text = span.get('text', '')
                text_list.append(text)
                new_line = text.endswith(' ')
    return "".join(text_list)
# 转图片
def to_img():
    """Render every page of every task document to a PNG file.

    For each entry of the module-level ``tasks`` mapping, a directory
    ``<base_path>/img/<index>-<name>`` is created if needed. Each page is
    saved into it as ``page-<number>.png``, using the page's own
    ``number`` attribute.

    Output files are addressed by absolute path, so the process working
    directory is not changed by this function.
    """
    for index, fname in enumerate(tasks):
        out_dir = os.path.join(base_path, "img", str(index) + "-" + fname)
        # exist_ok lets the conversion be re-run, and makedirs also creates
        # a missing "img" parent.
        os.makedirs(out_dir, exist_ok=True)
        doc = fitz.open(tasks[fname])
        try:
            # Iterate pages directly; the old inner loop reused the name
            # ``index`` and shadowed the document counter.
            for page in doc:
                pix = page.getPixmap(alpha=False)
                pageimg_name = "page-%i.png" % page.number
                pix.writePNG(os.path.join(out_dir, pageimg_name))
        finally:
            doc.close()
# 截取
def to_delete():
    """Write a PDF holding only the leading pages of each task document.

    For each entry of the module-level ``tasks`` mapping, a directory
    ``<base_path>/to_pdf/<index>-<name>`` is created if needed. Pages up
    to and including index 9 of the source are copied into a new document
    saved there as ``first-and-last-10.pdf``.

    NOTE(review): despite the file name, only the leading pages are
    copied; the "last 10 pages" step was already disabled. The name is
    kept so existing consumers of the output still find the file.

    Output files are addressed by absolute path, so the process working
    directory is not changed by this function.
    """
    for index, fname in enumerate(tasks):
        out_dir = os.path.join(base_path, "to_pdf", str(index) + "-" + fname)
        # exist_ok lets the extraction be re-run, and makedirs also creates
        # a missing "to_pdf" parent.
        os.makedirs(out_dir, exist_ok=True)
        doc = fitz.open(tasks[fname])
        try:
            doc2 = fitz.open()  # new, empty PDF
            try:
                doc2.insertPDF(doc, to_page=9)  # first 10 pages
                doc2.save(os.path.join(out_dir, "first-and-last-10.pdf"))
            finally:
                doc2.close()
        finally:
            doc.close()
# for index, page in enumerate(doc):
# pix = page.getPixmap(alpha=False)
# pageimg_name = "page-%i.png" % page.number
# pix.writePNG(pageimg_name)