import os
import fitz
import json
# os.chdir("data")
# print(os.listdir())
from collections import namedtuple
# Lightweight record type with fields x0, y0, x1, y1, span and page_no
# (presumably a span's bounding box plus its page number — TODO confirm;
# nothing in the code visible here instantiates it).
TextUnit = namedtuple('TextUnit', ['x0', 'y0', 'x1', 'y1', 'span', 'page_no'])
# Directory containing this script; the "data", "txt2", "img" and "to_pdf"
# folders used in this file are all resolved relative to it.
base_path = os.path.dirname(os.path.abspath(__file__))
# Folder holding the input files.
data_dir_path = os.path.join(base_path, "data")
# Switch into the data folder so that the bare os.listdir() below enumerates it.
# This changes the process working directory at import time.
os.chdir(data_dir_path)
# Map each directory entry's name without its extension -> absolute path of
# that entry. Every entry is included (no filtering by type), and entries that
# share the same stem overwrite one another.
tasks = {os.path.splitext(file)[0]:os.path.join(data_dir_path, file) for file in os.listdir()}
# Convert to txt
def _join_page_text(page_dict):
    """Concatenate the span texts of one parsed page into a single string.

    ``page_dict`` is the parsed JSON of a page: a dict whose ``blocks`` hold
    ``lines``, which in turn hold ``spans`` carrying a ``text`` entry. Missing
    keys are treated as empty.

    A line break is inserted before a block only when some text has already
    been collected and the most recently collected span ended with a space.
    """
    parts = []
    new_line = True
    for block in page_dict.get('blocks', []):
        if new_line and parts:
            parts.append("\n")
        for line in block.get('lines', []):
            for span in line.get('spans', []):
                text = span.get('text', '')
                parts.append(text)
                new_line = text.endswith(' ')
    return "".join(parts)


def to_txt():
    """Dump the text of every page of every document in ``tasks`` to .txt files.

    For the document at position ``index`` stored under key ``fname``, the
    folder ``<base_path>/txt2/<index>-<fname>`` is created (if needed) and one
    UTF-8 encoded file ``<fname>-<NNN>.txt`` is written per page, where ``NNN``
    is the 0-based page number zero-padded to at least three digits.

    Side effects: prints one ``<index>-<total>`` progress line per document
    and changes the process working directory to each output folder in turn.
    """
    tasks_len = len(tasks)
    for index, fname in enumerate(tasks):
        print("{}-{}".format(index, tasks_len))
        out_dir = os.path.join(base_path, "txt2", str(index) + "-" + fname)
        # exist_ok so that a re-run (or a missing "txt2" parent folder) does
        # not abort the whole batch.
        os.makedirs(out_dir, exist_ok=True)
        # The relative file names used below depend on this working directory.
        os.chdir(out_dir)
        doc = fitz.open(tasks[fname])
        try:
            for page_no, page in enumerate(doc):
                page_num = str(page_no).zfill(3)
                # NOTE(review): getText is PyMuPDF's legacy camelCase spelling;
                # recent releases name it get_text — confirm against the
                # installed version.
                page_dict = json.loads(page.getText('json'))
                data = _join_page_text(page_dict).encode("utf8")
                # "with" guarantees the file is closed even if the write fails.
                with open(fname + "-" + page_num + ".txt", "wb") as out:
                    out.write(data)
        finally:
            # Release the document handle before moving on to the next file.
            doc.close()
# Convert to images
def to_img():
    """Render every page of every document in ``tasks`` to a PNG file.

    For the document at position ``index`` stored under key ``fname``, the
    folder ``<base_path>/img/<index>-<fname>`` is created (if needed) and each
    page is written there as ``page-<n>.png``, where ``n`` is ``page.number``.

    Side effect: changes the process working directory to each output folder
    in turn.
    """
    for index, fname in enumerate(tasks):
        out_dir = os.path.join(base_path, "img", str(index) + "-" + fname)
        # exist_ok so that a re-run (or a missing "img" parent folder) does
        # not abort the whole batch.
        os.makedirs(out_dir, exist_ok=True)
        # The relative file names used below depend on this working directory.
        os.chdir(out_dir)
        doc = fitz.open(tasks[fname])
        try:
            # Iterate pages directly: the file name comes from page.number, so
            # no separate counter is needed (and the outer ``index`` is no
            # longer rebound by the inner loop).
            for page in doc:
                # NOTE(review): getPixmap / writePNG are PyMuPDF's legacy
                # camelCase spellings; recent releases use get_pixmap /
                # Pixmap.save — confirm against the installed version.
                pix = page.getPixmap(alpha=False)
                pix.writePNG("page-%i.png" % page.number)
        finally:
            # Release the document handle before moving on to the next file.
            doc.close()
# Extract a page range
def to_delete():
    """Save the leading pages of every document in ``tasks`` as a new PDF.

    For the document at position ``index`` stored under key ``fname``, the
    folder ``<base_path>/to_pdf/<index>-<fname>`` is created (if needed) and a
    new PDF containing the source pages up to and including page index 9 (the
    first 10 pages) is saved there as ``first-and-last-10.pdf``.

    Side effect: changes the process working directory to each output folder
    in turn.
    """
    for index, fname in enumerate(tasks):
        out_dir = os.path.join(base_path, "to_pdf", str(index) + "-" + fname)
        # exist_ok so that a re-run (or a missing "to_pdf" parent folder) does
        # not abort the whole batch.
        os.makedirs(out_dir, exist_ok=True)
        # The relative file name used below depends on this working directory.
        os.chdir(out_dir)
        doc = fitz.open(tasks[fname])
        try:
            # fitz.open() without arguments starts a new, empty document.
            doc2 = fitz.open()
            try:
                # NOTE(review): insertPDF is PyMuPDF's legacy camelCase
                # spelling; recent releases name it insert_pdf — confirm
                # against the installed version.
                doc2.insertPDF(doc, to_page=9)  # first 10 pages
                # NOTE(review): the file name mentions the last 10 pages as
                # well, but only the first 10 are copied here — confirm which
                # behaviour is intended before renaming or extending.
                doc2.save("first-and-last-10.pdf")
            finally:
                doc2.close()
        finally:
            # Release the source document before moving on to the next file.
            doc.close()
# python pdf 转图片 转txt 信息抽取 表格提取
# 最新推荐文章于 2024-08-19 03:11:16 发布