# -*- coding: utf-8 -*-
"""
Created on Wed Oct 21 09:30:07 2020
@author: 元白
"""
import camelot
import pandas
import time
import os
import glob
import re
import queue
import threading
import time
#from multiprocessing import Pool
import multiprocessing as mp
def func_pdf_find(pdf_path):
    """Return the paths of all ``*.pdf`` files directly inside *pdf_path*.

    Args:
        pdf_path: Directory to search. The search is not recursive.

    Returns:
        list[str]: The matching paths, sorted so that the processing order
        is the same on every run and platform. Empty when nothing matches.
    """
    # glob yields entries in arbitrary, OS-dependent order; sort for determinism.
    return sorted(glob.glob("{}/*.pdf".format(pdf_path)))
def get_table_form_pdf(pdf, pages='1-6', flavor='stream'):
    """Extract tables from one PDF file with camelot.

    Args:
        pdf: Path of the PDF file to read.
        pages: Page selection string passed to ``camelot.read_pdf``. The
            default keeps the original behaviour of reading only the first
            six pages.
        flavor: Parsing flavor passed to ``camelot.read_pdf``; the default
            is the previously hard-coded ``'stream'``.

    Returns:
        Whatever ``camelot.read_pdf`` returns for the given arguments.
    """
    return camelot.read_pdf(pdf, pages=pages, flavor=flavor)
def put_table_to_cvs(tables, pdf, csv_path='dfdemo.csv'):
    """Append the first six extracted tables of one PDF to a CSV file.

    The tables are consumed in pairs (0, 1), (2, 3) and (4, 5). For each pair
    the cells in rows 0-9 / columns 1-10 of both tables are joined side by
    side on the row index (second table of the pair on the left), a marker
    row ``'angle', y, z, 0, 0, ...`` is placed in front, and the first 11 rows
    are appended to *csv_path* without header or index.

    Args:
        tables: Indexable sequence of objects exposing a ``.df`` DataFrame
            (e.g. the result of ``get_table_form_pdf``).
        pdf: Path of the source PDF. Its file name must look like
            ``<a>_<y>_<b>_<z>.pdf``; fields 1 and 3 are parsed as floats.
        csv_path: Output file, opened in append mode. Defaults to the
            previously hard-coded ``'dfdemo.csv'``.

    Raises:
        IndexError: If fewer than six tables are supplied, or the file name
            has fewer than four ``_``-separated fields.
        ValueError: If field 1 or 3 of the file name is not a number.
    """
    # Parse the sequence numbers from the file name only, so that underscores
    # in directory names cannot shift the field positions.
    stem = os.path.basename(pdf).split(".pdf")[0]
    fields = stem.split("_")
    y = float(fields[1])
    z = float(fields[3])

    # Fail before touching the output file instead of after a partial write.
    if len(tables) < 6:
        raise IndexError(
            "expected at least 6 tables, got {}".format(len(tables))
        )

    # Only the first six tables are used, two at a time.
    for i in range(0, 6, 2):
        df1 = tables[i].df.loc[0:9, 1:10]
        df2 = tables[i + 1].df.loc[0:9, 1:10]
        merged = pandas.merge(df2, df1, left_index=True, right_index=True)
        merged = merged.sort_index()

        # Marker row sized to the merged frame (20 cells for two 10-column
        # tables) rather than a hard-coded list that breaks on other widths.
        marker = ['angle', y, z] + [0] * (merged.shape[1] - 3)
        header = pandas.DataFrame([marker], columns=merged.columns)
        combined = pandas.concat([header, merged], ignore_index=True)

        combined.head(11).to_csv(
            csv_path, index=False, mode='a+', header=False
        )
def job(d, l, idx, pdf):
    """Extract the tables of one PDF and write them out in submission order.

    Extraction runs without the lock so that workers overlap; writing is
    serialised: a job only writes when the shared counter ``d[1]`` equals its
    own *idx*, and then advances the counter for the next job.

    Args:
        d: Shared mapping; ``d[1]`` holds the ``idx`` of the job whose turn
            it is to write.
        l: Lock guarding ``d[1]`` and the output file. Must support the
            context-manager protocol.
        idx: Position of this job in the submission order.
        pdf: Path of the PDF to process.

    Returns:
        int: Always ``1`` on success.

    Raises:
        Exception: Whatever extraction or writing raised. The counter is
            still advanced and the lock released first, so later jobs are
            not blocked by a failing one.
    """
    # Expensive step: extracting tables does not depend on ordering, so it is
    # done before taking the lock. A failure is remembered rather than raised
    # right away, because this job must still take its turn; otherwise every
    # job with a higher idx would wait forever.
    failure = None
    tables = None
    try:
        tables = get_table_form_pdf(pdf)
    except Exception as exc:
        failure = exc

    while True:
        with l:
            if d[1] == idx:
                print('pdf,d[1] / idx :', d[1], idx)
                try:
                    if failure is None:
                        put_table_to_cvs(tables, pdf)
                finally:
                    # Hand the turn to the next job even if writing failed.
                    d[1] += 1
                break
        # Not this job's turn yet: poll again shortly, lock released.
        time.sleep(0.01)

    if failure is not None:
        raise failure
    return 1
if __name__ == '__main__':
    # Script entry point: process every PDF in the input folder with a pool
    # of 8 worker processes, then report failures and the elapsed time.
    with mp.Manager() as manager:
        time_start = time.time()
        # Raw string: '\p' is not a valid escape sequence in a normal literal.
        pdfs = func_pdf_find(r'd:\python-demo')
        lock = manager.Lock()
        d = manager.dict()
        d[1] = 1  # idx of the job that is allowed to write next
        pool = mp.Pool(8)
        pending = []
        try:
            for idx, pdf in enumerate(pdfs, start=1):
                # Submit asynchronously; keep the handle so errors can be
                # reported once the pool has finished.
                p_obj = pool.apply_async(job, args=(d, lock, idx, pdf))
                pending.append((pdf, p_obj))
                # NOTE(review): presumably staggers submissions; confirm
                # whether this delay is still needed.
                time.sleep(0.9)
            pool.close()  # no more tasks will be submitted
            pool.join()   # wait for all workers to finish
        finally:
            # No-op after a clean join; stops workers if we were interrupted.
            pool.terminate()

        # apply_async swallows worker exceptions unless the result is read.
        for pdf, p_obj in pending:
            try:
                p_obj.get()
            except Exception as exc:
                print('failed : ', pdf, exc)

        time_end = time.time()
        print('used times : ', time_end - time_start)
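# python3 pdf 无边框 表格 进程池多进程 批量提取
# (python3: batch extraction of borderless PDF tables using a multiprocessing process pool)
# 最新推荐文章于 2024-07-03 17:06:59 发布
# (web-page residue: "latest recommended article published on 2024-07-03 17:06:59")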