要查看PDF内容,看得眼睛痛,那就编程吧。
import xlwings as xw
import glob
import pdfplumber
import re
import multiprocessing
def readPdf(path):
    """Return the text of a PDF with the last line of every page removed.

    Args:
        path: Location of the PDF file, handed to ``pdfplumber.open``.

    Returns:
        The text of all pages joined in page order with no separator
        between pages. The final line of each page is dropped; in the
        reports this script targets that line is the page number.
    """
    page_texts = []
    with pdfplumber.open(path) as pdf:
        for page in pdf.pages:
            # Some pdfplumber releases return None instead of '' for a page
            # without extractable text; fall back to '' so such a page is
            # skipped instead of crashing on None.split().
            text = page.extract_text() or ''
            # Drop the last line of the page (the page-number footer).
            page_texts.append('\n'.join(text.split('\n')[:-1]))
    return ''.join(page_texts)
# Captures the text that follows each of the three section labels, up to the
# next full stop "。". re.S lets a captured section run across line breaks.
_REPORT_SECTIONS = re.compile(
    '润滑状态:(.*?)。.*?染与磨损状态:(.*?)。.*?议措施:(.*?)。', re.S)


def pare_pdf(content):
    """Pull the three assessment sections out of a report's text.

    Args:
        content: Full text of one report, as returned by ``readPdf``.

    Returns:
        A dict with integer keys 0, 1 and 2 holding, in order, the text
        captured after the lubrication-state label, the
        contamination-and-wear-state label and the suggested-measures
        label. Every " \\n" (space followed by a newline) is removed from
        each captured section. If the three labels are not all found in
        that order, all three values are empty strings, so one unexpected
        report does not abort a batch with an IndexError.
    """
    # Only the first occurrence is used, so search() is enough.
    match = _REPORT_SECTIONS.search(content)
    if match is None:
        return {0: '', 1: '', 2: ''}
    return {
        index: section.replace(' \n', '')
        for index, section in enumerate(match.groups())
    }
def pdf_main(path, msg):
    """Process one PDF: read its text, parse it and return the parsed data.

    Prints the path before starting and "<process name>-<msg>" once the
    report has been parsed.

    Args:
        path: Path of the PDF to process.
        msg: Text appended to the current process name in the second print.

    Returns:
        Whatever ``pare_pdf`` returns for the text of this PDF.
    """
    print(path)
    # Read the text, then parse it.
    parsed = pare_pdf(readPdf(path))
    worker_name = multiprocessing.current_process().name
    print('-'.join((worker_name, msg)))
    return parsed
def excel_write(result):
    """Write the parsed reports into Sheet1 of 'pdftoexcel.xlsm'.

    Sheet1 is cleared first. Report number ``i`` (0-based) is written to
    row ``i + 1``, with its three values placed in columns B, C and D.
    The workbook is saved only if every row was written successfully.

    Args:
        result: Iterable of async results (objects with a ``get()`` method)
            whose value supports indexing with 0, 1 and 2, e.g. the list of
            ``Pool.apply_async`` results for ``pdf_main``.
    """
    app = xw.App(visible=False, add_book=False)
    try:
        wb = app.books.open('pdftoexcel.xlsm')
        try:
            sheet = wb.sheets["Sheet1"]
            sheet.clear()
            # Iterate the argument itself; the previous code read a global
            # named ``results`` and silently ignored the parameter.
            for i, res in enumerate(result):
                res_value = res.get()
                row = [res_value[0], res_value[1], res_value[2]]
                # Assigning a flat list fills B, C and D of the row at once.
                sheet.range("B" + str(i + 1)).value = row
                print(res_value)
            wb.save()
        finally:
            wb.close()
    finally:
        # Always shut the hidden Excel instance down, even when a worker
        # result re-raises in get(), so no orphaned process is left behind.
        app.quit()
if __name__ == "__main__":
    # Collect the paths of all PDF files in the report folder with glob.
    # Raw string: the old literal mixed invalid escapes such as "\风" with
    # doubled backslashes; the runtime value of the path is unchanged.
    pdf_path = r"G:\风机油品检测\2019年 华能会理四期风电场油品检测报告\华能会理四期风电场油品检测报告\1 主齿"
    pdfs = glob.glob("{}/*.pdf".format(pdf_path))
    cpus = multiprocessing.cpu_count()
    # Number of worker processes: four times the CPU count.
    p = multiprocessing.Pool(cpus * 4)
    # One async task per PDF; list order follows the order of ``pdfs``.
    results = []
    for i, path in enumerate(pdfs):
        msg = "hello %d" % (i)
        # args carries both positional parameters of pdf_main.
        results.append(p.apply_async(pdf_main, args=(path, msg)))
    # No more tasks will be submitted; wait for all workers to finish.
    p.close()
    p.join()
    print("Sub-process(es) done.")
    excel_write(results)
    print("excel done.")
更多关于多线程和多进程的内容,请看:
https://www.cnblogs.com/whatisfantasy/p/6440585.html