PDF 文档转换:先搜索相关路径下的 PDF 文档,把这些文档移动到指定的路径下,再对该路径下的 PDF 文档进行转换。
缺点:无法转换带有图片的 PDF 文档,且转换后的文档存在格式问题。
安装相应的库
1)pip install pdfminer3k(pdfminer3k 是 pdfminer 的 Python 3 移植版)
2)安装docx库
pip install python_docx
使用了os.walk对特定路径下的pdf文档进行查找,并对该文档进行移动
import os
import shutil
import importlib
import sys
import re
def load_file(src_dir=r'C:\Users\ALFIEL\Desktop\20190527',
              dst_dir=r'C:\Users\ALFIEL\Desktop\pdfdocement'):
    """Move every PDF found under *src_dir* into *dst_dir* as ``<n>.pdf``.

    The tree below *src_dir* is walked with ``os.walk``; each file whose name
    ends in ``.pdf`` (preceded by at least one word character) is moved to
    *dst_dir* and renamed to the next free number (``1.pdf``, ``2.pdf``, ...).

    Args:
        src_dir: Directory tree to search for PDF files.
        dst_dir: Directory that receives the renamed files. It is created on
            demand the first time a PDF is found.

    Returns:
        int: Number of files that were moved.
    """
    # Raw string + end anchor: the original pattern also matched names such
    # as "report.pdf.bak" because it was not anchored.
    pdf_name = re.compile(r'\w\.pdf$')
    dst_key = os.path.normcase(os.path.abspath(dst_dir))
    moved = 0
    index = 1
    for root, dirs, files in os.walk(src_dir):
        print((root, dirs, files))
        if os.path.normcase(os.path.abspath(root)) == dst_key:
            # Never re-collect files that already live in the destination.
            dirs[:] = []
            continue
        for name in files:
            if not pdf_name.search(name):
                continue
            os.makedirs(dst_dir, exist_ok=True)
            # Skip numbers that are already taken so that a second run does
            # not overwrite PDFs collected by an earlier one.
            target = os.path.join(dst_dir, str(index) + ".pdf")
            while os.path.exists(target):
                index += 1
                target = os.path.join(dst_dir, str(index) + ".pdf")
            shutil.move(os.path.join(root, name), target)
            index += 1
            moved += 1
    return moved
load_file()
转换完整的代码如下:
#pdf转换器
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter,process_pdf
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from docx import Document
from pdfminer.layout import *
document = Document()
import warnings
warnings.filterwarnings("ignore")
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from io import StringIO
from urllib.request import urlopen
import pandas as pd
import os
def readPDF(pdfFile):
    """Extract the text of a PDF and return it as one string.

    Args:
        pdfFile: Open file object for the PDF (the caller in this file opens
            it in ``'rb'`` mode). It is handed to pdfminer's ``process_pdf``
            and is not closed here.

    Returns:
        str: Everything ``TextConverter`` wrote into the in-memory buffer.
    """
    # Resource manager that pdfminer uses to hold shared resources.
    rsrcmgr = PDFResourceManager()
    # The converted text is collected in memory rather than on disk.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr, retstr, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdfFile)
        finally:
            # Close the converter even when parsing fails part-way.
            device.close()
        return retstr.getvalue()
def save_to_file(file_name, contents):
    """Write *contents* to *file_name* as UTF-8 text, replacing any old file.

    Only the extracted text is stored; the file extension (txt, csv, ...) is
    whatever the caller puts in *file_name*.

    Args:
        file_name: Path of the file to create or overwrite.
        contents: Text to write.
    """
    # encoding='utf-8' is required: the original author notes that without it
    # a GBK encoding error is raised when writing txt/csv output.
    # The with-block closes the handle even if write() raises.
    with open(file_name, 'w', encoding='utf-8') as fh:
        fh.write(contents)
# save_to_file('mobiles.txt', 'your contents str')
def main():
    """Extract the text of every PDF in the collection folder into ``c.csv``.

    Walks ``C:\\Users\\ALFIEL\\Desktop\\pdfdocement``, runs ``readPDF`` on each
    file whose name ends in ``.pdf`` and writes the combined text to ``c.csv``
    in the current working directory. Nothing is written when no PDF is found.
    """
    # Raw string + end anchor so that e.g. "x.pdf.bak" is not treated as a PDF.
    pdf_name = re.compile(r'\w\.pdf$')
    texts = []
    for root, dirs, files in os.walk(r"C:\Users\ALFIEL\Desktop\pdfdocement"):
        for name in files:
            if not pdf_name.search(name):
                continue
            # The with-block releases the PDF handle after extraction.
            with open(os.path.join(root, name), 'rb') as pdfFile:
                texts.append(readPDF(pdfFile))
    # Write once at the end: saving inside the loop overwrote c.csv for every
    # PDF, so only the text of the last file survived.
    if texts:
        save_to_file('c.csv', '\n'.join(texts))
def save_to_doxc(file_name, path):
    """Append the text of a PDF to *path* and print per-type object counts.

    The output is plain UTF-8 text written with ``open(..., 'a')``; it is not
    a real Word document, whatever extension *path* carries.

    Args:
        file_name: The PDF to read; anything ``open()`` accepts (a path or an
            already-open OS-level file descriptor).
        path: File that receives the extracted text (opened in append mode).

    Raises:
        PDFTextExtractionNotAllowed: If the document reports that text
            extraction is not permitted.
    """
    # Open the PDF in binary mode; the with-block guarantees it is closed,
    # which the original never did.
    with open(file_name, 'rb') as fn:
        # Build the parser and the document object and link them together.
        parser = PDFParser(fn)
        doc = PDFDocument()
        parser.set_document(doc)
        doc.set_parser(parser)
        # Initialise without an explicit password.
        doc.initialize()
        if not doc.is_extractable:
            raise PDFTextExtractionNotAllowed

        resource = PDFResourceManager()
        device = PDFPageAggregator(resource, laparams=LAParams())
        # Interpreter that renders each page into the aggregator device.
        interpreter = PDFPageInterpreter(resource, device)
        num_page, num_image, num_curve, num_TextBoxHorizontal = 0, 0, 0, 0

        # Open the output once instead of re-opening it for every element.
        with open(path, 'a', encoding='utf-8') as f:
            for page in doc.get_pages():
                num_page += 1
                interpreter.process_page(page)
                layout = device.get_result()
                print(layout)
                for x in layout:
                    if isinstance(x, LTImage):
                        num_image += 1
                    if isinstance(x, LTCurve):
                        num_curve += 1
                    if isinstance(x, LTTextBoxHorizontal):
                        num_TextBoxHorizontal += 1
                    # Save the text of every element that exposes get_text().
                    if hasattr(x, "get_text"):
                        results = x.get_text()
                        print(results)
                        f.write(results)
                        f.write('\n')

    print('对象数量:\n','页面数:%s\n'%num_page,'图片数:%s\n'%num_image,'曲线数:%s\n'%num_curve,'水平文本框:%s\n'
          %num_TextBoxHorizontal)
    print ('处理完成')
def main2():
    """Run ``save_to_doxc`` on every PDF in the collection folder.

    Walks ``C:\\Users\\ALFIEL\\Desktop\\pdfdocement``; for each file whose name
    ends in ``.pdf`` the extracted text goes to a sibling file with the same
    stem and a ``.doc`` extension.
    """
    # Raw string + end anchor so that e.g. "x.pdf.bak" is not treated as a PDF.
    pdf_name = re.compile(r'\w\.pdf$')
    for root, dirs, files in os.walk(r"C:\Users\ALFIEL\Desktop\pdfdocement"):
        for name in files:
            if not pdf_name.search(name):
                continue
            pdf_path = os.path.join(root, name)
            # Pass the path itself: save_to_doxc opens it read-only, so no
            # read/write descriptor from os.open is needed.
            # splitext swaps only the extension; str.replace would also
            # rewrite ".pdf" inside directory names.
            save_to_doxc(pdf_path, path=os.path.splitext(pdf_path)[0] + '.doc')
if __name__ == '__main__':
main2()
注意:当写入txt、csv文件时,需要把编码模式encoding设置为utf-8
doc转pdf或其他
还有通过调用win32接口进行doc的转换
from win32com.client import Dispatch, constants
def doc2pdf(input1, output, file_format=17):
    """Convert a Word document through the Word COM interface.

    Args:
        input1: Path of the document to open (opened read-only).
        output: Path of the file to write.
        file_format: Numeric ``SaveAs`` format code. Defaults to 17, which is
            wdFormatPDF in the format table that accompanies this script.

    Returns:
        bool: True when the document was saved, False when opening or saving
        raised (the exception is printed).
    """
    w = Dispatch('Word.Application')
    # NOTE(review): the Word application object is never quit, as in the
    # original; confirm whether w.Quit() is wanted before adding it.
    doc = None
    try:
        doc = w.Documents.Open(input1, ReadOnly=1)
        doc.SaveAs(output, file_format)
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # doc stays None when Open() itself failed; closing unconditionally
        # raised UnboundLocalError and hid the intended False result.
        if doc is not None:
            doc.Close()
def main():
    """Convert the sample ``1.doc`` to ``6.pdf`` and print whether it worked."""
    input1 = r'C:\Users\ALFIEL\Desktop\pdfdocement\1.doc'
    output = r'C:\Users\ALFIEL\Desktop\pdfdocement\6.pdf'
    # The original called doc2html, which is not defined anywhere in this
    # script and raised NameError; the converter defined here is doc2pdf.
    rc = doc2pdf(input1, output)
    if rc:
        print('转换成功')
    else:
        print('转换失败')
if __name__ == '__main__':
main()
其中SaveAs(output, wdFormat)
wdFormat参数如下:
wdFormatDocument = 0
wdFormatDocument97= 0
wdFormatDocumentDefault = 16
wdFormatDOSText = 4
wdFormatDOSTextLineBreaks = 5
wdFormatEncodedText = 7
wdFormatFilteredHTML = 10
wdFormatFlatXML = 19
wdFormatFlatXMLMacroEnabled = 20
wdFormatFlatXMLTemplate = 21
wdFormatFlatXMLTemplateMacroEnabled = 22
wdFormatHTML = 8
wdFormatPDF = 17
wdFormatRTF = 6
wdFormatTemplate = 1
wdFormatTemplate97 = 1
wdFormatText = 2
wdFormatTextLineBreaks = 3
wdFormatUnicodeText = 7
wdFormatWebArchive = 9
wdFormatXML = 11
wdFormatXMLDocument = 12
wdFormatXMLDocumentMacroEnabled = 13
wdFormatXMLTemplate = 14
wdFormatXMLTemplateMacroEnabled = 15
wdFormatXPS = 18
照着字面意思应该能对应到相应的文件格式,如果你是office 2003可能支持不了这么多格式。word文件转html有两种格式可选wdFormatHTML、wdFormatFilteredHTML(对应数字8、10),区别是如果是wdFormatHTML格式的话,word文件里面的公式等ole对象将会存储成wmf格式,而选用wdFormatFilteredHTML的话公式图片将存储为gif格式,而且目测可以看出用wdFormatFilteredHTML生成的HTML明显比wdFormatHTML要干净许多。
原文链接:https://blog.csdn.net/weixin_41341221/article/details/100204128