from
pdfminer.converter
import
PDFPageAggregator
from
pdfminer.layout
import
LAParams
from
pdfminer.pdfparser
import
PDFParser, PDFDocument
from
pdfminer.pdfinterp
import
PDFResourceManager, PDFPageInterpreter
from
pdfminer.pdfdevice
import
PDFDevice
from
urllib.request
import
urlopen
import
traceback
def pdfread(file, mubiaofile):
    """Extract the text of a PDF and append it to a text file.

    Each page is run through the page interpreter; every element of the
    resulting layout that has a ``get_text`` method gets its text appended to
    ``mubiaofile``, followed by a newline. ``'ok'`` is printed once all pages
    have been processed.

    Args:
        file: Path of the PDF file to read.
        mubiaofile: Path of the output text file. It is opened in append mode
            with UTF-8 encoding, so existing content is kept and non-ASCII
            text does not depend on the platform's default code page.
    """
    # "rb": open the PDF for reading in binary mode. The ``with`` block makes
    # sure the handle is closed even if parsing fails part-way through.
    with open(file, "rb") as fp:
        # Alternative source left over from the original code:
        # fp = urlopen("http://www.tencent.com/zh-cn/articles/8003251479983154.pdf")

        # Create a PDF parser object associated with the file object.
        parser = PDFParser(fp)
        # Create a PDF document object that stores the document structure.
        doc = PDFDocument()
        # Connect the parser and document objects.
        parser.set_document(doc)
        doc.set_parser(parser)
        # Supply the password for initialization.
        # (If no password is set, give an empty string.)
        doc.initialize("")

        # Create the PDF resource manager.
        resource = PDFResourceManager()
        # Layout-analysis parameters (library defaults).
        laparam = LAParams()
        # Create the aggregator device that collects each page's layout.
        device = PDFPageAggregator(resource, laparams=laparam)
        # Create the PDF page interpreter.
        interpreter = PDFPageInterpreter(resource, device)

        # Open the output once instead of once per text element; it is opened
        # only after the document initialised successfully, so a PDF that
        # cannot be parsed does not leave a stray output file behind.
        with open(mubiaofile, "a", encoding="utf-8") as f:
            # Read the content page by page from the document object.
            for page in doc.get_pages():
                # Let the page interpreter process the page ...
                interpreter.process_page(page)
                # ... then fetch the result from the aggregator.
                layout = device.get_result()
                for out in layout:
                    # Only elements exposing get_text() carry text.
                    if not hasattr(out, "get_text"):
                        continue
                    try:
                        f.write(out.get_text() + "\n")
                    except UnicodeError:
                        # Best effort: skip text that cannot be encoded, but
                        # report it instead of silently dropping it.
                        traceback.print_exc()
    print("ok")
if __name__ == '__main__':
    # Placeholder values: replace the first with the path of the PDF to read
    # and the second with the name of the text file to append to.
    source_pdf, target_txt = "文件地址", '输出文件名'
    pdfread(source_pdf, target_txt)