# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import re
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from cStringIO import StringIO
# On Python 3, replace the cStringIO import above with: from io import StringIO
from io import open
from pdfminer.pdfpage import PDFPage
def pdf_txt(url):
    """Download the PDF at *url* and return its extracted text.

    The document is fetched with ``requests``, every page is run through a
    pdfminer ``TextConverter`` (UTF-8 codec, default ``LAParams``), and the
    contents accumulated in the output buffer are returned.

    Args:
        url: HTTP(S) address of the PDF document.

    Returns:
        The value of the module-level ``StringIO`` buffer after all pages
        have been processed.

    Raises:
        requests.HTTPError: if the server answers with an error status, so
            an HTML error page is never handed to the PDF parser.
    """
    # The PDF payload is binary; BytesIO accepts it on both Python 2 and 3,
    # whereas a text-mode StringIO rejects bytes on Python 3.
    from io import BytesIO

    response = requests.get(url)
    response.raise_for_status()

    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        fp = BytesIO(response.content)
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page set and maxpages=0 are passed unchanged from the
            # original call; presumably they select every page -- confirm
            # against the pdfminer documentation.
            pages = PDFPage.get_pages(fp,
                                      set(),
                                      maxpages=0,
                                      password="",
                                      caching=True,
                                      check_extractable=True)
            for page in pages:
                interpreter.process_page(page)
        finally:
            # Release the input stream and the converter even when parsing
            # fails part-way through.
            fp.close()
            device.close()
        # Read the buffer only after the device is closed (same order as
        # before) and before the buffer itself is closed.
        return retstr.getvalue()
    finally:
        retstr.close()
# Example run: extract the text of a sample PDF and print it.
txt = pdf_txt('http://pythonscraping.com/pages/warandpeace/chapter1.pdf')
# Parenthesised single-argument form prints the same on Python 2 and 3.
print(txt)
# If the PDF contains Chinese text, write the result to a file instead:
#open('pdf.txt','wb').write(txt)
# Source article title: "Converting a PDF to text with Python"
# Web-page residue: "Latest recommended article published 2024-06-03 18:16:38"