import io
import os
import urllib.parse
import warnings

import pdfplumber
import requests
def read_pdf(path, proxies=None, timeout=(3.2, 10), download_image=False):
    """Read a PDF from a local path or an http(s) URL and return its text.

    Parameters
    ----------
    path : str
        Local file path (optionally a percent-encoded ``file:///`` URL)
        or an http(s) URL.
    proxies : dict | None
        Optional proxy mapping passed straight through to ``requests.get``.
        ``None`` (the default) means no proxies — same effect as the old
        ``{}`` default, without the shared-mutable-default pitfall.
    timeout : tuple
        (connect, read) timeout for the HTTP request.
    download_image : bool
        When True, also dump each page's embedded images as numbered
        ``.png`` files into a directory derived from ``path``.

    Returns
    -------
    str | None
        Concatenated text of all pages, or ``None`` when the path is
        invalid or the PDF could not be opened.
    """
    # Reject bad input up front instead of crashing further down
    # (the original printed a warning but kept going with bad state).
    if not isinstance(path, str) or path == '':
        print("路径为空或格式不对!")
        return None

    if path[:4] == "http":
        try:
            resp = requests.get(url=path, timeout=timeout, proxies=proxies)
            f = io.BytesIO(resp.content)
        except Exception as e:
            print(e, "打开链接失败")
            return None
    else:
        try:
            # Accept percent-encoded file:/// URLs and normalize to a
            # Windows-style local path (preserves original behavior).
            local = urllib.parse.unquote(path)
            local = local.replace('file:///', '').replace('/', '\\')
            f = open(local, 'rb')
        except Exception as e:
            print(e, "打开本地文件失败")
            return None  # original fell through with f='' and crashed later

    text = ''
    old_cwd = os.getcwd()
    try:
        if download_image:
            # NOTE(review): for a local path this makedirs() the PDF path
            # itself, which fails if the file exists — preserved from the
            # original; confirm intended only for URL inputs.
            im_dir = path.replace('https://', '').replace('http://', '')
            os.makedirs(im_dir, exist_ok=True)
            os.chdir(im_dir)
        # Single counter across all pages: the original reset it per page,
        # so images from later pages overwrote earlier pages' files.
        img_index = 0
        with pdfplumber.open(f) as pdf:
            for page in pdf.pages:
                # extract_text() returns None for image-only pages.
                text += page.extract_text() or ''
                if download_image:
                    for img in page.images:
                        with open('{}.png'.format(img_index), 'wb') as f_img:
                            f_img.write(img['stream'].get_data())
                        img_index += 1
    finally:
        # Always restore the working directory and release the source handle,
        # even when extraction raises.
        os.chdir(old_cwd)
        f.close()
    return text
# Demo: read a local PDF (or a URL PDF) and print its text; images can be
# extracted to a folder via read_pdf's download_image flag.
# Guarded so that importing this module does not trigger file/network I/O.
if __name__ == "__main__":
    path = "E:\\1.pdf"
    # path = "http://aa.com/1.pdf"
    print(read_pdf(path))