python 读取在线pdf文档_python3 在线读取pdf

#!/usr/bin/python

# -*- coding: utf-8 -*-

"""

@Time : 2018/11/19

@Author : cxf

@Software: PyCharm

@desc : requests pdf

"""

import random

import urllib.request

from io import StringIO

from pdfminer.pdfinterp import PDFResourceManager, process_pdf

from pdfminer.converter import TextConverter

from pdfminer.layout import LAParams

class RTP(object):

def __init__(self):

self.user_agent = ['Mozilla/5.0 (Windows NT 10.0; WOW64)', 'Mozilla/5.0 (Windows NT 6.3; WOW64)',

'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0',

'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',

'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',

'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.95 Safari/537.36',

'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; rv:11.0) like Gecko)',

'Mozilla/5.0 (Windows; U; Windows NT 5.2) Gecko/2008070208 Firefox/3.0.1',

'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070309 Firefox/2.0.0.3',

'Mozilla/5.0 (Windows; U; Windows NT 5.1) Gecko/20070803 Firefox/1.5.0.12',

'Opera/9.27 (Windows NT 5.2; U; zh-cn)',

'Mozilla/5.0 (Macintosh; PPC Mac OS X; U; en) Opera 8.0',

'Opera/8.0 (Macintosh; PPC Mac OS X; U; en)',

'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.12) Gecko/20080219 Firefox/2.0.0.12 Navigator/9.0.0.6',

'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Win64; x64; Trident/4.0)',

'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0)',

'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E)',

'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Maxthon/4.0.6.2000 Chrome/26.0.1410.43 Safari/537.1 ',

'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET4.0C; .NET4.0E; QQBrowser/7.3.9825.400)',

'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:21.0) Gecko/20100101 Firefox/21.0 ',

'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.92 Safari/537.1 LBBROWSER',

'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0; BIDUBrowser 2.x)',

'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/3.0 Safari/536.11']

def to_requests(self, url, proxy=False):

# 使用代理

proxy_support = urllib.request.ProxyHandler({'https': '127.0.0.1:1080'})

opener = urllib.request.build_opener(proxy_support)

headers = {'User-Agent': random.choice(self.user_agent)}

req = urllib.request.Request(url, headers=headers)

if proxy:

response = opener.open(req)

else:

response = urllib.request.urlopen(req)

return response

# 本地读取pdf

def to_pdf(self, pdf_file_path):

if pdf_file_path:

with open(pdf_file_path, 'rb') as f:

response = open(pdf_file_path, 'rb')

content = self.read_to_pdf(response)

return content

else:

return None

def read_to_pdf(self, pdf_file):

if pdf_file:

rsrcmgr = PDFResourceManager()

retstr = StringIO()

laparams = LAParams()

device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr, laparams=laparams)

process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)

device.close()

content = retstr.getvalue()

retstr.close()

return content

else:

return None

if __name__ == '__main__':

mains = RTP()

# 在线读取pdf

# url = "https://www.osc.state.ny.us/localgov/pubs/townoff/ito.pdf"

# url = "https://www.azasrs.gov/sites/default/files/2015Q2%20YOUR%20RETIREMENT%20NEWSLETTER%20FINAL.pdf"

url = "http://www.gxepb.gov.cn/zdxxgk/1504213198958806.pdf"

content = mains.to_requests(url) # 如果要使用代理 传入 proxy=True

result = mains.read_to_pdf(content)

# 本地读取pdf

# path = './2015.pdf'

# result = mains.to_pdf(path)

print(result)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值