说明
支持原创,支持开放源码!
声明
本文为原创文章,转载请注明出处并附上本文链接:本文链接
环境
本代码的运行环境:Anaconda2(Python 2.7)、Windows 10,编辑工具为 PyCharm。
#代码
# encoding:utf-8
from bs4 import BeautifulSoup
import urllib
import re
import urlparse
import requests
import time
class HtmlParder_pdf(object):
    """Crawl paginated listing pages and download the PDF files they link to.

    Written for Python 2 (``urllib.urlopen``, ``urllib.urlretrieve``,
    ``urlparse``).  Methods that resolve relative links accept an optional
    ``base_url``; when it is omitted they fall back to the module-level
    ``root_url`` global, which is how the script section drives this class.
    """

    # href pattern of the downloadable files on a listing page.
    _FILE_HREF = re.compile(r"/data/.")
    # href pattern of the "next page" anchor.
    _NEXT_HREF = re.compile(r'/website/xxzx/gkxxpl/gsjbxx/grbxtk/rsbx/.')

    def __init__(self):
        # Absolute URLs that are still waiting to be downloaded.
        self.url_list = set()
        # Bare file names (last URL path segment) gathered by add_filenames().
        self.filenames = set()

    def _absolute(self, href, base_url=None):
        """Join *href* onto *base_url* (or the global ``root_url`` if None)."""
        base = root_url if base_url is None else base_url
        return urlparse.urljoin(base, href)

    def parser(self, url):
        """Fetch *url* and return its content parsed as a BeautifulSoup tree."""
        response = urllib.urlopen(url)
        try:
            html_cont = response.read()
        finally:
            # try/finally instead of ``with`` so the handle is released even
            # where the urlopen result does not support the context protocol.
            response.close()
        return BeautifulSoup(html_cont, "html.parser", from_encoding='utf-8')

    def find_file_in_one_page(self, soup):
        """Return every ``<a>`` tag in *soup* whose href matches ``/data/.``."""
        return soup.find_all('a', href=self._FILE_HREF)

    def get_next_page(self, soup, base_url=None):
        """Return the absolute URL of the next listing page.

        Looks for ``<a id="next">`` whose href lies under the listing path.

        Args:
            soup: parsed listing page.
            base_url: URL used to resolve a relative href; defaults to the
                module-level ``root_url``.

        Returns:
            The absolute URL, or ``None`` when the page has no such anchor
            (``hasnext_page`` treats ``None`` as "no more pages").
        """
        next_anchor = soup.find('a', id="next", href=self._NEXT_HREF)
        if next_anchor is None:
            # Previously this fell through to ``None['href']`` -> TypeError.
            return None
        return self._absolute(next_anchor['href'], base_url)

    def add_filenames(self, pdf_url_in_One, base_url=None):
        """Record the file name (last path segment) of every link given.

        The names are only collected in ``self.filenames`` (e.g. for writing
        a list to a file later); ``download`` does not consult this set.

        Args:
            pdf_url_in_One: iterable of tags supporting ``tag['href']``.
            base_url: URL used to resolve relative hrefs; defaults to the
                module-level ``root_url``.
        """
        for link in pdf_url_in_One:
            absolute = self._absolute(link['href'], base_url)
            self.filenames.add(absolute.split("/")[-1])

    def add_url_list(self, url, base_url=None):
        """Queue the absolute URL of every link given for download.

        Args:
            url: iterable of tags supporting ``tag['href']``.
            base_url: URL used to resolve relative hrefs; defaults to the
                module-level ``root_url``.
        """
        for link in url:
            self.url_list.add(self._absolute(link['href'], base_url))

    def download(self):
        """Pop one queued URL and, if it names a ``.pdf`` file, save it.

        The scraped links can contain Chinese characters, and downloading
        them directly failed.  The workaround (found with a packet-capture
        tool) is to issue a GET through ``requests`` and read back the URL
        of the request that was actually sent, then download from that URL.
        The file is written to the current directory under the original
        (decoded) file name.

        Raises:
            KeyError: if the download queue is empty.
        """
        url = self.url_list.pop().encode("utf-8")
        filename = url.split("/")[-1].decode("utf-8")
        if filename.split(".")[-1] != "pdf":
            # Not a PDF: nothing to fetch, so skip the network round trip.
            return
        # stream=True: only the final request URL is needed here, the body
        # itself is fetched by urlretrieve below.
        response = requests.request("GET", url, stream=True)
        try:
            urllib.urlretrieve(response.url, filename)
        finally:
            # Release the connection even when the download fails.
            response.close()

    def hasnext_page(self, url):
        """Return True when *url* is not None, i.e. there is a page to crawl."""
        return url is not None
if __name__ == '__main__':
    # First listing page; crawling starts here.
    # NOTE: root_url is deliberately kept as a module-level global because
    # the HtmlParder_pdf methods read it to resolve relative links.
    root_url = "http://www.aviva-cofco.com.cn/website/xxzx/gkxxpl/gsjbxx/grbxtk/rsbx/list-1.shtml"
    count = 1  # 1-based number of the listing page being crawled
    obj = HtmlParder_pdf()
    while obj.hasnext_page(root_url):
        soup = obj.parser(root_url)
        file_list = obj.find_file_in_one_page(soup)
        obj.add_url_list(file_list)
        obj.add_filenames(file_list)
        # download() pops exactly one URL per call, so drain the queue.
        downloaded = 0
        while obj.url_list:
            obj.download()
            downloaded += 1
            time.sleep(2)  # pause between downloads to go easy on the server
            print('Crawing page %s url %s' % (count, downloaded))
        next_url = obj.get_next_page(soup)
        if next_url == root_url:
            # The "next" link points back at the current page: last page.
            break
        root_url = next_url
        count = count + 1
    # NOTE(review): indentation was lost in the original paste; this final
    # print is assumed to sit after the loop -- confirm.
    print(count)
这段代码经过了部分封装,并在测试后修复了若干 bug。起因是帮一个 Python 交流群的群友查看代码,结果发现原代码比较混乱,只能看懂大概的用途;代码主体中解决问题的地方附有简单备注。最终下载的文件是 *.pdf。
如有错误之处,请不吝赐教~