历史原因
去官网打印过题的同学都知道,每次下载别的年份的题都会刷新到主页,需要多次点击才能下载另一年的,打开的pdf界面也是在搜索页直接加载的。最关键的是它那个服务器,经常出问题,nginx总是返回403,所以先爬下来保存吧。
上图!
py代码,改一下48行附近的3位科目代码(代码里的 '820'),运行一下,ok
上代码
import requests
from bs4 import BeautifulSoup
import os
import time
import random,re
index = 'http://gsmis.nuaa.edu.cn/zsgl' # Site root address; prefixed to each paper path to build the download URL
interval = 10 # Intended delay between fetches; NOTE(review): not referenced by the code shown here
firstDir = r'G:\存储文件\nanjing\pdf' # Root output directory for the downloaded PDFs
classificationDict = {} # Meant to hold info about the site's category sub-pages; NOTE(review): not referenced by the code shown here
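# Form fields POSTed to the search page; the links in the response are then picked out with a CSS selector (see screen)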
data = {
'__EVENTTARGET':'',
'__EVENTARGUMENT':'',
'__LASTFOCUS':'',
'__VIEWSTATE':'/wEPDwULLTIxMDIxOTcxNzYPZBYCAgEPZBYGAgEPEA8WBh4NRGF0YVRleHRGaWVsZAUCbmQeDkRhdGFWYWx1ZUZpZWxkBQJuZB4LXyFEYXRhQm91bmRnZBAVBwQyMDE4BDIwMTcEMjAxNgQyMDE1BDIwMTQEMjAxMwQyMDEyFQcEMjAxOAQyMDE3BDIwMTYEMjAxNQQyMDE0BDIwMTMEMjAxMhQrAwdnZ2dnZ2dnFgECAWQCBQ8PFgIeBFRleHQFSOWNl+S6rOiIquepuuiIquWkqeWkp+WtpuW+gOW5tOWNmuWjq+OAgeehleWjq+eglOeptueUn+WIneivleivlemimOS4i+i9vWRkAgcPPCsACQEADxYEHghEYXRhS2V5cxYAHgtfIUl0ZW1Db3VudAJJZBaSAWYPZBYCZg8VAhJtczEwMjg3MjAxNzIxMS5wZGYVMjEx57+76K+R56GV5aOr6Iux6K+tZAIBD2QWAmYPFQISbXMxMDI4NzIwMTcyMTMucGRmFTIxM+e/u+ivkeehleWjq+aXpeivrWQCAg9kFgJmDxUCEm1zMTAyODcyMDE3MjE0LnBkZhUyMTTnv7vor5HnoZXlo6vms5Xor61kAgMPZBYCZg8VAhJtczEwMjg3MjAxNzI0Mi5wZGYJMjQy5rOV6K+tZAIED2QWAmYPFQISbXMxMDI4NzIwMTcyNDQucGRmFTI0NOaXpeivre+8iOWtpuehle+8iWQCBQ9kFgJmDxUCEm1zMTAyODcyMDE3MjQ2LnBkZgkyNDboi7Hor61kAgYPZBYCZg8VAhJtczEwMjg3MjAxNzMzMy5wZGYPMzMz5pWZ6IKy57u85ZCIZAIHD2QWAmYPFQISbXMxMDI4NzIwMTczNTcucGRmFTM1N+iLseivree/u+ivkeWfuuehgGQCCA9kFgJmDxUCEm1zMTAyODcyMDE3MzU5LnBkZhUzNTnml6Xor63nv7vor5Hln7rnoYBkAgkPZBYCZg8VAhJtczEwMjg3MjAxNzQzMS5wZGYSNDMx6YeR6J6N5a2m57u85ZCIZAIKD2QWAmYPFQISbXMxMDI4NzIwMTc0NDgucGRmHjQ0OOaxieivreWGmeS9nOS4jueZvuenkeefpeivhmQCCw9kFgJmDxUCEm1zMTAyODcyMDE3NjAxLnBkZg82MDHmlbDlrabliIbmnpBkAgwPZBYCZg8VAhJtczEwMjg3MjAxNzYxNy5wZGYSNjE36K6+6K6h5a2m5qaC6K66ZAIND2QWAmYPFQISbXMxMDI4NzIwMTc2MTgucGRmDzYxOOmHj+WtkOWKm+WtpmQCDg9kFgJmDxUCEm1zMTAyODcyMDE3NjE5LnBkZiQ2MTnpqazlhYvmgJ3kuLvkuYnln7rmnKzljp/nkIbmpoLorrpkAg8PZBYCZg8VAhJtczEwMjg3MjAxNzYyMC5wZGYPNjIw5Z+656GA6Iux6K+tZAIQD2QWAmYPFQISbXMxMDI4NzIwMTc2MjEucGRmEjYyMeiJuuacr+WtpuamguiuumQCEQ9kFgJmDxUCEm1zMTAyODcyMDE3NjIyLnBkZg82MjLnvo7mnK/mpoLorrpkAhIPZBYCZg8VAhJtczEwMjg3MjAxNzYyMy5wZGYSNjIz56S+5Lya5a2m5Y6f55CGZAITD2QWAmYPFQISbXMxMDI4NzIwMTc2MjQucGRmDzYyNOWfuuehgOaXpeivrWQCFA9kFgJmDxUCEm1zMTAyODcyMDE3NjI1LnBkZg82MjXlhazlhbHnrqHnkIZkAhUPZBYCZg8VAhJtczEwMjg3MjAxNzYyNi5wZGYPNjI25pyJ5py65YyW5a2mZAIWD2QWAmYPFQISbXMxMDI4NzIwMTc2MjcucGRmEjYyN+aUv+ayu+WtpuWOn+eQhmQCFw9kFgJmDxUCEm1zMTAyODcyMDE3NjI5LnBkZgw2Mjnms5XnkIblraZkAhgPZ
BYCZg8VAhJtczEwMjg3MjAxNzYzMC5wZGYPNjMw54mp55CG5YyW5a2mZAIZD2QWAmYPFQISbXMxMDI4NzIwMTc2MzEucGRmITYzMeaVmeiCsuWtpuW/g+eQhuWtpuWfuuehgOe7vOWQiGQCGg9kFgJmDxUCEm1zMTAyODcyMDE3NjM3LnBkZhs2Mzfkv6Hmga/otYTmupDnrqHnkIbln7rnoYBkAhsPZBYCZg8VAhJtczEwMjg3MjAxNzgxMS5wZGYPODEx5pmu6YCa54mp55CGZAIcD2QWAmYPFQISbXMxMDI4NzIwMTc4MTMucGRmDzgxM+aXoOacuuWMluWtpmQCHQ9kFgJmDxUCEm1zMTAyODcyMDE3ODE0LnBkZg84MTTpq5jnrYnku6PmlbBkAh4PZBYCZg8VAhJtczEwMjg3MjAxNzgxNS5wZGYPODE155CG6K665Yqb5a2mZAIfD2QWAmYPFQISbXMxMDI4NzIwMTc4MTYucGRmDzgxNuadkOaWmeWKm+WtpmQCIA9kFgJmDxUCEm1zMTAyODcyMDE3ODE3LnBkZhI4MTflt6XnqIvng63lipvlraZkAiEPZBYCZg8VAhJtczEwMjg3MjAxNzgxOC5wZGYVODE45p2Q5paZ56eR5a2m5Z+656GAZAIiD2QWAmYPFQISbXMxMDI4NzIwMTc4MTkucGRmCTgxOeeUtei3r2QCIw9kFgJmDxUCEm1zMTAyODcyMDE3ODIwLnBkZhU4MjDoh6rliqjmjqfliLbljp/nkIZkAiQPZBYCZg8VAhJtczEwMjg3MjAxNzgyMS5wZGYkODIx5L+h5Y+357O757uf5LiO5pWw5a2X5L+h5Y+35aSE55CGZAIlD2QWAmYPFQISbXMxMDI4NzIwMTc4MjMucGRmEjgyM+eUteW3peeUteWtkOWtpmQCJg9kFgJmDxUCEm1zMTAyODcyMDE3ODI0LnBkZgw4MjTov5DnrbnlraZkAicPZBYCZg8VAhJtczEwMjg3MjAxNzgyNi5wZGYSODI25bel56iL57uP5rWO5a2mZAIoD2QWAmYPFQISbXMxMDI4NzIwMTc4MjcucGRmDDgyN+e7j+a1juWtpmQCKQ9kFgJmDxUCEm1zMTAyODcyMDE3ODI5LnBkZhg4MjnorqHnrpfmnLrkuJPkuJrln7rnoYBkAioPZBYCZg8VAhJtczEwMjg3MjAxNzgzMC5wZGYSODMw6YeR5bGe5p2Q5paZ5a2mZAIrD2QWAmYPFQISbXMxMDI4NzIwMTc4MzEucGRmGzgzMeW3peeoi+e7k+aehOiuvuiuoeWOn+eQhmQCLA9kFgJmDxUCEm1zMTAyODcyMDE3ODMyLnBkZg84MzLkuqflk4Horr7orqFkAi0PZBYCZg8VAhJtczEwMjg3MjAxNzgzNC5wZGYbODM05pWw5o2u5bqT5Y6f55CG5Y+K5bqU55SoZAIuD2QWAmYPFQISbXMxMDI4NzIwMTc4MzYucGRmEjgzNueuoeeQhuWtpuWOn+eQhmQCLw9kFgJmDxUCEm1zMTAyODcyMDE3ODM4LnBkZg84Mzjkv6Hmga/mo4DntKJkAjAPZBYCZg8VAhJtczEwMjg3MjAxNzg0MC5wZGYPODQw6YOo6Zeo5rOV5a2mZAIxD2QWAmYPFQISbXMxMDI4NzIwMTc4NDIucGRmHjg0Mue/u+ivkeS4juWGmeS9nO+8iOiLseivre+8iWQCMg9kFgJmDxUCEm1zMTAyODcyMDE3ODQzLnBkZh44NDPnvo7mnK/nkIborrrkuI7kuJPkuJrmioDms5VkAjMPZBYCZg8VAhJtczEwMjg3MjAxNzg0NS5wZGYeODQ157+76K+R5LiO5YaZ5L2c77yI5pel6K+t77yJZAI0D2QWAmYPFQISbXMxMDI4NzIwMTc4NTAucGRmFTg1MOekvuS8mueglOeptuaWueazlWQCNQ9kFgJmDxUCEm1zMTAyODcyM
DE3ODUyLnBkZiQ4NTLpn7PkuZDoiJ7ouYjlj7LorrrkuI7kvZzlk4HliIbmnpBkAjYPZBYCZg8VAhJtczEwMjg3MjAxNzg1My5wZGYPODUz5LiT5Lia5oqA5rOVZAI3D2QWAmYPFQISbXMxMDI4NzIwMTc4NjAucGRmFTg2MOmBk+i3r+W3peeoi+adkOaWmWQCOA9kFgJmDxUCEm1zMTAyODcyMDE3ODYxLnBkZhg4NjHopb/mlrnmlL/msrvmgJ3mg7Plj7JkAjkPZBYCZg8VAhJtczEwMjg3MjAxNzg2My5wZGYSODYz5YWs5YWx566h55CG5a2mZAI6D2QWAmYPFQISbXMxMDI4NzIwMTc4NjYucGRmGzg2NuW5v+aSreeUteinhuiJuuacr+eQhuiuumQCOw9kFgJmDxUCEm1zMTAyODcyMDE3ODY3LnBkZhU4NjfmiI/liaflj7Lorrrnu7zlkIhkAjwPZBYCZg8VAhJtczEwMjg3MjAxNzg2OC5wZGYYODY455S156a76L6Q5bCE5o6i5rWL5a2mZAI9D2QWAmYPFQISbXMxMDI4NzIwMTc4NzQucGRmHjg3NOaAneaDs+aUv+ayu+aVmeiCsuWtpuWOn+eQhmQCPg9kFgJmDxUCEm1zMTAyODcyMDE3ODc2LnBkZhU4NzbmoLjovpDlsITniannkIblraZkAj8PZBYCZg8VAhJtczEwMjg3MjAxNzg3Ny5wZGYVODc355Sf54mp5Yy75a2m5YyW5a2mZAJAD2QWAmYPFQISbXMxMDI4NzIwMTc4NzgucGRmITg3OOaVsOWtl+eUtei3r+WSjOS/oeWPt+S4juezu+e7n2QCQQ9kFgJmDxUCEm1zMTAyODcyMDE3OTE2LnBkZh05MTbmnZDmlpnlipvlraYo5LiT5Lia5a2m5L2NKWQCQg9kFgJmDxUCEm1zMTAyODcyMDE3OTE3LnBkZiA5MTflt6XnqIvng63lipvlraYo5LiT5Lia5a2m5L2NKWQCQw9kFgJmDxUCEm1zMTAyODcyMDE3OTE5LnBkZhc5MTnnlLXot68o5LiT5Lia5a2m5L2NKWQCRA9kFgJmDxUCEm1zMTAyODcyMDE3OTIwLnBkZiM5MjDoh6rliqjmjqfliLbljp/nkIYo5LiT5Lia5a2m5L2NKWQCRQ9kFgJmDxUCEm1zMTAyODcyMDE3OTIyLnBkZiw5MjLmlbDmja7nu5PmnoTkuI7mk43kvZzns7vnu58o5LiT5Lia5a2m5L2NKWQCRg9kFgJmDxUCEm1zMTAyODcyMDE3OTM2LnBkZiA5MzbnrqHnkIblrabljp/nkIYo5LiT5Lia5a2m5L2NKWQCRw9kFgJmDxUCEm1zMTAyODcyMDE3OTM3LnBkZhI5MzfmlZnogrLnrqHnkIblraZkAkgPZBYCZg8VAhJtczEwMjg3MjAxNzk3OC5wZGYdOTc45pWw5a2X55S16LevKOS4k+S4muWtpuS9jSlkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBQZJbWFnZTOGRmO8JF+yBmXX8s2DXKQaTU+SSw==',
'__EVENTVALIDATION':'/wEWCgL5uOy1DwLQ0pPeBAKTxZioCwKTxczACQKTxeCnAQKTxZSLCgKTxYjuAwKTxbzVBAKTxdC4DALf2fKGA7hpA7V7GdWD3vDqtJd9XRg7gqSz',
'drpnd':2015,
'Image3.x':32,
'Image3.y':14
}
# Years to query; get_urls sends each one as the 'drpnd' form field.
date = [2018,2017,2016,2015,2014,2013,2012,2011]
def screen(url, data, select, timeout=30):
    """POST *data* to *url* and return the elements matching a CSS selector.

    Args:
        url: Address of the page to query.
        data: Form fields sent as the POST body.
        select: CSS selector applied to the parsed response.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The result of ``BeautifulSoup.select`` on the response body (empty
        when nothing matches).

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    # The session cookie is a fixed value, not a randomly chosen header.
    # NOTE(review): presumably captured from a browser session and liable to
    # expire -- refresh it if the server stops returning results.
    headers = {'Cookie': 'ASP.NET_SessionId=roe4n35500abyp45nfqb0f55'}
    response = requests.post(url=url, data=data, headers=headers, timeout=timeout)
    response.encoding = 'gbk'  # decode the body as GBK before reading .text
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select(select)
# sc = screen('http://gsmis.nuaa.edu.cn/zsgl/zsmlgl/sjcx_ss.aspx','table#DataList1 tr td a')
# print(sc)
# Absolute URLs of the matching papers; get_urls() appends to this list.
hrefs = []


def _paper_url(href, course_code):
    """Return the absolute PDF URL for *href* if it is the paper for *course_code*, else None."""
    if not href:
        return None
    # The indexing below implies links shaped like '../<dir>/<file>.pdf':
    # splitting on '.' leaves the extension-less path '/<dir>/<file>' at index 2.
    dotted = href.split(".")
    if len(dotted) < 3:
        return None
    path = dotted[2]
    segments = path.split("/")
    if len(segments) < 3:
        return None
    # The course code is whatever follows the first 11 characters of the file name.
    if segments[2][11:] != course_code:
        return None
    return index + path + '.pdf'


def get_urls(date, course_code='820'):
    """Collect the download URLs of one course's papers for the given years.

    For every year the search form is posted and each returned link is
    checked; links whose file name ends in *course_code* are appended to the
    module-level ``hrefs`` list. Links with no ``href`` or an unexpected
    shape are skipped instead of raising.

    Args:
        date: Iterable of years to query (sent as the 'drpnd' form field).
        course_code: Three-digit course code to keep. Defaults to '820'.

    Returns:
        The module-level ``hrefs`` list, which is also printed.
    """
    page = 'http://gsmis.nuaa.edu.cn/zsgl/zsmlgl/sjcx_ss.aspx'
    course_code = str(course_code)
    for year in date:
        # Post a per-request copy so the shared form template is left untouched.
        payload = dict(data, drpnd=year)
        for link in screen(page, payload, 'table#DataList1 tr td a'):
            url = _paper_url(link.get('href'), course_code)
            if url is not None:
                hrefs.append(url)
        time.sleep(5)  # pause between years to go easy on the server
    print(hrefs)
    return hrefs
def dowload(url, name, timeout=30):
    """Fetch *url* and save the response body as ``<firstDir>/<name>.pdf``.

    The output directory is created up front, so the first file is no longer
    lost when the directory does not exist yet. A non-200 answer is reported
    and skipped rather than being stored as a bogus PDF.

    Args:
        url: Address of the PDF to download.
        name: File name without extension (converted with ``str``).
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: on connection failure or timeout.
    """
    # Use forward slashes for both the directory and the file path.
    target_dir = firstDir.replace('\\', '/')
    filename = target_dir + '/' + str(name) + '.pdf'

    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        print('skip %s: HTTP %s' % (url, r.status_code))
        return

    try:
        os.makedirs(target_dir, exist_ok=True)
        with open(filename, 'wb') as f:
            f.write(r.content)
    except OSError as e:
        # Best effort: report the failure and let the caller move on.
        print(e)
# Collect the paper URLs, then download them one at a time.
get_urls(date)
for num, href in enumerate(hrefs):
    time.sleep(3)  # pause between downloads
    print(num)
    # Name each file after the year found in its own URL instead of pairing
    # hrefs[i] with date[i]: that pairing shifts every name as soon as one
    # year yields no match. Assumes file names follow
    # 'ms10287<year><code>.pdf', i.e. the year is characters 7-10.
    year = href.rsplit('/', 1)[-1][7:11]
    if not year.isdigit():
        # Unexpected file name: fall back to the positional pairing.
        year = date[num] if num < len(date) else num
    dowload(href, year)