爬南航研究生院题库

历史原因

去官网打印过题的同学都知道,每次下载别的年份的题都会刷新回主页,需要多次点击才能下载另一年的,打开的 pdf 界面也是在搜索页直接加载的。最关键的是它那个服务器经常出问题,nginx 总是返回 403,所以先爬下来保存吧。

上图!

在这里插入图片描述

Python 代码:把第 48 行的三位科目代码('820')改成你要下载的科目,然后直接运行即可。

上代码

import requests
from bs4 import BeautifulSoup
import os
import time
import random,re


# Root address of the site; the relative PDF paths found on the page are appended to it.
index = "http://gsmis.nuaa.edu.cn/zsgl"
# Crawl interval carried over from the original note ("interval between image fetches");
# not referenced by the code visible here.
interval = 10
# Directory the downloaded PDFs are written into.
firstDir = r"G:\存储文件\nanjing\pdf"
# Intended to hold information about the site's category sub-pages;
# not referenced by the code visible here.
classificationDict = dict()
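# Form fields POSTed to the paper-search page; the page items are later
# picked out of the response with a CSS selector (see screen()).
# 'drpnd' is overwritten by get_urls() before every request.
# NOTE(review): '__VIEWSTATE' / '__EVENTVALIDATION' look like tokens copied
# from a browser session - confirm they are still accepted by the server.
# The '__VIEWSTATE' value must be one unbroken string literal; if it is
# wrapped across several physical lines in this copy, rejoin it before running.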
data = {
    '__EVENTTARGET':'',
    '__EVENTARGUMENT':'',
    '__LASTFOCUS':'',
    '__VIEWSTATE':'/wEPDwULLTIxMDIxOTcxNzYPZBYCAgEPZBYGAgEPEA8WBh4NRGF0YVRleHRGaWVsZAUCbmQeDkRhdGFWYWx1ZUZpZWxkBQJuZB4LXyFEYXRhQm91bmRnZBAVBwQyMDE4BDIwMTcEMjAxNgQyMDE1BDIwMTQEMjAxMwQyMDEyFQcEMjAxOAQyMDE3BDIwMTYEMjAxNQQyMDE0BDIwMTMEMjAxMhQrAwdnZ2dnZ2dnFgECAWQCBQ8PFgIeBFRleHQFSOWNl+S6rOiIquepuuiIquWkqeWkp+WtpuW+gOW5tOWNmuWjq+OAgeehleWjq+eglOeptueUn+WIneivleivlemimOS4i+i9vWRkAgcPPCsACQEADxYEHghEYXRhS2V5cxYAHgtfIUl0ZW1Db3VudAJJZBaSAWYPZBYCZg8VAhJtczEwMjg3MjAxNzIxMS5wZGYVMjEx57+76K+R56GV5aOr6Iux6K+tZAIBD2QWAmYPFQISbXMxMDI4NzIwMTcyMTMucGRmFTIxM+e/u+ivkeehleWjq+aXpeivrWQCAg9kFgJmDxUCEm1zMTAyODcyMDE3MjE0LnBkZhUyMTTnv7vor5HnoZXlo6vms5Xor61kAgMPZBYCZg8VAhJtczEwMjg3MjAxNzI0Mi5wZGYJMjQy5rOV6K+tZAIED2QWAmYPFQISbXMxMDI4NzIwMTcyNDQucGRmFTI0NOaXpeivre+8iOWtpuehle+8iWQCBQ9kFgJmDxUCEm1zMTAyODcyMDE3MjQ2LnBkZgkyNDboi7Hor61kAgYPZBYCZg8VAhJtczEwMjg3MjAxNzMzMy5wZGYPMzMz5pWZ6IKy57u85ZCIZAIHD2QWAmYPFQISbXMxMDI4NzIwMTczNTcucGRmFTM1N+iLseivree/u+ivkeWfuuehgGQCCA9kFgJmDxUCEm1zMTAyODcyMDE3MzU5LnBkZhUzNTnml6Xor63nv7vor5Hln7rnoYBkAgkPZBYCZg8VAhJtczEwMjg3MjAxNzQzMS5wZGYSNDMx6YeR6J6N5a2m57u85ZCIZAIKD2QWAmYPFQISbXMxMDI4NzIwMTc0NDgucGRmHjQ0OOaxieivreWGmeS9nOS4jueZvuenkeefpeivhmQCCw9kFgJmDxUCEm1zMTAyODcyMDE3NjAxLnBkZg82MDHmlbDlrabliIbmnpBkAgwPZBYCZg8VAhJtczEwMjg3MjAxNzYxNy5wZGYSNjE36K6+6K6h5a2m5qaC6K66ZAIND2QWAmYPFQISbXMxMDI4NzIwMTc2MTgucGRmDzYxOOmHj+WtkOWKm+WtpmQCDg9kFgJmDxUCEm1zMTAyODcyMDE3NjE5LnBkZiQ2MTnpqazlhYvmgJ3kuLvkuYnln7rmnKzljp/nkIbmpoLorrpkAg8PZBYCZg8VAhJtczEwMjg3MjAxNzYyMC5wZGYPNjIw5Z+656GA6Iux6K+tZAIQD2QWAmYPFQISbXMxMDI4NzIwMTc2MjEucGRmEjYyMeiJuuacr+WtpuamguiuumQCEQ9kFgJmDxUCEm1zMTAyODcyMDE3NjIyLnBkZg82MjLnvo7mnK/mpoLorrpkAhIPZBYCZg8VAhJtczEwMjg3MjAxNzYyMy5wZGYSNjIz56S+5Lya5a2m5Y6f55CGZAITD2QWAmYPFQISbXMxMDI4NzIwMTc2MjQucGRmDzYyNOWfuuehgOaXpeivrWQCFA9kFgJmDxUCEm1zMTAyODcyMDE3NjI1LnBkZg82MjXlhazlhbHnrqHnkIZkAhUPZBYCZg8VAhJtczEwMjg3MjAxNzYyNi5wZGYPNjI25pyJ5py65YyW5a2mZAIWD2QWAmYPFQISbXMxMDI4NzIwMTc2MjcucGRmEjYyN+aUv+ayu+WtpuWOn+eQhmQCFw9kFgJmDxUCEm1zMTAyODcyMDE3NjI5LnBkZgw2Mjnms5XnkIblraZkA
hgPZBYCZg8VAhJtczEwMjg3MjAxNzYzMC5wZGYPNjMw54mp55CG5YyW5a2mZAIZD2QWAmYPFQISbXMxMDI4NzIwMTc2MzEucGRmITYzMeaVmeiCsuWtpuW/g+eQhuWtpuWfuuehgOe7vOWQiGQCGg9kFgJmDxUCEm1zMTAyODcyMDE3NjM3LnBkZhs2Mzfkv6Hmga/otYTmupDnrqHnkIbln7rnoYBkAhsPZBYCZg8VAhJtczEwMjg3MjAxNzgxMS5wZGYPODEx5pmu6YCa54mp55CGZAIcD2QWAmYPFQISbXMxMDI4NzIwMTc4MTMucGRmDzgxM+aXoOacuuWMluWtpmQCHQ9kFgJmDxUCEm1zMTAyODcyMDE3ODE0LnBkZg84MTTpq5jnrYnku6PmlbBkAh4PZBYCZg8VAhJtczEwMjg3MjAxNzgxNS5wZGYPODE155CG6K665Yqb5a2mZAIfD2QWAmYPFQISbXMxMDI4NzIwMTc4MTYucGRmDzgxNuadkOaWmeWKm+WtpmQCIA9kFgJmDxUCEm1zMTAyODcyMDE3ODE3LnBkZhI4MTflt6XnqIvng63lipvlraZkAiEPZBYCZg8VAhJtczEwMjg3MjAxNzgxOC5wZGYVODE45p2Q5paZ56eR5a2m5Z+656GAZAIiD2QWAmYPFQISbXMxMDI4NzIwMTc4MTkucGRmCTgxOeeUtei3r2QCIw9kFgJmDxUCEm1zMTAyODcyMDE3ODIwLnBkZhU4MjDoh6rliqjmjqfliLbljp/nkIZkAiQPZBYCZg8VAhJtczEwMjg3MjAxNzgyMS5wZGYkODIx5L+h5Y+357O757uf5LiO5pWw5a2X5L+h5Y+35aSE55CGZAIlD2QWAmYPFQISbXMxMDI4NzIwMTc4MjMucGRmEjgyM+eUteW3peeUteWtkOWtpmQCJg9kFgJmDxUCEm1zMTAyODcyMDE3ODI0LnBkZgw4MjTov5DnrbnlraZkAicPZBYCZg8VAhJtczEwMjg3MjAxNzgyNi5wZGYSODI25bel56iL57uP5rWO5a2mZAIoD2QWAmYPFQISbXMxMDI4NzIwMTc4MjcucGRmDDgyN+e7j+a1juWtpmQCKQ9kFgJmDxUCEm1zMTAyODcyMDE3ODI5LnBkZhg4MjnorqHnrpfmnLrkuJPkuJrln7rnoYBkAioPZBYCZg8VAhJtczEwMjg3MjAxNzgzMC5wZGYSODMw6YeR5bGe5p2Q5paZ5a2mZAIrD2QWAmYPFQISbXMxMDI4NzIwMTc4MzEucGRmGzgzMeW3peeoi+e7k+aehOiuvuiuoeWOn+eQhmQCLA9kFgJmDxUCEm1zMTAyODcyMDE3ODMyLnBkZg84MzLkuqflk4Horr7orqFkAi0PZBYCZg8VAhJtczEwMjg3MjAxNzgzNC5wZGYbODM05pWw5o2u5bqT5Y6f55CG5Y+K5bqU55SoZAIuD2QWAmYPFQISbXMxMDI4NzIwMTc4MzYucGRmEjgzNueuoeeQhuWtpuWOn+eQhmQCLw9kFgJmDxUCEm1zMTAyODcyMDE3ODM4LnBkZg84Mzjkv6Hmga/mo4DntKJkAjAPZBYCZg8VAhJtczEwMjg3MjAxNzg0MC5wZGYPODQw6YOo6Zeo5rOV5a2mZAIxD2QWAmYPFQISbXMxMDI4NzIwMTc4NDIucGRmHjg0Mue/u+ivkeS4juWGmeS9nO+8iOiLseivre+8iWQCMg9kFgJmDxUCEm1zMTAyODcyMDE3ODQzLnBkZh44NDPnvo7mnK/nkIborrrkuI7kuJPkuJrmioDms5VkAjMPZBYCZg8VAhJtczEwMjg3MjAxNzg0NS5wZGYeODQ157+76K+R5LiO5YaZ5L2c77yI5pel6K+t77yJZAI0D2QWAmYPFQISbXMxMDI4NzIwMTc4NTAucGRmFTg1MOekvuS8mueglOeptuaWueazlWQCNQ9kFgJmDxUCEm1zMTAyO
DcyMDE3ODUyLnBkZiQ4NTLpn7PkuZDoiJ7ouYjlj7LorrrkuI7kvZzlk4HliIbmnpBkAjYPZBYCZg8VAhJtczEwMjg3MjAxNzg1My5wZGYPODUz5LiT5Lia5oqA5rOVZAI3D2QWAmYPFQISbXMxMDI4NzIwMTc4NjAucGRmFTg2MOmBk+i3r+W3peeoi+adkOaWmWQCOA9kFgJmDxUCEm1zMTAyODcyMDE3ODYxLnBkZhg4NjHopb/mlrnmlL/msrvmgJ3mg7Plj7JkAjkPZBYCZg8VAhJtczEwMjg3MjAxNzg2My5wZGYSODYz5YWs5YWx566h55CG5a2mZAI6D2QWAmYPFQISbXMxMDI4NzIwMTc4NjYucGRmGzg2NuW5v+aSreeUteinhuiJuuacr+eQhuiuumQCOw9kFgJmDxUCEm1zMTAyODcyMDE3ODY3LnBkZhU4NjfmiI/liaflj7Lorrrnu7zlkIhkAjwPZBYCZg8VAhJtczEwMjg3MjAxNzg2OC5wZGYYODY455S156a76L6Q5bCE5o6i5rWL5a2mZAI9D2QWAmYPFQISbXMxMDI4NzIwMTc4NzQucGRmHjg3NOaAneaDs+aUv+ayu+aVmeiCsuWtpuWOn+eQhmQCPg9kFgJmDxUCEm1zMTAyODcyMDE3ODc2LnBkZhU4NzbmoLjovpDlsITniannkIblraZkAj8PZBYCZg8VAhJtczEwMjg3MjAxNzg3Ny5wZGYVODc355Sf54mp5Yy75a2m5YyW5a2mZAJAD2QWAmYPFQISbXMxMDI4NzIwMTc4NzgucGRmITg3OOaVsOWtl+eUtei3r+WSjOS/oeWPt+S4juezu+e7n2QCQQ9kFgJmDxUCEm1zMTAyODcyMDE3OTE2LnBkZh05MTbmnZDmlpnlipvlraYo5LiT5Lia5a2m5L2NKWQCQg9kFgJmDxUCEm1zMTAyODcyMDE3OTE3LnBkZiA5MTflt6XnqIvng63lipvlraYo5LiT5Lia5a2m5L2NKWQCQw9kFgJmDxUCEm1zMTAyODcyMDE3OTE5LnBkZhc5MTnnlLXot68o5LiT5Lia5a2m5L2NKWQCRA9kFgJmDxUCEm1zMTAyODcyMDE3OTIwLnBkZiM5MjDoh6rliqjmjqfliLbljp/nkIYo5LiT5Lia5a2m5L2NKWQCRQ9kFgJmDxUCEm1zMTAyODcyMDE3OTIyLnBkZiw5MjLmlbDmja7nu5PmnoTkuI7mk43kvZzns7vnu58o5LiT5Lia5a2m5L2NKWQCRg9kFgJmDxUCEm1zMTAyODcyMDE3OTM2LnBkZiA5MzbnrqHnkIblrabljp/nkIYo5LiT5Lia5a2m5L2NKWQCRw9kFgJmDxUCEm1zMTAyODcyMDE3OTM3LnBkZhI5MzfmlZnogrLnrqHnkIblraZkAkgPZBYCZg8VAhJtczEwMjg3MjAxNzk3OC5wZGYdOTc45pWw5a2X55S16LevKOS4k+S4muWtpuS9jSlkGAEFHl9fQ29udHJvbHNSZXF1aXJlUG9zdEJhY2tLZXlfXxYBBQZJbWFnZTOGRmO8JF+yBmXX8s2DXKQaTU+SSw==',
    '__EVENTVALIDATION':'/wEWCgL5uOy1DwLQ0pPeBAKTxZioCwKTxczACQKTxeCnAQKTxZSLCgKTxYjuAwKTxbzVBAKTxdC4DALf2fKGA7hpA7V7GdWD3vDqtJd9XRg7gqSz',
    'drpnd':2015,
    'Image3.x':32,
    'Image3.y':14
}

# Exam years to query, newest first: 2018 down to 2011 inclusive.
date = list(range(2018, 2010, -1))

def screen(url, data, select,
           cookie='ASP.NET_SessionId=roe4n35500abyp45nfqb0f55', timeout=30):
    """POST a form to *url* and return the elements matching a CSS selector.

    Args:
        url: Address of the page to query.
        data: Form fields sent as the POST body.
        select: CSS selector applied to the returned page.
        cookie: Value of the ``Cookie`` request header. Defaults to the
            session id that used to be hard-coded in the call; pass a
            fresh one when that session is no longer valid.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl forever.

    Returns:
        The list of matching elements. Empty when nothing matches or when
        the server answers with a non-200 status.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when *timeout* expires.
    """
    response = requests.post(
        url=url,
        data=data,
        headers={'Cookie': cookie},
        timeout=timeout,
    )
    if response.status_code != 200:
        # An error page (e.g. 403) contains none of the expected markup;
        # report it instead of silently parsing it as a result page.
        print('request to %s failed with HTTP %s' % (url, response.status_code))
        return []
    response.encoding = 'gbk'  # decode the body as GBK before parsing
    soup = BeautifulSoup(response.text, 'lxml')
    return soup.select(select)


# sc = screen('http://gsmis.nuaa.edu.cn/zsgl/zsmlgl/sjcx_ss.aspx','table#DataList1 tr td a')
# print(sc)
# Accumulates the absolute PDF URLs found by get_urls().
hrefs = []


def _pdf_path_for_course(href, course_code):
    """Return the extension-less path inside *href* when it belongs to *course_code*, else None."""
    if not href:
        # Anchor without an href attribute: nothing to parse.
        return None
    # Presumably hrefs look like '../<dir>/<name>.pdf', so the third
    # dot-separated piece is '/<dir>/<name>' - verify against the live page.
    pieces = href.split(".")
    if len(pieces) < 3:
        return None
    path = pieces[2]
    segments = path.split("/")
    if len(segments) < 3:
        return None
    # Everything after the first 11 characters of the file name is the course code.
    if segments[2][11:] != course_code:
        return None
    return path


def get_urls(date, course_code='820'):
    """Collect the PDF links of one course for every year in *date*.

    For each year the search form is posted once, every link in the
    result table is inspected, and the links whose file name carries
    *course_code* are appended to the module-level ``hrefs`` list as
    absolute URLs. The accumulated list is printed at the end.

    Args:
        date: Iterable of years to query.
        course_code: Three-digit course code to keep (default ``'820'``).

    Returns:
        None. Results are delivered through ``hrefs``.
    """
    wanted = str(course_code)
    for year in date:
        # The shared form payload is reused; only the year field changes.
        data['drpnd'] = year
        anchors = screen(
            'http://gsmis.nuaa.edu.cn/zsgl/zsmlgl/sjcx_ss.aspx',
            data,
            'table#DataList1 tr td a',
        )
        for anchor in anchors:
            path = _pdf_path_for_course(anchor.get('href'), wanted)
            if path is not None:
                hrefs.append(index + path + '.pdf')
        time.sleep(5)  # pause between years
    print(hrefs)


def dowload(url, name, timeout=30):
    """Download *url* and save it as ``<firstDir>/<name>.pdf``.

    The target directory is created up front when missing, so the very
    first file is written instead of being lost to a failed ``open``.
    A response with a non-200 status is reported and skipped rather than
    being stored under a ``.pdf`` name.

    Args:
        url: Address of the PDF to fetch.
        name: Base name of the output file; converted with ``str``.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when *timeout* expires.
    """
    # Same path construction as before: join with '/', then normalise backslashes.
    filename = (firstDir + '/' + str(name) + '.pdf').replace('\\', '/')
    r = requests.get(url, timeout=timeout)
    if r.status_code != 200:
        print('skip %s: HTTP %s' % (url, r.status_code))
        return
    try:
        os.makedirs(firstDir, exist_ok=True)
        with open(filename, 'wb') as f:
            f.write(r.content)
    except OSError as e:
        # Best effort: report the failure and let the caller go on to the next file.
        print(e)


# Gather every matching link first, then fetch the files one at a time.
get_urls(date)
num = 0
for href in hrefs:
    time.sleep(3)  # pause between downloads
    print(num)
    # Name each file after the year found in its own URL instead of its
    # position in hrefs: a year that produced no link would otherwise shift
    # every later name, and extra links would index past the end of date.
    # Assumes the four characters just before the course code (offsets 7-10
    # of the file name; get_urls() reads the code from offset 11) are the
    # year - TODO confirm against real links.
    stem = href.rsplit('/', 1)[-1]
    year = stem[7:11]
    if year.isdigit() and int(year) in date:
        label = int(year)
    elif num < len(date):
        label = date[num]  # previous positional pairing, used as a fallback
    else:
        label = stem.rsplit('.', 1)[0]  # more links than years: keep the server's name
    dowload(href, label)
    num += 1


评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

vigigo

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值