最近突然想爬取学校的课程表,经过几番努力,终于做出了一个小 demo。话不多说,马上为大家讲解,先放上代码:
import re
import requests
from fake_useragent import UserAgent
from pyquery import PyQuery as pq
from school_api.check_code import CHECK_CODE
class GDSchool(object):
    """Log in to the ZhengFang academic site and print the timetable page.

    The target host is hard-coded.  ``main()`` drives the whole flow: open a
    session, read the hidden login-form fields, fetch and decode the captcha,
    post the login form, parse the navigation menu, then request the
    personal-timetable page and print its HTML.
    """

    _BASE_URL = 'http://61.142.209.20:9090'

    def __init__(self):
        self.ua = UserAgent()  # supplies random browser User-Agent strings
        self.headers = self._base_headers()
        # Credentials are hard-coded for testing; replace them (or read them
        # with input()) before running.
        self.number = '*******'
        self.password = '**********'

    def _base_headers(self):
        """Return the headers shared by the login and timetable requests."""
        return {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
            # Fixed: the value used to be the literal text 'Accept-Language'.
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Host': '61.142.209.20:9090',
            'Accept-Encoding': 'gzip, deflate',
            # Fixed: the header is spelled 'User-Agent'.  Under the old key
            # 'UserAgent' the random value was sent as an unrelated header and
            # the library's default User-Agent was still used.
            'User-Agent': self.ua.random,
        }

    def responseMenu(self, data):
        """Parse the navigation menu out of the post-login response.

        Args:
            data: response object whose ``.text`` holds the page HTML.

        Returns:
            dict mapping each menu item's ``.top_link`` text to a one-element
            list holding a ``{link text: href}`` dict of the links nested
            under that item.
        """
        html = pq(data.text)
        mainItems = {}
        for subItems in html('#headDiv > ul li').items():
            sub_nextItems = {
                link.text(): link.attr('href')
                for link in subItems('ul li a').items()
            }
            # Fixed: the list used to be created inside the inner loop, so an
            # item without links reused the previous item's list (or raised
            # UnboundLocalError when it came first).  Such an item now maps
            # to [{}].
            mainItems[subItems('.top_link').text()] = [sub_nextItems]
        return mainItems

    def resonseImage(self):
        """Download the captcha image and return the decoded text.

        Requires ``self.rssions`` (created by ``main()``).

        Returns:
            Whatever ``CHECK_CODE.verify`` returns for the image bytes, or
            ``None`` if any exception occurred (the exception is printed).
        """
        try:
            response = self.rssions.get(self._BASE_URL + '/CheckCode.aspx', stream=True)
            # To debug recognition, dump response.content to '<code>.gif'.
            return CHECK_CODE.verify(response.content)
        except Exception as e:
            print(e)
            return None

    def responseData(self):
        """Read the hidden login-form fields from the landing page.

        Returns:
            ``{'VIEWSTATE': ..., 'EVENTVALIDATION': ...}`` taken from the
            ``value`` attributes of ``#form1 #__VIEWSTATE`` and
            ``#form1 #__EVENTVALIDATION``, or ``None`` if any exception
            occurred (the exception is printed).
        """
        # Use the login session when it exists so these fields are fetched
        # with the same cookies that later submit them; fall back to a plain
        # request when main() has not created the session yet.
        client = getattr(self, 'rssions', None) or requests
        try:
            response = client.get(self._BASE_URL)
            html = pq(response.text)
            return {
                'VIEWSTATE': html('#form1 #__VIEWSTATE').attr('value'),
                'EVENTVALIDATION': html('#form1 #__EVENTVALIDATION').attr('value'),
            }
        except Exception as e:
            print(e)
            return None

    def resonsePara(self):
        """Request the personal-timetable page shown right after login.

        Requires ``self.menu`` and ``self.rssions`` (both set by ``main()``).

        Returns:
            The response of the GET request to ``xskbcx.aspx``.

        Raises:
            KeyError: the menu has no timetable entry.
            IndexError: the timetable link lacks an expected query field.
        """
        url = self._BASE_URL + '/' + self.menu['信息查询'][0]['个人课表查询']
        xh = re.findall('xh=(.*?)&xm', url, re.S)[0]
        gnmkdm = re.findall('dm=(.*?)$', url, re.S)[0]
        xm = re.findall(r'xm=(.*?)&gn', url, re.S)[0]
        data = {
            'xh': xh,
            'xm': xm,
            'gnmkdm': gnmkdm,
        }
        headers = self._base_headers()
        # Per the original author, the timetable page cannot be opened
        # without this referer.
        headers['referer'] = 'http://61.142.209.20:9090/xs_main.aspx?xh={xh}'.format(xh=xh)
        return self.rssions.get(url=self._BASE_URL + '/xskbcx.aspx?', params=data, headers=headers)

    def main(self):
        """Run the whole flow: log in, parse the menu, print the timetable.

        Raises:
            TypeError: ``responseData()`` failed and returned ``None``.
            KeyError: the menu has no timetable entry (e.g. login failed).
        """
        self.rssions = requests.session()  # one session keeps the cookies
        self.rssions.get(url=self._BASE_URL)
        url_login = self._BASE_URL + '/default2.aspx'
        para = self.responseData()   # hidden form fields
        code = self.resonseImage()   # decoded captcha
        data = {
            '__VIEWSTATE': para['VIEWSTATE'],
            '__EVENTVALIDATION': para['EVENTVALIDATION'],
            'TextBox1': self.number,
            'TextBox2': self.password,
            'TextBox3': code,
            # Fixed: the value used to be the already percent-encoded string
            # '%D1%A7%C9%FA', which requests encodes again ('%25D1...').
            # Passing the GB2312 bytes makes requests emit %D1%A7%C9%FA once.
            'RadioButtonList1': '学生'.encode('gb2312'),
            'Button1': '',
        }
        log_response = self.rssions.post(url=url_login, headers=self.headers, data=data)
        self.menu = self.responseMenu(log_response)  # menu links
        self.name = re.findall(r'xm=(.*?)&gn', self.menu['信息查询'][0]['个人课表查询'], re.S)[0]
        self.nameCode = self.name.encode('unicode_escape').decode().replace('\\u', '%u')
        fisrClss = self.resonsePara()
        self.parseFClss(fisrClss)

    def parseFClss(self, data):
        """Print the HTML of the timetable response (no parsing is done)."""
        print(data.text)
if __name__ == '__main__':
    # Script entry point: build the scraper and run the full login flow.
    scraper = GDSchool()
    scraper.main()
首先来分析一下正方教务系统,这里分析的是广东职业技术学院的正方教务系统。要获取课程信息,第一步是模拟登录。正文:
创建一个 session 保持会话,这样后面就不需要手动管理 cookies 了:
# Open one session and hit the landing page once so that every later
# request reuses the same cookies.
base_url = 'http://61.142.209.20:9090'
self.rssions = requests.session()
self.rssions.get(url=base_url)
故意输入错误的密码,分析 ASP 页面提交的请求,可知登录所需的 data 参数:
'__VIEWSTATE':para['VIEWSTATE'],
'__EVENTVALIDATION':para['EVENTVALIDATION'],
'TextBox1':self.number,
'TextBox2':self.password,
'TextBox3':code,
'RadioButtonList1':'%D1%A7%C9%FA',
'Button1':''
其中 '__VIEWSTATE' 和 '__EVENTVALIDATION' 这两个参数可以从登录页面的 HTML 中解析出来:
def responseData(self):
    """Read the hidden login-form fields from the landing page.

    Returns:
        ``{'VIEWSTATE': ..., 'EVENTVALIDATION': ...}`` taken from the
        ``value`` attributes of ``#form1 #__VIEWSTATE`` and
        ``#form1 #__EVENTVALIDATION``, or ``None`` if any exception occurred
        (the exception is printed).
    """
    # Use the login session when it exists so these fields are fetched with
    # the same cookies that later submit them; fall back to a plain request
    # when the session has not been created yet.
    client = getattr(self, 'rssions', None) or requests
    try:
        response = client.get('http://61.142.209.20:9090')
        html = pq(response.text)
        return {
            'VIEWSTATE': html('#form1 #__VIEWSTATE').attr('value'),
            'EVENTVALIDATION': html('#form1 #__EVENTVALIDATION').attr('value'),
        }
    except Exception as e:
        print(e)
        return None
TextBox1、TextBox2 分别是学号与密码,直接填入即可;RadioButtonList1 用来区分教师端还是学生端,这里的值是“学生”两个字的 GBK 编码,按登录请求中的值填写即可;Button1 为空;TextBox3 为验证码,通过第三方库直接识别:
def resonseImage(self):
    """Download the captcha image and return the decoded text.

    Returns whatever ``CHECK_CODE.verify`` yields for the image bytes, or
    ``None`` when any exception occurs (the exception is printed).
    """
    captcha_url = 'http://61.142.209.20:9090/CheckCode.aspx'
    try:
        image = self.rssions.get(captcha_url, stream=True)
        # To debug recognition, dump image.content to '<code>.gif'.
        return CHECK_CODE.verify(image.content)
    except Exception as err:
        print(err)
        return None
参数都获取完成后,通过 session 发起 POST 请求就能跳转到首页了:
后面我遍历了整个菜单的链接,其实大可不必,可以直接获取课表的链接。
大家只要把我获取菜单链接的函数重写成直接获取课表链接的函数即可,
也就是重写下面这个函数:
def responseMenu(self, data):
    """Parse the navigation menu out of the post-login response.

    Args:
        data: response object whose ``.text`` holds the page HTML.

    Returns:
        dict mapping each menu item's ``.top_link`` text to a one-element
        list holding a ``{link text: href}`` dict of the links nested under
        that item.
    """
    html = pq(data.text)
    mainItems = {}
    for subItems in html('#headDiv > ul li').items():
        sub_nextItems = {
            link.text(): link.attr('href')
            for link in subItems('ul li a').items()
        }
        # Fixed: the list used to be created inside the inner loop, so an
        # item without links reused the previous item's list (or raised
        # UnboundLocalError when it came first).  Such an item now maps to
        # [{}].
        mainItems[subItems('.top_link').text()] = [sub_nextItems]
    return mainItems
拿到课表链接之后,就可以通过 session 访问它,再把返回的 response 解析出来即可。这里没有做解析,直接把 HTML 打印出来了:
def resonsePara(self):
    """Request the personal-timetable page shown right after login.

    Requires ``self.menu`` and ``self.rssions`` to be set already.

    Returns:
        The response of the GET request to ``xskbcx.aspx``.

    Raises:
        KeyError: the menu has no timetable entry.
        IndexError: the timetable link lacks an expected query field.
    """
    url = 'http://61.142.209.20:9090/' + self.menu['信息查询'][0]['个人课表查询']
    referer = 'http://61.142.209.20:9090/xs_main.aspx?xh={xh}'
    # Pull the query fields out of the menu link.
    xh = re.findall('xh=(.*?)&xm', url, re.S)[0]
    gnmkdm = re.findall('dm=(.*?)$', url, re.S)[0]
    xm = re.findall(r'xm=(.*?)&gn', url, re.S)[0]
    data = {
        'xh': xh,
        'xm': xm,
        'gnmkdm': gnmkdm,
    }
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        # Fixed: the value used to be the literal text 'Accept-Language'.
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Host': '61.142.209.20:9090',
        'Accept-Encoding': 'gzip, deflate',
        # Per the original author, the timetable page cannot be opened
        # without this referer.
        'referer': referer.format(xh=xh),
        # Fixed: the header is spelled 'User-Agent'; under the old key
        # 'UserAgent' the random value never replaced the default one.
        'User-Agent': self.ua.random,
    }
    reClss = self.rssions.get(url='http://61.142.209.20:9090/xskbcx.aspx?', params=data, headers=headers)
    return reClss
爬取过程中要注意链接中的编码,有些链接要转换编码之后才能使用。
如有错误之处,还望大家多多指教。