我的第一个爬虫给了教务系统的个人课表是有原因的。刚升学那会,老师在介绍我们的专业时,说到了我们以后会学习爬虫技术,使用爬虫采集数据以进行数据可视化等一系列操作,当时让我印象深刻的是,爬虫可以做很多有趣的事啦,比如爬取我们个人课表数据之后用程序实现自动化操作,每天将当天的课表发送到自己的手机或其他电子终端,就可以不用上课前忙着登录各种软件看课表了。不禁感慨:哇,很牛逼的样子啊!
好啦,开始爬取我们的个人课表!
- 教务系统的登录界面。
- 简单分析一下:
- 只要学号密码输入正确了就可以
- 没有验证码(爬取过程也就简单了些许)
- 简单分析课表页面:
页面部分源码:
- 代码实现过程:
首先是在登录界面实现自动化:检查学号、密码输入框并正确填入信息后点击登录。这里主要用到的是 selenium 模块,负责抓取动态渲染的页面:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
browser = webdriver.Chrome()  # shared Chrome session driven by userLogin()
wait = WebDriverWait(browser, 10)  # explicit waits with a 10 s timeout
account = 'user_account'  # placeholder: student number for the portal login
password = 'user_password'  # placeholder: portal password
def userLogin():
    """Load the portal login page, type the stored credentials into the
    account/password boxes, and click the login button.

    Prints "timeout" when any element fails to appear within the wait
    window.
    """
    try:
        browser.get('http://教务系统网址.edu.cn/')
        acct_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[type=text]')))
        pwd_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[type=password]')))
        acct_box.send_keys(account)
        pwd_box.send_keys(password)
        # Chain the wait and the click: the button is never needed again.
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[type=button]'))).click()
    except TimeoutException:
        print("timeout")
接着,当我们成功登录后,就可以根据需求抓取页面数据。这里的课表是在 iFrame 页面,使用 webdriver 里的 switch_to.frame 进行切换并抓取课表所在节点(table):
def getSourceCode(class_select_value='94FE8AC636A218B0E05377320A0AEB65'):
    """Switch into the timetable iframe, choose the target term in the
    <select> drop-down, and return the rendered timetable node.

    :param class_select_value: value attribute of the <option> to select;
        defaults to the term the original script scraped.
    :return: PyQuery selection of the page's <table> elements.
    """
    time.sleep(3)  # crude wait for the iframe content to finish loading
    browser.switch_to.frame(0)  # the timetable lives in the first iframe
    # find_element_by_tag_name() was removed in Selenium 4; use the
    # By-based locator instead.
    term_select = browser.find_element(By.TAG_NAME, "select")
    Select(term_select).select_by_value(class_select_value)
    time.sleep(3)  # let the timetable re-render after the selection
    return pq(browser.page_source).find('table')
到这里就可以提取课表的相关信息了,主要是 th 和 td 节点里的信息:
def parsePage(table):
    """Harvest the timetable node into the module-level lists.

    Header (<th>) texts become the CSV column names; every data cell
    (<td>) is reduced by selectLine() before being stored.
    """
    filednames.extend(header.text() for header in table.find('th').items())
    knames.extend(selectLine(cell) for cell in table.find('td').items())
利用正则表达式匹配出自己需要的内容,时间点有课的就把它匹配出来,没有课就返回原内容
def selectLine(td):
    """Reduce one timetable <td> cell to display text.

    A cell holding a course has a <p title="..."> whose tooltip embeds the
    course name and room; the regex pulls those out and joins them with a
    newline. Any other cell — no <p>, no title attribute, or a tooltip the
    pattern does not match — falls back to the cell's plain text.

    :param td: PyQuery node for a single <td>.
    :return: "name<newline>room" for a course cell, otherwise td.text().
    """
    pattern = re.compile('课程学分.*?课程名称:(.*?)<br/>.*?<br/>上课地点:(.*)', re.S)
    # attr() returns None when the cell carries no course tooltip; the
    # original bare except silently swallowed the resulting TypeError.
    title = td.find('p').attr('title')
    if title:
        for name, room in re.findall(pattern, title):
            # A matching tooltip yields exactly one (name, room) pair.
            return name + '\n' + room
    # No course here (or an unrecognized tooltip): keep the raw cell text.
    # The original returned None in the unrecognized-tooltip case.
    return td.text()
通过迭代获取数据的那个列表里的内容然后保存到csv文件里
def sava_to_csv(fieldnames=None, values=None, path='data.csv'):
    """Write the scraped timetable out as a CSV file.

    (The misspelled name is kept because other code calls it.)

    :param fieldnames: CSV column headers; defaults to the module-level
        ``filednames`` list filled by parsePage().
    :param values: flat, row-major list of cell texts; defaults to the
        module-level ``knames`` list. Trailing cells that do not fill a
        complete row are dropped, matching the original 7x8 behaviour.
    :param path: output file path; defaults to 'data.csv'.
    """
    if fieldnames is None:
        fieldnames = filednames
    if values is None:
        values = knames
    if not fieldnames:
        return  # nothing scraped; avoid writing a broken CSV
    cols = len(fieldnames)
    # Re-group the flat cell list into one dict per timetable row.
    rows = [dict(zip(fieldnames, values[i:i + cols]))
            for i in range(0, (len(values) // cols) * cols, cols)]
    print(rows)
    # Explicit UTF-8: the cells are Chinese and the platform default
    # codec (e.g. GBK on Chinese Windows) may fail to encode them.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            print(row)
            writer.writerow(row)
附上最后的源码:
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import Select
import time
import re
import csv
from pyquery import PyQuery as pq
browser = webdriver.Chrome()  # shared Chrome session driven by every function below
wait = WebDriverWait(browser, 10)  # explicit waits with a 10 s timeout
account = 'user_account'  # placeholder: student number for the portal login
password = 'user_password'  # placeholder: portal password
filednames = []  # CSV column headers collected from the <th> cells
knames = []  # flat, row-major list of timetable cell texts
def userLogin():
    """Open the portal login page, fill in the stored credentials, submit
    the form, then hand the rendered timetable to the parsing pipeline.

    Prints "timeout" when any element fails to appear within the wait
    window.
    """
    try:
        browser.get('http://jwxt.gdaib.edu.cn/')
        acct_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[type=text]')))
        pwd_box = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '[type=password]')))
        acct_box.send_keys(account)
        pwd_box.send_keys(password)
        # Chain the wait and the click: the button is never needed again.
        wait.until(EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '[type=button]'))).click()
        parsePage(getSourceCode())
    except TimeoutException:
        print("timeout")
def getSourceCode(class_select_value='94FE8AC636A218B0E05377320A0AEB65'):
    """Switch into the timetable iframe, choose the target term in the
    <select> drop-down, and return the rendered timetable node.

    :param class_select_value: value attribute of the <option> to select;
        defaults to the term the original script scraped.
    :return: PyQuery selection of the page's <table> elements.
    """
    time.sleep(3)  # crude wait for the iframe content to finish loading
    browser.switch_to.frame(0)  # the timetable lives in the first iframe
    # find_element_by_tag_name() was removed in Selenium 4; use the
    # By-based locator instead.
    term_select = browser.find_element(By.TAG_NAME, "select")
    Select(term_select).select_by_value(class_select_value)
    time.sleep(3)  # let the timetable re-render after the selection
    return pq(browser.page_source).find('table')
def parsePage(table):
    """Harvest the timetable node into the module-level lists, then save.

    Header (<th>) texts become the CSV column names; every data cell
    (<td>) is reduced by selectLine() before being stored. Ends by
    writing everything out via sava_to_csv().
    """
    filednames.extend(header.text() for header in table.find('th').items())
    knames.extend(selectLine(cell) for cell in table.find('td').items())
    sava_to_csv()
def selectLine(td):
    """Reduce one timetable <td> cell to display text.

    A cell holding a course has a <p title="..."> whose tooltip embeds the
    course name and room; the regex pulls those out and joins them with a
    newline. Any other cell — no <p>, no title attribute, or a tooltip the
    pattern does not match — falls back to the cell's plain text.

    :param td: PyQuery node for a single <td>.
    :return: "name<newline>room" for a course cell, otherwise td.text().
    """
    pattern = re.compile('课程学分.*?课程名称:(.*?)<br/>.*?<br/>上课地点:(.*)', re.S)
    # attr() returns None when the cell carries no course tooltip; the
    # original bare except silently swallowed the resulting TypeError.
    title = td.find('p').attr('title')
    if title:
        for name, room in re.findall(pattern, title):
            # A matching tooltip yields exactly one (name, room) pair.
            return name + '\n' + room
    # No course here (or an unrecognized tooltip): keep the raw cell text.
    # The original returned None in the unrecognized-tooltip case.
    return td.text()
def sava_to_csv(fieldnames=None, values=None, path='data.csv'):
    """Write the scraped timetable out as a CSV file.

    (The misspelled name is kept because parsePage() calls it.)

    :param fieldnames: CSV column headers; defaults to the module-level
        ``filednames`` list filled by parsePage().
    :param values: flat, row-major list of cell texts; defaults to the
        module-level ``knames`` list. Trailing cells that do not fill a
        complete row are dropped, matching the original 7x8 behaviour.
    :param path: output file path; defaults to 'data.csv'.
    """
    if fieldnames is None:
        fieldnames = filednames
    if values is None:
        values = knames
    if not fieldnames:
        return  # nothing scraped; avoid writing a broken CSV
    cols = len(fieldnames)
    # Re-group the flat cell list into one dict per timetable row.
    rows = [dict(zip(fieldnames, values[i:i + cols]))
            for i in range(0, (len(values) // cols) * cols, cols)]
    print(rows)
    # Explicit UTF-8: the cells are Chinese and the platform default
    # codec (e.g. GBK on Chinese Windows) may fail to encode them.
    with open(path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            print(row)
            writer.writerow(row)
if __name__ == '__main__':
    # Entry point: userLogin() drives the whole scrape-parse-save pipeline.
    userLogin()
打开保存数据的data.csv文件,效果内容如下:
虽说这并不是最好最优的方案,但希望能给大家带来参考价值!
欢迎关注程序员杂谈公众号!