# -*- coding: utf-8 -*-
"""
Created on 2021/1/11 14:30
爬取UJN毕业设计题目信息
@author: 大牛牛
"""
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.wait import WebDriverWait
import time
import os
import re
import requests
import urllib.request
# Desktop-Chrome User-Agent header for requests/urllib calls.
# NOTE(review): not referenced anywhere in this file — presumably kept for a
# requests-based download step; confirm before removing.
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
}
def getujnQuest(issueurl, loginame, password):
    """Crawl UJN graduation-project topic pages and save each as a .txt file.

    Logs in through the university SSO portal, reads every teacher from the
    topic-search page's teacher ``<select>``, then for each teacher visits all
    of that teacher's topic detail pages and writes the page text to
    ``<issueurl>/<teacher>/<topic>.txt`` (names stripped to CJK/alnum chars).

    :param issueurl: output directory root for the result files
    :param loginame: SSO account name
    :param password: SSO password
    """
    # Keeps only CJK ideographs, digits and ASCII letters — used to build
    # filesystem-safe directory/file names. Compiled once, used in the loops.
    clean_re = re.compile(u"[^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a]")
    browser = webdriver.Chrome()
    try:
        # --- SSO login ---
        browser.get('http://sso.ujn.edu.cn/tpass/login?service=http://bysj.ujn.edu.cn/sso_isebysj.php')  # unified SSO login
        wait = WebDriverWait(browser, 10)  # wait for the login form to load
        wait.until(EC.presence_of_all_elements_located((By.ID, 'rsa')))
        time.sleep(3)
        browser.find_element(By.ID, "un").send_keys(loginame)  # account field
        browser.find_element(By.ID, "pd").send_keys(password)  # password field
        browser.find_element(By.XPATH, "//input[@id='index_login_btn']").click()  # submit the form
        time.sleep(2)
        print('认证完成!')

        # --- collect teacher name -> option value from the search page ---
        browser.get('http://bysj.ujn.edu.cn/ise_bysj/Quest/search.asp')  # topic summary search page
        wait = WebDriverWait(browser, 10)
        wait.until(EC.presence_of_all_elements_located((By.XPATH, '//select[@name="Specialty"]')))
        time.sleep(3)
        teacher_select = browser.find_element(By.XPATH, "//select[@name='id1']")
        # Pitfall: WebElement handles go stale as soon as the page navigates
        # ("element is not attached to the page document"), so extract the
        # plain text/value pairs NOW and loop over the extracted dict later.
        teachers = {}
        for option in teacher_select.find_elements(By.TAG_NAME, 'option'):
            value = option.get_attribute('value')
            # get_attribute may return None; real teacher values are > 5 chars
            if value is not None and len(value) > 5:
                teachers[option.text] = value  # teacher name -> option value

        # --- for each teacher, list topics and dump each topic page ---
        for teaname, optvalue in teachers.items():
            browser.get('http://bysj.ujn.edu.cn/ise_bysj/Quest/search.asp')
            wait = WebDriverWait(browser, 10)
            wait.until(EC.presence_of_all_elements_located((By.XPATH, '//select[@name="Specialty"]')))
            time.sleep(3)
            Select(browser.find_element(By.NAME, "id1")).select_by_value(optvalue)
            browser.find_element(By.XPATH, "//input[@name='action']").click()  # submit the search
            browser.implicitly_wait(3)
            # Same staleness pitfall: pull topic name -> detail URL out of the
            # <a> elements before navigating to any detail page.
            topics = {}
            for link in browser.find_elements(By.TAG_NAME, 'a'):
                href = link.get_attribute('href')
                if href is not None and href.startswith('http://bysj.ujn.edu.cn/ise_bysj/Quest/showit.asp'):
                    topics[link.text] = href  # topic title -> detail-page URL

            teapath = os.path.join(issueurl, clean_re.sub("", teaname))
            os.makedirs(teapath, exist_ok=True)  # race-free replacement for exists()+mkdir()
            for title, url in topics.items():
                browser.get(url)
                browser.implicitly_wait(3)
                contents = browser.find_element(By.TAG_NAME, 'body')
                fname = clean_re.sub("", title) + ".txt"
                # with-block guarantees the file is closed even if .text raises
                with open(os.path.join(teapath, fname), 'w+', encoding='utf-8') as f1:
                    f1.write(contents.text)
    finally:
        # quit() shuts down the chromedriver process; close() would only close
        # the window and leak the driver.
        browser.quit()
        print("\n毕业设计题目信息抓取完毕!")
if __name__ == '__main__':
    # Script configuration constants — could later be read from a config file.
    print("start------------------")
    starturl = 'http://bysj.ujn.edu.cn/ise_bysj/index.asp'  # graduation-project site home page (not used below)
    formurl = 'http://bysj.ujn.edu.cn/ise_bysj/Quest/search.asp'  # topic summary search page (not used below)
    loginame = 'xxxxxxxxxxx'  # SSO account (placeholder — fill in before running)
    password = 'xxxxxxxxxxx'  # SSO password (placeholder — fill in before running)
    issuepath = 'D:\\爬虫结果\\ujnsubject\\'  # output root directory for the topic .txt files
    # Crawl all graduation-project topic pages and save them under issuepath.
    # NOTE(review): the original comment here mentioned "Journal of Software
    # PDF downloads" — apparently copy-pasted from another script; this call
    # scrapes graduation-project topics, not PDFs.
    getujnQuest(issuepath, loginame, password)
    print("end------------------")