智慧职教[动态爬取,模拟登录,文件写入 csv]
爬虫前提,是因为临近期末,女朋友需要考试题目复习。
技术栈
- python
- Xpath
- selenium
- csv
爬虫分析
问题一:在向目标网站发送请求时发现,进入智慧职教考试题库链接时会被重定向☞登录验证页面,不登录就无法访问考试题库页面。
👊 思路:起初的想法是,按 F12 查找表单 post 提交的账号和密码字段,通过 session 保持网页会话进行爬取。(没有找到表单,果断放弃了)
👊 思路:用cookie,进行免密登录,成功了。
问题二:登录成功后,爬取后发现网页数据丢失了。
👊 思路:分析发现,需要进行动态爬取。由于自己的JavaScript弱,更加没有精力去分析智慧职教JavaScript的加载与传值过程。索性决定学习selenium进行动态爬取。
import bs4 # 导入BeautifulSoup4
from lxml import etree # 解析方式
import time
import csv
from selenium.webdriver.chrome.service import Service # 新增
from selenium import webdriver
def baidu():
    """Scrape a 100-question single-choice exam record from icve.com.cn
    and write it to a CSV file on drive D:.

    Login is done by injecting session cookies (cookie-based login — the
    site's login form could not be located), then the JS-rendered page is
    parsed with lxml/XPath.
    """
    cookie = '隐私保护,我就不展示了'
    cookie_list = cookie.split(';')
    service = Service(executable_path='./chromedriver.exe')
    driver = webdriver.Chrome(service=service)
    try:
        # add_cookie() only accepts cookies for the currently loaded
        # domain, so the site must be visited once before injecting.
        driver.get("https://spoc-exam.icve.com.cn")
        for raw_cookie in cookie_list:
            # Split at the FIRST '=' only: cookie values (e.g. base64
            # session tokens) may themselves contain '=' characters.
            name, _, value = raw_cookie.partition('=')
            driver.add_cookie({'name': name.strip(), 'value': value.strip()})
        driver.get("https://spoc-exam.icve.com.cn/student/exam/examrecord_recordDetail.action?recordId"
                   "=4b1137d842374c8182b806ad1757fb10")
        time.sleep(3)  # give the dynamically loaded content time to render

        data_list = []
        # Grab the raw page source and parse with lxml — selenium's own
        # element API is clumsy for bulk extraction.
        xml_text = etree.HTML(driver.page_source)
        print(xml_text.xpath("//div[@id='pageContent1']"))
        for i in range(1, 101):  # the exam has exactly 100 questions
            xml_two = xml_text.xpath("//div[@id='pageContent{num}']".format(num=i))
            for row in xml_two:
                # Question text = numbering span + the question body.
                title_add = row.xpath('div/div[1]/span/text()')[0] + row.xpath('div/div[1]/text()')[0]
                title_choice_a = 'A' + row.xpath('div/div[2]/div[1]/div/text()')[0]
                title_choice_b = 'B' + row.xpath('div/div[2]/div[2]/div/text()')[0]
                title_choice_c = 'C' + row.xpath('div/div[2]/div[3]/div/text()')[0]
                title_choice_d = 'D' + row.xpath('div/div[2]/div[4]/div/text()')[0]
                # Hoisted: the original evaluated this XPath three times per row.
                option_count = len(row.xpath('div/div[2]/div'))
                print('大小', option_count)
                if option_count >= 6:  # question has options A~E
                    title_choice_e = 'E' + row.xpath('div/div[2]/div[5]/div/text()')[0]
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[6]/div[2]/div[2]/span/span/text()')[0]
                else:  # question has options A~D only
                    title_choice_e = None
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[5]/div[2]/div[2]/span/span/text()')[0]
                data_list.append([title_add, title_choice_a, title_choice_b,
                                  title_choice_c, title_choice_d, title_choice_e,
                                  title_choice_right])

        header = ['题目', '选项A', '选项B', '选项C', '选项D', '选项E', '正确答案']
        # Raw string: '\验' happens not to be an escape sequence, but r''
        # makes the Windows path intent explicit and future-proof.
        with open(r'D:\验光技术.csv', 'w', encoding='utf-8', newline='') as book:
            writer = csv.writer(book)
            writer.writerow(header)
            writer.writerows(data_list)
    finally:
        # quit() (unlike close()) also terminates the chromedriver
        # process; the finally block guarantees cleanup even when an
        # XPath lookup raises IndexError on an unexpected page layout.
        driver.quit()


baidu()
代码我优化了一下,新增了判断题的爬取。
🐫<由于自己没有采用面向对象的编程方式>有一些代码可能是重复的。但不建议去改动它,一改动就需要对代码进行重构,比较麻烦。有部分代码我是写死的。
from lxml import etree # 解析方式
import time # 时间模块给网页的动态加载一点时间反应
import csv # csv文件操作模块
from selenium.webdriver.chrome.service import Service # 模块新增的导入驱动路径的一个方法
from selenium import webdriver # 加载驱动
def baidu():
    """Scrape an exam record (single-choice and true/false questions)
    from icve.com.cn and write it to a CSV file on drive D:.

    Login is done by injecting session cookies; the JS-rendered page is
    parsed with lxml/XPath. Question layout is detected by the number of
    child <div> elements: >=6 → options A~E, ==5 → options A~D,
    ==3 → true/false.
    """
    cookie = '隐私保护,我就不展示了'
    cookie_list = cookie.split(';')
    service = Service(executable_path='./chromedriver.exe')
    driver = webdriver.Chrome(service=service)
    try:
        # add_cookie() only works for the currently loaded domain, so the
        # site must be visited once before injecting cookies.
        driver.get("https://spoc-exam.icve.com.cn")
        for raw_cookie in cookie_list:
            # Split at the FIRST '=' only: cookie values (e.g. base64
            # session tokens) may themselves contain '=' characters.
            name, _, value = raw_cookie.partition('=')
            driver.add_cookie({'name': name.strip(), 'value': value.strip()})
        recordId = '4b1137d842374c8182b806ad1757fb10'
        driver.get("https://spoc-exam.icve.com.cn/student/exam/examrecord_recordDetail.action?recordId"
                   "={recordId}".format(recordId=recordId))
        # Known record ids:
        # 4b1137d842374c8182b806ad1757fb10 100题(单选题)
        # 88454aa84ce8416993eea359f8a98a51 50题(单选题)
        # bd06d148582c4fa0a4877a6529071136 50题(含判断题)
        time.sleep(3)  # give the dynamically loaded content time to render

        data_list = []
        # Grab the raw page source and parse with lxml — selenium's own
        # element API is clumsy for bulk extraction.
        xml_text = etree.HTML(driver.page_source)
        print(xml_text.xpath("//div[@id='pageContent1']"))
        # Fixed upper bound of 100 questions: deliberate, given the page
        # structure and the limited lifetime of this one-off script.
        for i in range(1, 101):
            if i == 28 and recordId == 'bd06d148582c4fa0a4877a6529071136':
                continue  # this record's page structure has a gap at #28
            xml_two = xml_text.xpath("//div[@id='pageContent{num}']".format(num=i))
            for row in xml_two:
                # Question text = numbering span + the question body.
                title_add = row.xpath('div/div[1]/span/text()')[0] + row.xpath('div/div[1]/text()')[0]
                # Hoisted: the original evaluated this XPath several times per row.
                option_count = len(row.xpath('div/div[2]/div'))
                if option_count >= 6:  # single-choice, options A~E
                    title_choice_a = 'A' + row.xpath('div/div[2]/div[1]/div/text()')[0]
                    title_choice_b = 'B' + row.xpath('div/div[2]/div[2]/div/text()')[0]
                    title_choice_c = 'C' + row.xpath('div/div[2]/div[3]/div/text()')[0]
                    title_choice_d = 'D' + row.xpath('div/div[2]/div[4]/div/text()')[0]
                    title_choice_e = 'E' + row.xpath('div/div[2]/div[5]/div/text()')[0]
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[6]/div[2]/div[2]/span/span/text()')[0]
                elif option_count == 5:  # single-choice, options A~D
                    title_choice_a = 'A' + row.xpath('div/div[2]/div[1]/div/text()')[0]
                    title_choice_b = 'B' + row.xpath('div/div[2]/div[2]/div/text()')[0]
                    title_choice_c = 'C' + row.xpath('div/div[2]/div[3]/div/text()')[0]
                    title_choice_d = 'D' + row.xpath('div/div[2]/div[4]/div/text()')[0]
                    title_choice_e = None
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[5]/div[2]/div[2]/span/span/text()')[0]
                elif option_count == 3:  # true/false question
                    title_choice_a = '正确'
                    title_choice_b = '错误'
                    title_choice_c = None
                    title_choice_d = None
                    title_choice_e = None
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[3]/div[2]/div[2]/span/span/text()')[0]
                else:
                    # Guard: the original left title_choice_* bound to the
                    # PREVIOUS question's values (or raised NameError on the
                    # first row) for unrecognized layouts, silently corrupting
                    # the CSV. Skip and report instead.
                    print('未识别的题目结构,跳过:', i, option_count)
                    continue
                data_list.append([title_add, title_choice_a, title_choice_b,
                                  title_choice_c, title_choice_d, title_choice_e,
                                  title_choice_right])

        header = ['题目', '选项A', '选项B', '选项C', '选项D', '选项E', '正确答案']
        # Raw string: '\验' happens not to be an escape sequence, but r''
        # makes the Windows path intent explicit and future-proof.
        with open(r'D:\验光技术.csv', 'w', encoding='utf-8', newline='') as book:
            writer = csv.writer(book)
            writer.writerow(header)
            writer.writerows(data_list)
    finally:
        # quit() (unlike close()) also terminates the chromedriver
        # process; the finally block guarantees cleanup even when an
        # XPath lookup raises IndexError on an unexpected page layout.
        driver.quit()


baidu()
致谢
❤️ ❤️ ❤️ 最后谢谢大家的喜欢点赞与收藏,希望我的文章可以给你有所启发与灵感。