智慧职教[动态爬取,模拟登录,文件写入 csv]
爬虫前提,是因为临近期末,女朋友需要考试题目复习。
技术栈
- python
- Xpath
- selenium
- csv
爬虫分析
问题一:在向目标网站发送请求时发现,进入智慧职教考试题库链接时会被重定向☞登录验证页面,不登录就无法访问考试题库页面。
👊 思路:起初的想法是,按 F12 查找表单 post 提交的账号和密码字段,通过 session 保持网页会话进行爬取。(没有找到表单,果断放弃了)
👊 思路:用cookie,进行免密登录,成功了。
问题二:登录成功后,爬取后发现网页数据丢失了。
👊 思路:分析发现,需要进行动态爬取。由于自己的JavaScript弱,更加没有精力去分析智慧职教JavaScript的加载与传值过程。索性决定学习selenium进行动态爬取。
import bs4 # 导入BeautifulSoup4
from lxml import etree # 解析方式
import time
import csv
from selenium.webdriver.chrome.service import Service # 新增
from selenium import webdriver
def baidu():
    """Scrape a 100-question single-choice exam record from icve.com.cn
    and write it to a CSV file on drive D:.

    Login is done by injecting session cookies (cookie-based login — the
    site's login form could not be located), then the JS-rendered page is
    parsed with lxml/XPath.
    """
    cookie = '隐私保护,我就不展示了'
    cookie_list = cookie.split(';')
    service = Service(executable_path='./chromedriver.exe')
    driver = webdriver.Chrome(service=service)
    try:
        # add_cookie() only accepts cookies for the currently loaded
        # domain, so the site must be visited once before injecting.
        driver.get("https://spoc-exam.icve.com.cn")
        for raw_cookie in cookie_list:
            # Split at the FIRST '=' only: cookie values (e.g. base64
            # session tokens) may themselves contain '=' characters.
            name, _, value = raw_cookie.partition('=')
            driver.add_cookie({'name': name.strip(), 'value': value.strip()})
        driver.get("https://spoc-exam.icve.com.cn/student/exam/examrecord_recordDetail.action?recordId"
                   "=4b1137d842374c8182b806ad1757fb10")
        time.sleep(3)  # give the dynamically loaded content time to render

        data_list = []
        # Grab the raw page source and parse with lxml — selenium's own
        # element API is clumsy for bulk extraction.
        xml_text = etree.HTML(driver.page_source)
        print(xml_text.xpath("//div[@id='pageContent1']"))
        for i in range(1, 101):  # the exam has exactly 100 questions
            xml_two = xml_text.xpath("//div[@id='pageContent{num}']".format(num=i))
            for row in xml_two:
                # Question text = numbering span + the question body.
                title_add = row.xpath('div/div[1]/span/text()')[0] + row.xpath('div/div[1]/text()')[0]
                title_choice_a = 'A' + row.xpath('div/div[2]/div[1]/div/text()')[0]
                title_choice_b = 'B' + row.xpath('div/div[2]/div[2]/div/text()')[0]
                title_choice_c = 'C' + row.xpath('div/div[2]/div[3]/div/text()')[0]
                title_choice_d = 'D' + row.xpath('div/div[2]/div[4]/div/text()')[0]
                # Hoisted: the original evaluated this XPath three times per row.
                option_count = len(row.xpath('div/div[2]/div'))
                print('大小', option_count)
                if option_count >= 6:  # question has options A~E
                    title_choice_e = 'E' + row.xpath('div/div[2]/div[5]/div/text()')[0]
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[6]/div[2]/div[2]/span/span/text()')[0]
                else:  # question has options A~D only
                    title_choice_e = None
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[5]/div[2]/div[2]/span/span/text()')[0]
                data_list.append([title_add, title_choice_a, title_choice_b,
                                  title_choice_c, title_choice_d, title_choice_e,
                                  title_choice_right])

        header = ['题目', '选项A', '选项B', '选项C', '选项D', '选项E', '正确答案']
        # Raw string: '\验' happens not to be an escape sequence, but r''
        # makes the Windows path intent explicit and future-proof.
        with open(r'D:\验光技术.csv', 'w', encoding='utf-8', newline='') as book:
            writer = csv.writer(book)
            writer.writerow(header)
            writer.writerows(data_list)
    finally:
        # quit() (unlike close()) also terminates the chromedriver
        # process; the finally block guarantees cleanup even when an
        # XPath lookup raises IndexError on an unexpected page layout.
        driver.quit()


baidu()
代码我优化了一下,新增了判断题的爬取。
🐫<由于自己没有采用面向对象的编程方式>有一些代码可能是重复的。但不建议去改动它,一改动就需要对代码进行重构,比较麻烦。有部分代码我是写死的。
from lxml import etree # 解析方式
import time # 时间模块给网页的动态加载一点时间反应
import csv # csv文件操作模块
from selenium.webdriver.chrome.service import Service # 模块新增的导入驱动路径的一个方法
from selenium import webdriver # 加载驱动
def baidu():
    """Scrape an exam record (single-choice and true/false questions)
    from icve.com.cn and write it to a CSV file on drive D:.

    Login is done by injecting session cookies; the JS-rendered page is
    parsed with lxml/XPath. Question layout is detected by the number of
    child <div> elements: >=6 → options A~E, ==5 → options A~D,
    ==3 → true/false.
    """
    cookie = '隐私保护,我就不展示了'
    cookie_list = cookie.split(';')
    service = Service(executable_path='./chromedriver.exe')
    driver = webdriver.Chrome(service=service)
    try:
        # add_cookie() only works for the currently loaded domain, so the
        # site must be visited once before injecting cookies.
        driver.get("https://spoc-exam.icve.com.cn")
        for raw_cookie in cookie_list:
            # Split at the FIRST '=' only: cookie values (e.g. base64
            # session tokens) may themselves contain '=' characters.
            name, _, value = raw_cookie.partition('=')
            driver.add_cookie({'name': name.strip(), 'value': value.strip()})
        recordId = '4b1137d842374c8182b806ad1757fb10'
        driver.get("https://spoc-exam.icve.com.cn/student/exam/examrecord_recordDetail.action?recordId"
                   "={recordId}".format(recordId=recordId))
        # Known record ids:
        # 4b1137d842374c8182b806ad1757fb10 100题(单选题)
        # 88454aa84ce8416993eea359f8a98a51 50题(单选题)
        # bd06d148582c4fa0a4877a6529071136 50题(含判断题)
        time.sleep(3)  # give the dynamically loaded content time to render

        data_list = []
        # Grab the raw page source and parse with lxml — selenium's own
        # element API is clumsy for bulk extraction.
        xml_text = etree.HTML(driver.page_source)
        print(xml_text.xpath("//div[@id='pageContent1']"))
        # Fixed upper bound of 100 questions: deliberate, given the page
        # structure and the limited lifetime of this one-off script.
        for i in range(1, 101):
            if i == 28 and recordId == 'bd06d148582c4fa0a4877a6529071136':
                continue  # this record's page structure has a gap at #28
            xml_two = xml_text.xpath("//div[@id='pageContent{num}']".format(num=i))
            for row in xml_two:
                # Question text = numbering span + the question body.
                title_add = row.xpath('div/div[1]/span/text()')[0] + row.xpath('div/div[1]/text()')[0]
                # Hoisted: the original evaluated this XPath several times per row.
                option_count = len(row.xpath('div/div[2]/div'))
                if option_count >= 6:  # single-choice, options A~E
                    title_choice_a = 'A' + row.xpath('div/div[2]/div[1]/div/text()')[0]
                    title_choice_b = 'B' + row.xpath('div/div[2]/div[2]/div/text()')[0]
                    title_choice_c = 'C' + row.xpath('div/div[2]/div[3]/div/text()')[0]
                    title_choice_d = 'D' + row.xpath('div/div[2]/div[4]/div/text()')[0]
                    title_choice_e = 'E' + row.xpath('div/div[2]/div[5]/div/text()')[0]
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[6]/div[2]/div[2]/span/span/text()')[0]
                elif option_count == 5:  # single-choice, options A~D
                    title_choice_a = 'A' + row.xpath('div/div[2]/div[1]/div/text()')[0]
                    title_choice_b = 'B' + row.xpath('div/div[2]/div[2]/div/text()')[0]
                    title_choice_c = 'C' + row.xpath('div/div[2]/div[3]/div/text()')[0]
                    title_choice_d = 'D' + row.xpath('div/div[2]/div[4]/div/text()')[0]
                    title_choice_e = None
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[5]/div[2]/div[2]/span/span/text()')[0]
                elif option_count == 3:  # true/false question
                    title_choice_a = '正确'
                    title_choice_b = '错误'
                    title_choice_c = None
                    title_choice_d = None
                    title_choice_e = None
                    title_choice_right = '答案' + row.xpath('div/div[2]/div[3]/div[2]/div[2]/span/span/text()')[0]
                else:
                    # Guard: the original left title_choice_* bound to the
                    # PREVIOUS question's values (or raised NameError on the
                    # first row) for unrecognized layouts, silently corrupting
                    # the CSV. Skip and report instead.
                    print('未识别的题目结构,跳过:', i, option_count)
                    continue
                data_list.append([title_add, title_choice_a, title_choice_b,
                                  title_choice_c, title_choice_d, title_choice_e,
                                  title_choice_right])

        header = ['题目', '选项A', '选项B', '选项C', '选项D', '选项E', '正确答案']
        # Raw string: '\验' happens not to be an escape sequence, but r''
        # makes the Windows path intent explicit and future-proof.
        with open(r'D:\验光技术.csv', 'w', encoding='utf-8', newline='') as book:
            writer = csv.writer(book)
            writer.writerow(header)
            writer.writerows(data_list)
    finally:
        # quit() (unlike close()) also terminates the chromedriver
        # process; the finally block guarantees cleanup even when an
        # XPath lookup raises IndexError on an unexpected page layout.
        driver.quit()


baidu()
致谢
❤️ ❤️ ❤️ 最后谢谢大家的喜欢点赞与收藏,希望我的文章可以给你有所启发与灵感。