Python爬虫系列-爬取驾考题目(和上期一样有Cookie验证,使用Selenium方式爬取,Python生成word文档)

如上效果图,通过https://kaocheche.com/ 这个小网站爬取。

使用前请先安装库:

pip install requests beautifulsoup4 python-docx pillow selenium webdriver-manager

 使用方法,修改下面的三个变量:

project_path = "E:\\" #设置word文档的保存目录

#.......此处省略代码.........

if __name__ == "__main__":
    docname = "2024年科目四考试题库" #设置word文档的文件名
    url = "https://kaocheche.com/tiku/kemu1/" #设置需爬取的考题目录的网址

可以爬取这个网站(https://kaocheche.com/)的所有题目,包括货车、大小客车、摩托车、网约车、70岁老人三力测试。

具体代码如下:

import os
import time

import requests
from bs4 import BeautifulSoup as bs
from docx import Document
from docx.shared import Inches
from docx.shared import Pt
from docx.shared import RGBColor
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Directory where the generated Word document (and downloaded images) are saved
project_path = "E:\\"

def get_problem(driver, url):
    """Scrape a single exam-question page.

    Args:
        driver: selenium WebDriver positioned anywhere (navigates to url).
        url: absolute URL of the question page.

    Returns:
        A 5-tuple of strings (problem, picture, option, answer, parser),
        or None when the page fails to load within the wait timeout.
    """
    driver.get(url)
    try:
        element = WebDriverWait(driver, 3).until(
            EC.presence_of_element_located(
                (By.CLASS_NAME, "anchor.anchorWithHideOnScrollNavbar_WYt5")))
        problem = element.text.strip()
        print(problem)
        picture, option = get_Picture(driver)
        # The second h2 carrying this anchor class holds the answer
        # (the first one is the question heading itself).
        headings = driver.find_elements(
            By.CSS_SELECTOR, "h2.anchor.anchorWithHideOnScrollNavbar_WYt5")
        answer = headings[1].text.strip() if len(headings) > 1 else ""
        parser = get_parser(driver)
        print(problem + "\n" + picture + "\n" + option + "\n" + answer + "\n" + parser)
        return problem, picture, option, answer, parser
    except TimeoutException:
        # BUG FIX: WebDriverWait raises selenium's TimeoutException, not the
        # builtin TimeoutError, so the original handler could never fire and
        # a slow page crashed the whole run.
        print(url + " 网页超时!")
        return None

def get_Picture(driver):
    """Download the question's illustration image, if the page has one.

    Args:
        driver: selenium WebDriver already on the question page.

    Returns:
        (picture, option): local file path of the saved image ("" when the
        page has no image), and the option text from get_option().
    """
    picture = ""
    try:
        img_element = driver.find_element(By.CLASS_NAME, "img_ev3q")
        src = img_element.get_attribute("src")
    except NoSuchElementException:
        # BUG FIX: not every question has an illustration; the original
        # unguarded find_element crashed on image-less pages.
        src = None
    if src:
        res = requests.get(src, timeout=10)  # original request could hang forever
        filename = src.split('/')[-1]
        # Keep the original "\\"-joined layout so the hard-coded path
        # comparison in WriteDocx still matches.
        os.makedirs(project_path + "picture", exist_ok=True)
        picture_save_path = project_path + "picture\\" + filename
        with open(picture_save_path, "wb") as f:
            f.write(res.content)  # 'with' closes the file; explicit close() was redundant
        picture = picture_save_path
    return picture, get_option(driver)

def get_option(driver):
    """Return the answer-option lines (one per list item) joined with newlines."""
    items = driver.find_elements(
        By.CSS_SELECTOR, "div.theme-doc-markdown.markdown ul li")
    return "\n".join(item.text for item in items)

def get_parser(driver):
    """Return the answer-explanation text, or "" when the page has none."""
    try:
        node = driver.find_element(By.CSS_SELECTOR, "span.token.plain")
    except NoSuchElementException:
        return ""
    return node.text.strip()

def check_image(file_path):
    """Return True when the file at file_path is a readable, intact image."""
    try:
        with Image.open(file_path) as img:
            img.verify()  # raises if the image data is truncated or corrupt
    except (IOError, SyntaxError) as e:
        print(f"损坏的图片:{file_path},错误信息:{e}")
        return False
    return True

def WriteDocx(doc, driver, url, starttime):
    """Scrape one question page and append it, formatted, to the Word document.

    Args:
        doc: python-docx Document being built.
        driver: selenium WebDriver.
        url: absolute URL of the question page.
        starttime: time.time() taken just before the scrape, for timing.

    Returns:
        (problem, elapsed_seconds); elapsed is 0 when the entry was skipped.
    """
    result = get_problem(driver, url)
    if result is None:
        # BUG FIX: get_problem returns None on page timeout; the original
        # direct tuple-unpack raised TypeError and killed the whole run.
        return url, 0
    problem, picture, option, answer, parser = result

    if answer == "":
        print("----------------问题:", problem, "写入失败!----------------")
        return problem, 0
    # Question heading
    paragraph1 = doc.add_paragraph()
    run1 = paragraph1.add_run(problem + "\n")
    run1.font.size = Pt(20)
    run1.bold = True
    # Illustration, skipping the site's WeChat-QR watermark image
    if picture != "" and picture != project_path + "picture\\wechat-2f1dfcbb45f7f4c3a823a3f3bbb22b9d.png":
        if check_image(picture):
            doc.add_picture(picture, width=Inches(4.0))
        else:
            print(picture, "图片损坏,写入失败!")

    # Options (16pt), answer (18pt bold red), explanation (16pt)
    paragraph2 = doc.add_paragraph()
    run2 = paragraph2.add_run(option + "\n")
    run2.font.size = Pt(16)
    run3 = paragraph2.add_run(answer + "\n")
    run3.font.size = Pt(18)
    run3.font.color.rgb = RGBColor(255, 0, 0)
    run3.bold = True
    run4 = paragraph2.add_run(parser)
    run4.font.size = Pt(16)
    # One question per page
    doc.add_page_break()
    endtime = time.time()
    return problem, endtime - starttime

def getUrl(doc, driver, MainUrl):
    """Fetch the question-index page and write every linked question to doc.

    Args:
        doc: python-docx Document being built.
        driver: selenium WebDriver used for the individual question pages.
        MainUrl: URL of the question-bank index page.
    """
    res = requests.get(MainUrl, timeout=10)  # original request had no timeout
    soup = bs(res.content, "html.parser")
    cards = soup.select("a.card.padding--lg.cardContainer_fWXF")
    links = [card["href"] for card in cards]

    for link in links:
        starttime = time.time()
        problem, runtime = WriteDocx(doc, driver, "https://kaocheche.com" + link, starttime)
        if runtime:
            print("----------------问题:", problem, f"写入成功!(耗时{runtime} 秒)----------------")
        time.sleep(1)  # be polite to the small site (the author's own request)

def main(docname, url):
    """Build a Word document containing every question under the given index URL.

    Args:
        docname: file name (without extension) for the saved .docx.
        url: question-bank index URL to scrape.
    """
    starttime = time.time()
    # Create the Word document and its title
    doc = Document()
    head = doc.add_heading(docname)
    for run in head.runs:
        run.font.size = Pt(26)

    driver = webdriver.Chrome()
    try:
        getUrl(doc, driver, url)
    finally:
        # BUG FIX: the original never quit the driver, leaking a Chrome
        # process on every run (and on every mid-scrape exception).
        driver.quit()

    doc.save(project_path + docname + ".docx")

    endtime = time.time()
    print(f"程序运行成功!共耗时{(endtime - starttime) / 60} 分钟!")

if __name__ == "__main__":
    docname = "2024年科目四考试题库" # file name of the generated Word document
    url = "https://kaocheche.com/tiku/kemu1/" # URL of the question-bank index to scrape
    # Replace the index URL above to scrape any question bank on the site,
    # including trucks, buses, motorcycles, ride-hailing, and the 70+ seniors test.
    # All exam indexes are listed at https://kaocheche.com/
    main(docname,url)

这个爬虫很简单,主要是写入docx的方法可以学习下,其他Selenium的使用方法,前一篇文章已说明,就不废话了,如有不懂请留言给我,谢谢观看!

PS:望大家爬取时,加个延迟,减少小网站的负担,人家开个小网站也不容易。-_-!

如果只是需要题目和答案,我有现成的,过两天上传CSDN,会更新在这个帖子里。

  • 8
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

虫鸣@蝶舞

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值