使用selenium爬取数据并存储到数据库_selenium爬取数据到本地文件-CSDN博客

本文链接：https://blog.csdn.net/2301_78319341/article/details/142149381

爬取小说

import pymongo
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Crawl every chapter of the novel with Selenium and store each chapter as
# one MongoDB document of the form {"title": <str>, "datas": [<paragraph>, ...]}.
#
# Chapter pages are numbered 1..2363; their URLs differ only in the number
# appended after "1404":
#   https://www.00ksw.com/html/1/1170/14041.html
#   https://www.00ksw.com/html/1/1170/14042363.html
LAST_CHAPTER = 2363
url = "https://www.00ksw.com/html/1/1170/14041.html"

# chromedriver.exe is expected in the same directory as this script.
driver = Chrome(service=Service(executable_path=r"chromedriver.exe"))
try:
    # Configure the 30-second page-load timeout before the first navigation
    # so that it also applies to the initial page load.
    driver.set_page_load_timeout(30)
    # Open the first chapter of the novel.
    driver.get(url)
    # Connect to MongoDB and select the target database and collection.
    client = pymongo.MongoClient()
    try:
        db = client.get_database("lingdian")
        collection = db.get_collection("蛊真人")
        for i in range(1, LAST_CHAPTER + 1):
            print(f"正在爬取第{i}页")
            # Chapter title.
            title = driver.find_element(By.XPATH, '//*[@id="nr_content"]/div[2]/h3').text
            # Paragraph elements of the chapter body, reduced to their text.
            datas = driver.find_elements(By.XPATH, '//*[@id="articlecontent"]/p')
            text = [data.text for data in datas]
            # Persist each chapter as soon as it has been extracted.
            collection.insert_one({"title": title, "datas": text})
            if i == LAST_CHAPTER:
                # Nothing left to crawl; do not navigate past the final chapter.
                break
            try:
                # Click the "next chapter" link.
                driver.find_element(By.XPATH, '// div[ @ id = "nr_content"] / div[7] / a[4]').click()
            except (NoSuchElementException, TimeoutException):
                # The link is missing or the navigation timed out: load the
                # next chapter directly by its URL instead.
                driver.get(f"https://www.00ksw.com/html/1/1170/1404{i+1}.html")
    finally:
        # Close the database connection even if the crawl fails part-way.
        client.close()
finally:
    # Always shut down the browser and the chromedriver process.
    driver.quit()
print("爬取结束")