结合UI自动化实现的小说抓取脚本：抓取小说，并把每个章节存放进对应的文本文档中。
# coding:UTF-8
import os
from selenium import webdriver
import time
import datetime
from selenium.webdriver.common.keys import Keys
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
import re
# Matches at least one CJK unified ideograph; used to tell real chapter links
# apart from other anchors that appear under <dd>.
CHINESE_CHAR = re.compile('[\u4e00-\u9fff]')

# Characters that cannot appear in a Windows file name (this also covers "/"
# on POSIX), plus control whitespace that would produce confusing names.
INVALID_FILE_NAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

# Root folder under which one sub-folder per book is created.
SAVE_ROOT = r"C:/Users/L/Desktop/谢总/"


def sanitize_file_name(title: str) -> str:
    """Return *title* with characters that are illegal in file names replaced by "_"."""
    return INVALID_FILE_NAME_CHARS.sub("_", title)


def wait_for_chapter_list(driver, poll_seconds=2):
    """Block until the book page shows its chapter list header (``//dl/dt``).

    Only ``NoSuchElementException`` is treated as "still loading"; any other
    error (including Ctrl+C) propagates instead of being swallowed.
    """
    while True:
        try:
            driver.find_element(By.XPATH, '//dl/dt')
            return
        except NoSuchElementException:
            print("正在加载...")
            time.sleep(poll_seconds)


def create_chapter_files(driver, save_path):
    """Create one text file per chapter link and return ``{index: file path}``.

    Every ``//dd/a`` link whose text contains a Chinese character is counted
    as a chapter, numbered from 1 in page order. Each file is created (or
    truncated) and initialised with the chapter title.
    """
    chapter_files = {}
    print("正在写入目录")
    for link in driver.find_elements(By.XPATH, '//dd/a'):
        # Read the text once: every ``.text`` access is a WebDriver round trip.
        title = link.text
        if not CHINESE_CHAR.search(title):
            continue
        index = len(chapter_files) + 1
        file_name = os.path.join(
            save_path, f"{index}_{sanitize_file_name(title)}.txt"
        )
        # Explicit UTF-8 so the result does not depend on the OS code page.
        with open(file_name, "w", encoding="utf-8") as file:
            file.write(title)
        chapter_files[index] = file_name
    print("目录建立完成")
    return chapter_files


def download_chapters(driver, base_url, chapter_files):
    """Visit ``<base_url><n>.html`` for n = 1, 2, ... and append each page's text.

    The loop stops at the first page that has no reading-area element.
    NOTE(review): this relies on *base_url* ending with "/" and on page n
    corresponding to the n-th chapter link — confirm against the site.
    """
    chapter_no = 0
    while True:
        chapter_no += 1
        driver.get(base_url + str(chapter_no) + ".html")
        print("正在写入第"+str(chapter_no)+"章")
        time.sleep(2)
        try:
            content = driver.find_element(
                By.XPATH, '//*[@class="Readarea ReadAjax_content"]'
            ).text
        except NoSuchElementException:
            print("No more content found, stopping...")
            break
        file_path = chapter_files.get(chapter_no)
        if file_path is None:
            # No catalogue entry was created for this page number; report it
            # instead of dropping the text silently.
            print("未找到第"+str(chapter_no)+"章对应的文件,已跳过")
            continue
        with open(file_path, 'a', encoding="utf-8") as f:
            f.write(content + '\n')


def main():
    """Search the site for a book and save every chapter into its own text file."""
    driver = webdriver.Chrome()
    try:
        driver.get("https://www.zsdade.com/")
        time.sleep(2)
        # Alternatively hard-code the title here, e.g. book_name = "万相之王".
        book_name = input("请输入要搜索的书名,然后按下回车键: ")

        # Type the book title into the search box.
        search_box = driver.find_element(
            By.XPATH, '//*[@placeholder="快速搜索、找书、找作者"]'
        )
        time.sleep(3)
        search_box.send_keys(book_name)
        time.sleep(3)

        # Submit the search.
        driver.find_element(By.XPATH, '//*[@class="btn"]').click()
        time.sleep(5)

        # Open the first search result.
        driver.find_element(By.XPATH, '(//*[@class="bookname"])[1]/a').click()
        time.sleep(5)

        wait_for_chapter_list(driver)
        driver.maximize_window()
        time.sleep(2)

        # The book page URL is the prefix of every chapter URL.
        current_url = driver.current_url

        # Folder that receives this book's chapter files.
        save_path = SAVE_ROOT + book_name + "/"
        os.makedirs(save_path, exist_ok=True)

        chapter_files = create_chapter_files(driver, save_path)
        download_chapters(driver, current_url, chapter_files)

        print("已全部完成文档建立")
        time.sleep(2)
    finally:
        # Always shut the browser and driver processes down, even on errors.
        driver.quit()


if __name__ == "__main__":
    main()