# First spider: a selenium exercise that scrapes a novel and saves it chapter by chapter.
#!/usr/bin/env python3
"""
第一只爬虫,测试selenium库的使用,爬取一本书,按章节进行存盘
目标网页:https://www.xs1002.com/biquge43/102452/1274708.html
书名:首席御医(特喜欢此书,就用它开刀了!!!)
技术路线:Python,selenium
浏览器:Microsoft Edge
"""
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
def getHTMLtext(driver):
    """Return the visible text of the current chapter.

    The chapter body lives in the element with id ``booktext`` on the
    target site's chapter pages.

    :param driver: a live selenium WebDriver positioned on a chapter page
    :return: the chapter text as a string
    """
    # find_element_by_id() was removed in Selenium 4; use find_element(By.ID, ...)
    comment = driver.find_element(By.ID, 'booktext')
    return comment.text
def saveText(chapterName, text):
    """Save one chapter's text under the book directory as ``<chapterName>.txt``.

    :param chapterName: file name stem (book name + index + chapter title)
    :param text: chapter body to write
    """
    savePath = r"首席御医/{0}.txt".format(chapterName)
    # Create the output directory on first use instead of crashing.
    os.makedirs(os.path.dirname(savePath), exist_ok=True)
    # Explicit UTF-8: the text is Chinese and the platform default encoding
    # (e.g. cp936/latin-1) may not be able to represent it.
    with open(savePath, "w", encoding="utf-8") as file:
        file.write(text)
def nextPage(driver):
    """Click the '下一章' (next chapter) link to advance the browser.

    On failure the current URL is appended to the module-level
    ``errorChapter`` list (defined in the ``__main__`` block) so the
    missed chapters can be retried later.

    :param driver: a live selenium WebDriver positioned on a chapter page
    """
    time.sleep(2)  # throttle: be polite to the server between page loads
    try:
        # find_element_by_link_text() was removed in Selenium 4.
        nextLink = driver.find_element(By.LINK_TEXT, '下一章')
        nextLink.click()
    except Exception:  # narrowed from a bare except: keep KeyboardInterrupt working
        errorChapter.append(driver.current_url)  # record chapters that failed to open
def getChapterName(driver):
    """Return the chapter title — the text of the page's first ``<dt>`` element.

    :param driver: a live selenium WebDriver positioned on a chapter page
    :return: chapter title string
    """
    # find_element_by_tag_name() was removed in Selenium 4.
    return driver.find_element(By.TAG_NAME, 'dt').text
if __name__ == "__main__":
"""
你可以从任意一章开始爬取,只需要改变url的值
"""
url = r"https://www.xs1002.com/biquge43/102452/1274708.html"
chapterNum = 10 # 爬取的章节数量
errorChapter = [] # 记录无法打开的章节
bookName = "首席御医"
driver = webdriver.Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")
driver.maximize_window() # 最大化窗口
try:
driver.get(url)
except:
errorChapter.append(driver.current_url)
for i in range(10):
chapterText = getHTMLtext(driver)
chapterName = "{0}-{1:0=4} ({2})".format(bookName, i+1, getChapterName(driver))
saveText(chapterName, chapterText)
nextPage(driver)
driver.close()