# First spider: a selenium exercise that scrapes a novel and saves it chapter by chapter.
#!/usr/bin/env python3
"""
第一只爬虫,测试selenium库的使用,爬取一本书,按章节进行存盘
目标网页:https://www.xs1002.com/biquge43/102452/1274708.html
书名:首席御医(特喜欢此书,就用它开刀了!!!)
技术路线:Python,selenium
浏览器:Microsoft Edge
"""
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
def getHTMLtext(driver):
    """Return the visible text of the current chapter.

    The chapter body lives in the element with id ``booktext`` on the
    target site's chapter pages.

    :param driver: a live selenium WebDriver positioned on a chapter page
    :return: the chapter text as a string
    """
    # find_element_by_id() was removed in Selenium 4; use find_element(By.ID, ...)
    comment = driver.find_element(By.ID, 'booktext')
    return comment.text
def saveText(chapterName, text):
    """Save one chapter's text under the book directory as ``<chapterName>.txt``.

    :param chapterName: file name stem (book name + index + chapter title)
    :param text: chapter body to write
    """
    savePath = r"首席御医/{0}.txt".format(chapterName)
    # Create the output directory on first use instead of crashing.
    os.makedirs(os.path.dirname(savePath), exist_ok=True)
    # Explicit UTF-8: the text is Chinese and the platform default encoding
    # (e.g. cp936/latin-1) may not be able to represent it.
    with open(savePath, "w", encoding="utf-8") as file:
        file.write(text)
def nextPage(driver):
    """Click the '下一章' (next chapter) link to advance the browser.

    On failure the current URL is appended to the module-level
    ``errorChapter`` list (defined in the ``__main__`` block) so the
    missed chapters can be retried later.

    :param driver: a live selenium WebDriver positioned on a chapter page
    """
    time.sleep(2)  # throttle: be polite to the server between page loads
    try:
        # find_element_by_link_text() was removed in Selenium 4.
        nextLink = driver.find_element(By.LINK_TEXT, '下一章')
        nextLink.click()
    except Exception:  # narrowed from a bare except: keep KeyboardInterrupt working
        errorChapter.append(driver.current_url)  # record chapters that failed to open
def getChapterName(driver):
    """Return the chapter title — the text of the page's first ``<dt>`` element.

    :param driver: a live selenium WebDriver positioned on a chapter page
    :return: chapter title string
    """
    # find_element_by_tag_name() was removed in Selenium 4.
    return driver.find_element(By.TAG_NAME, 'dt').text
if __name__ == "__main__":
"""
你可以从任意一章开始爬取,只需要改变url的值
"""
url = r"https://www.xs1002.com/biquge43/102452/1274708.html"
chapterNum = 10 # 爬取的章节数量
errorChapter = [] # 记录无法打开的章节
bookName = "首席御医"
driver = webdriver.Edge(r"C:\Program Files (x86)\Microsoft\Edge\Application\msedgedriver.exe")
driver.maximize_window() # 最大化窗口
try:
driver.get(url)
except:
errorChapter.append(driver.current_url)
for i in range(10):
chapterText = getHTMLtext(driver)
chapterName = "{0}-{1:0=4} ({2})".format(bookName, i+1, getChapterName(driver))
saveText(chapterName, chapterText)
nextPage(driver)
driver.close()