import requests
from bs4 import BeautifulSoup
# 章节目录中寻找每一章的ip地址
# Scrape the chapter index page and collect each chapter's URL.
def chapters(base_url=""):
    """Return a list of (chapter_url, chapter_title) tuples from the index page.

    Parameters
    ----------
    base_url : str
        URL of the novel's chapter index page. Check whether the site
        permits scraping before use; for study/educational purposes only.

    Returns
    -------
    list[tuple[str, str]]
        One (href, title) pair per chapter link found; empty list when
        the expected chapter list element is missing.
    """
    resp = requests.get(base_url)
    resp.encoding = "utf-8"
    bf = BeautifulSoup(resp.text, "html.parser")
    data = []
    chapter_list = bf.find("ul", id="htmlChapterList")
    if chapter_list is None:
        # Layout changed or fetch failed: return no chapters instead of
        # raising AttributeError on None.find_all below.
        return data
    for li in chapter_list.find_all("li"):
        link_a = li.find("a")
        if not link_a:
            continue
        data.append((link_a["href"], link_a.get_text()))
    # Print once after collection — the original printed the growing
    # list on every iteration, which is O(n^2) output.
    print(data)
    return data
# 返回网页的url文本
# Fetch one chapter page and return its body text.
def content_novel(url):
    """Return the text of the chapter page's div#htmlContent, or None.

    Parameters
    ----------
    url : str
        URL of a single chapter page.

    Returns
    -------
    str | None
        Text content of the div with id "htmlContent", or None when the
        element is absent from the page.
    """
    res = requests.get(url)
    res.encoding = "UTF-8"
    bf = BeautifulSoup(res.text, "html.parser")
    # Search the tree once; the original called find() twice.
    content_div = bf.find("div", id="htmlContent")
    if content_div is None:
        return None
    return content_div.get_text()
# Download the novel, saving each chapter to its own .txt file.
novels = chapters()
numbers = len(novels)
# Reuse the already-fetched chapter list — the original called
# chapters() a second time in the for statement, re-downloading the
# index page and doubling all requests. enumerate replaces the
# hand-rolled `ids` counter.
for ids, (url, title) in enumerate(novels, start=1):
    print("download: " + str(ids) + "," + "total:" + str(numbers))
    # Fetch the chapter exactly once (the original fetched it twice:
    # once for the emptiness check, once for the write).
    text = content_novel(url)
    if not text:
        # Skip before opening the file, so missing chapters no longer
        # leave empty .txt files behind.
        continue
    with open("%s.txt" % title, 'w', encoding="UTF-8") as fwrite:
        fwrite.write(text)
# BeautifulSoup novel downloader
# First published 2023-11-18 10:53:29