I'm not someone who blogs often. A few days ago I finished the anime adaptation and wanted to keep reading the novel, but I didn't want to pay for the physical books.
Feel free to use this as a reference if you need it. The approach:
1. On the table-of-contents page, grab each chapter title and its URL and stash them in a dict or list.
2. Loop over the chapters and visit each chapter's URL.
3. Fetch the chapter body and save it.
4. Run an extra loop for each chapter's follow-up pages (chapters with a second or third page end in xxx_2.html, xxx_3.html, and so on; see the short sketch below).
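Before the full script, here is a minimal sketch of the follow-up-page URL pattern from step 4. The base URL is one of the example chapter URLs noted at the end of this post; the pattern itself is just a string replace:

# Minimal sketch of step 4's URL pattern, using an example chapter URL from this site
base = "https://www.23qb.com/book/8204/4471631.html"
for page_number in range(2, 4):
    # xxx.html -> xxx_2.html, xxx_3.html, ...
    print(base.replace(".html", "_{}.html".format(page_number)))
# prints:
# https://www.23qb.com/book/8204/4471631_2.html
# https://www.23qb.com/book/8204/4471631_3.html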
import requests
from bs4 import BeautifulSoup
import os
import time

# Create a directory to store the Markdown files
directory = "86-不存在的战区"
if not os.path.exists(directory):
    os.makedirs(directory)

# URL of the novel's table of contents, plus the site root for building absolute links
url = "https://www.23qb.com/book/8204/"
URL = "https://www.23qb.com"

# Send an HTTP GET request for the table-of-contents page
response = requests.get(url)
response.encoding = response.apparent_encoding  # defensive: re-detect encoding so the Chinese text decodes correctly

# Parse the page with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Locate the chapter list
chapter_list = soup.find("ul", class_="chaw_c").find_all("li")
# Collect the chapters as a list of dicts
chapters = []

# Walk the chapter list and record each chapter's URL
for chapter in chapter_list:
    # Chapter title and (relative) URL
    chapter_title = chapter.a.text
    chapter_url = chapter.a["href"]
    # Dict holding the chapter name and URL
    chapter_data = {
        "name": chapter_title,
        "url": chapter_url
    }
    # Add it to the list
    chapters.append(chapter_data)
    # Print the chapter title and URL
    print("Chapter Title:", chapter_title)
    print("Chapter URL:", chapter_url)
    print("")
# Scrape chapter by chapter
for chapter in chapters:
    # Chapter name and URL
    chapter_name = chapter["name"]
    chapter_url = chapter["url"]
    # Build the chapter's absolute URL (note: URL, the site root, not url)
    chapter_full_url = URL + chapter_url
    # Fetch the chapter page
    chapter_response = requests.get(chapter_full_url)
    chapter_response.encoding = chapter_response.apparent_encoding
    # Parse it with BeautifulSoup
    chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")
    # Locate the chapter body
    chapter_content = chapter_soup.find("div", id="TextContent", class_="read-content")
    # All <p> tags under that <div>
    paragraphs = chapter_content.find_all("p")
    # Start the chapter's Markdown content with a heading
    markdown_content = f"# {chapter_name}\n\n"
    # Print the chapter name
    print("Chapter Name:", chapter_name)
    print("")
    # Extract the text of each <p> tag
    for paragraph in paragraphs:
        paragraph_text = paragraph.text
        markdown_content += f"{paragraph_text}\n\n"
        # Print the body text
        print(paragraph_text)
    # Save the chapter as a Markdown file
    chapter_filename = f"{chapter_name}.md"
    chapter_filepath = os.path.join(directory, chapter_filename)
    with open(chapter_filepath, "w", encoding="utf-8") as file:
        file.write(markdown_content)
    print(f"First page of chapter '{chapter_name}' saved as '{chapter_filename}'")
    time.sleep(1)
    ########## Follow-up pages, if the chapter has them ##########
    # Note: the first chapter ends up scraped twice!!! Just delete the duplicate by hand afterwards.
    for page_number in range(2, 21):  # pages 2 through 20
        next_page_url = chapter_full_url.replace(".html", "_{}.html".format(page_number))
        try:
            print("try", next_page_url)
            # Fetch the follow-up page
            next_page_chapter_response = requests.get(next_page_url)
            next_page_chapter_soup = BeautifulSoup(next_page_chapter_response.text, "html.parser")
            # Locate the chapter body
            next_page_chapter_content = next_page_chapter_soup.find("div", id="TextContent", class_="read-content")
            # All <p> tags under that <div>
            next_page_paragraphs = next_page_chapter_content.find_all("p")
            # Markdown content for this page
            next_page_markdown_content = ""
            # Extract the text of each <p> tag
            for next_page_paragraph in next_page_paragraphs:
                next_page_paragraph_text = next_page_paragraph.text
                # Bug fix: accumulate into next_page_markdown_content (the original
                # appended to markdown_content, so nothing was ever written below)
                next_page_markdown_content += f"{next_page_paragraph_text}\n\n"
                # Print the body text
                print(next_page_paragraph_text)
            # Append the follow-up page to the same Markdown file
            with open(chapter_filepath, "a", encoding="utf-8") as file:
                file.write(next_page_markdown_content)
            print(f"Saved page {page_number} of chapter '{chapter_filename}'")
            time.sleep(1)
        except Exception:
            # No such page (or it lacks the content div): stop paging this chapter
            print("Failed to next_page:", next_page_url)
            break
    print(f"All pages of chapter '{chapter_name}' saved as '{chapter_filename}'")
print("所有章节已保存为 Markdown 文件。")
# Example chapter URLs for reference:
# https://www.23qb.com/book/8204/4471631.html
# https://www.23qb.com/book/8204/93299416.html
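If you'd rather not delete the duplicated first-chapter content by hand, one possible fix is to skip a follow-up page whose text is already in the saved chapter file. This is an untested sketch, assuming the duplicate really is a verbatim copy of a page that was already written; is_duplicate_page is a hypothetical helper, not part of the script above:

def is_duplicate_page(chapter_filepath, page_markdown):
    # Hypothetical helper: True if this page's text already appears in the saved file
    if not page_markdown.strip():
        return True  # treat empty pages as duplicates so nothing useless is appended
    with open(chapter_filepath, "r", encoding="utf-8") as file:
        return page_markdown.strip() in file.read()

# In the follow-up-page loop, guard the append with:
#     if is_duplicate_page(chapter_filepath, next_page_markdown_content):
#         break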