Scraping the Novel 《86-不存在的战区》 with a Python Crawler

I don't write blog posts very often. A little while ago I finished the anime and wanted to follow the story in the novels, but I wasn't willing to spend money on the physical books.

For anyone who wants to do the same, the approach is:

1. On the table-of-contents page, grab each chapter title and its corresponding URL and store them in a list of dictionaries.

2. Loop over that list to request each chapter's URL.

3. Extract the chapter body and save it.

4. Run an extra loop per chapter to fetch any continuation pages (chapters with a second or third page end in xxx_2.html, xxx_3.html, and so on); see the sketch right after this list.

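Before the full script, here is a minimal standalone sketch of step 4. It reuses the same assumptions as the script below (continuation pages share the chapter URL with an `_N` suffix, and the body sits in a `<div id="TextContent" class="read-content">`); treating a missing body div as "no more pages" is my own guess at how the site answers a page number that doesn't exist.

```python
import requests
from bs4 import BeautifulSoup

def iter_extra_pages(chapter_full_url, max_pages=20):
    """Yield paragraph text from xxx_2.html, xxx_3.html, ... until a page has no body."""
    for page_number in range(2, max_pages + 1):
        page_url = chapter_full_url.replace(".html", f"_{page_number}.html")
        soup = BeautifulSoup(requests.get(page_url, timeout=10).text, "html.parser")
        content = soup.find("div", id="TextContent", class_="read-content")
        if content is None:  # assumed: no body div means this continuation page does not exist
            break
        for paragraph in content.find_all("p"):
            yield paragraph.text
```

The full script: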
```python
import requests
from bs4 import BeautifulSoup
import os
import time

# Create a directory to store the Markdown files
directory = "86-不存在的战区"
if not os.path.exists(directory):
    os.makedirs(directory)

# Define the target URLs: url is the novel's table of contents, URL is the site root
url = "https://www.23qb.com/book/8204/"
URL = "https://www.23qb.com"

# Send an HTTP GET request for the table-of-contents page
response = requests.get(url)
# Parse the page with BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# Find the novel's chapter list (one <li> per chapter)
chapter_list = soup.find("ul", class_="chaw_c").find_all("li")
# Empty list that will hold one dictionary per chapter
chapters = []

# Loop over the chapter list and collect each chapter's URL
for chapter in chapter_list:
    # Get the chapter title and URL
    chapter_title = chapter.a.text
    chapter_url = chapter.a["href"]

    # Build a dictionary holding the chapter name and URL
    chapter_data = {
        "name": chapter_title,
        "url": chapter_url
    }

    # Add the chapter dictionary to the list
    chapters.append(chapter_data)
    # Print the chapter title and URL
    print("Chapter Title:", chapter_title)
    print("Chapter URL:", chapter_url)
    print("")

# Crawl chapter by chapter
for chapter in chapters:
    # Get the chapter name
    chapter_name = chapter["name"]
    # Get the chapter URL
    chapter_url = chapter["url"]

    # Build the chapter's full URL
    chapter_full_url = URL + chapter_url  # note: this uses URL (the site root), not url (the table of contents)

    # Send an HTTP GET request for the chapter page
    chapter_response = requests.get(chapter_full_url)
    # Parse the chapter page with BeautifulSoup
    chapter_soup = BeautifulSoup(chapter_response.text, "html.parser")

    # Find the chapter body
    chapter_content = chapter_soup.find("div", id="TextContent", class_="read-content")
    # Find all <p> tags inside that <div>
    paragraphs = chapter_content.find_all("p")

    # Build the chapter's Markdown content, starting with the title as a heading
    markdown_content = f"# {chapter_name}\n\n"
    # Print the chapter title
    print("Chapter Name:", chapter_name)
    print("")
    # Loop over each <p> tag and extract its text
    for paragraph in paragraphs:
        # Extract the text of the <p> tag
        paragraph_text = paragraph.text
        markdown_content += f"{paragraph_text}\n\n"

        # Print the body text
        print(paragraph_text)

    # Save the chapter as a Markdown file
    chapter_filename = f"{chapter_name}.md"
    chapter_filepath = os.path.join(directory, chapter_filename)
    with open(chapter_filepath, "w", encoding="utf-8") as file:
        file.write(markdown_content)
    print(f"First page of chapter '{chapter_name}' saved as '{chapter_filename}'")

    time.sleep(1)

    ########### Crawl possible continuation pages (page 2 onwards) ###########
    '''
    Note: the first chapter can get crawled repeatedly!!! Just delete the duplicate by hand afterwards.
    '''
    for page_number in range(2, 21):  # pages 2 through 20
        next_page_url = chapter_full_url.replace(".html", "_{}.html".format(page_number))
        try:
            print("try", next_page_url)
            # Send an HTTP GET request for the continuation page
            next_page_chapter_response = requests.get(next_page_url)
            next_page_chapter_soup = BeautifulSoup(next_page_chapter_response.text, "html.parser")

            # Find the chapter body
            next_page_chapter_content = next_page_chapter_soup.find("div", id="TextContent", class_="read-content")
            # Find all <p> tags inside that <div>
            next_page_paragraphs = next_page_chapter_content.find_all("p")

            # Build the Markdown content for this continuation page
            next_page_markdown_content = ""
            # Loop over each <p> tag and extract its text
            for next_page_paragraph in next_page_paragraphs:
                # Extract the text of the <p> tag
                next_page_paragraph_text = next_page_paragraph.text
                # Accumulate into this page's buffer so it gets appended to the file below
                next_page_markdown_content += f"{next_page_paragraph_text}\n\n"
                # Print the body text
                print(next_page_paragraph_text)
            # Append the continuation page to the same Markdown file
            with open(chapter_filepath, "a", encoding="utf-8") as file:
                file.write(next_page_markdown_content)

            print(f"Saved page {page_number} of chapter '{chapter_filename}'")

            time.sleep(1)

        except Exception:
            # Any failure (typically find() returning None because the page does not exist) ends this chapter's pagination
            print("Failed to fetch next page:", next_page_url)
            break

    print(f"All pages of chapter '{chapter_name}' saved to '{chapter_filename}'")

print("All chapters have been saved as Markdown files.")


# https://www.23qb.com/book/8204/4471631.html

# https://www.23qb.com/book/8204/93299416.html
```
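Two things the script leaves out that may matter in practice, neither of which is from the original post: a browser-like User-Agent plus a timeout on the requests (some sites reject or stall the default requests client), and sanitizing chapter titles before using them as filenames (a title containing / or ? would break open()). A sketch, with an example User-Agent string and a hypothetical safe_filename helper:

```python
import re
import requests

# Example browser-like header; the exact string is just an illustration
HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}

def fetch(url):
    """GET a page with headers and a timeout, and fix up the text encoding."""
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()                      # treat HTTP errors as exceptions
    response.encoding = response.apparent_encoding   # guard against mis-detected encodings on Chinese pages
    return response.text

def safe_filename(title):
    """Replace characters that are not allowed in Windows/Linux filenames."""
    return re.sub(r'[\\/:*?"<>|]', "_", title).strip()
```

Wiring these in would mean calling fetch(...) wherever the script uses requests.get(...).text, and safe_filename(chapter_name) when building chapter_filename.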