Teaching myself Python web scraping: this script crawls a novel from the Ciyuanji site (次元姬小说) and saves it to a specified directory. Just change the novel's URL and the save path and it scrapes automatically. Source code shared below.
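The script targets Python 3 and relies on two third-party packages, requests and beautifulsoup4; assuming you use pip, installing them with "pip install requests beautifulsoup4" is enough.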
import requests  # library for sending HTTP requests
from bs4 import BeautifulSoup  # library for parsing the fetched HTML
import os
import time

urls = []        # URLs of free chapters
names = []       # titles of free chapters
fees_urls = []   # URLs of paid (locked) chapters
fees_names = []  # titles of paid (locked) chapters

# A browser-like User-Agent so the request is not rejected as an obvious bot
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
}
def get_count(url):
    """Fetch the book's detail page and collect every chapter's URL and title."""
    response = requests.get(url, headers=headers)
    if response.ok:
        soup = BeautifulSoup(response.text, "html.parser")
        # Free chapters; the class string must match the tag's class attribute exactly
        all_a = soup.find_all("a", attrs={"class": "Link_link__LjyZ2 book_detail_item__EMrK7"})
        # Paid chapters carry an extra "lock" class
        fees = soup.find_all("a", attrs={"class": "Link_link__LjyZ2 book_detail_item__EMrK7 book_detail_lock__eNRvE"})
        for a in all_a:
            urls.append(f'https://www.ciyuanji.com{a["href"]}')
            names.append(a.string)
        for fee in fees:
            fees_urls.append(f'https://www.ciyuanji.com{fee["href"]}')
            fees_names.append(fee.string)
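
# After get_count() runs, the four module-level lists hold parallel data that
# main() later pairs up with zip(), e.g. (hypothetical values):
#   urls  -> ['https://www.ciyuanji.com/chapter/...', ...]
#   names -> ['Chapter 1 ...', ...]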
def get_fiction(url, name):
    """Download one chapter and save it as <name>.txt under folder_path."""
    response = requests.get(url, headers=headers)
    if response.ok:
        folder_path = r"D:\1\pythonTest\fiction"  # save path for the novel
        if not os.path.exists(folder_path):  # check whether the folder exists
            # Create the folder if it does not exist yet
            os.makedirs(folder_path)
        file_name = f"{name}.txt"
        # Build the full file path
        file_path = os.path.join(folder_path, file_name)
        # encoding='utf-8' so the Chinese text is written correctly on Windows
        with open(file_path, 'w', encoding='utf-8') as f:
            soup = BeautifulSoup(response.content, "html.parser")
            articles = soup.find_all('article', attrs={"class": "chapter_article__vWEkb"})
            # Each paragraph of the chapter body is a <p> inside the article
            for p in articles[0].find_all('p'):
                f.write(p.get_text() + "\n")
        # The with-block flushes and closes the file automatically
def main():
    get_count("https://www.ciyuanji.com/b_d_1226.html")  # the book's detail page
    urls.extend(fees_urls)  # append the paid chapters after the free ones
    names.extend(fees_names)
    for url, name in zip(urls, names):
        get_fiction(url, name)
        print(f'{name} downloaded')
        time.sleep(10)  # sleep 10 seconds before sending the next request


if __name__ == '__main__':
    main()
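
One caveat: chapter titles are taken straight from the page, and on Windows a title containing a character like ? * : " or | will make open() fail. A minimal sketch of a sanitizer you could apply to name before building file_name; the safe_name helper below is my own addition, not part of the original script:

import re

def safe_name(name):
    # Replace characters that Windows forbids in file names with '_'
    return re.sub(r'[\\/:*?"<>|]', '_', str(name)).strip()

# Usage inside get_fiction: file_name = f"{safe_name(name)}.txt"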