Dependencies
The scraper is built on an HTTP client and an HTML parser: requests and BeautifulSoup.
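Both can be installed with pip (pip install requests beautifulsoup4). The third import below, XFile, is the author's own small file helper; its full source is listed in the appendix at the end of this post.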
import time  # used by download_chapters to pause between requests
import requests
from bs4 import BeautifulSoup
from utils.xfile import XFile
Requesting page content
Use any plausible request header. Note that some sites filter out requests carrying a Windows header; in that case, construct a mobile-browser header instead.
@staticmethod
def get_response(html_url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
    }
    response = requests.get(url=html_url, headers=headers)
    # Let requests guess the real encoding from the content so non-UTF-8 pages decode correctly
    response.encoding = response.apparent_encoding
    return response
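If the target site rejects the desktop User-Agent above, swap in a mobile one. A minimal sketch; the string below is a generic Android Chrome UA chosen for illustration, not taken from the original post:

headers = {
    # A generic Android Chrome User-Agent (illustrative, not site-specific)
    'User-Agent': 'Mozilla/5.0 (Linux; Android 10; Pixel 3) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36'
}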
Downloading a single chapter
chapter_url: the URL of a single chapter, from which the chapter title and body text are extracted.
file: the file object the text is written to.
The method returns the URL of the next chapter.
def get_one_chapter(self, chapter_url, file):
    # Fetch the page
    response = self.get_response(chapter_url)
    # Load the page content into a BeautifulSoup parser
    soup = BeautifulSoup(response.text, 'html.parser')
    # Chapter title
    chapter_name = soup.find('div', {'class': 'bookname'}).find('h1').text
    print(f'Current chapter: {chapter_name}')
    file.write(f"{chapter_name}\n")
    # Link to the next chapter
    next_chapter = soup.find('div', {'class': 'bookname'}).find('a', {'class': 'next'})
    print(f"Next chapter: {next_chapter['href']}")
    # Chapter body text
    content_lines = soup.find('div', {'id': 'content'}).find_all('p', {'class': 'content_detail'})
    for line in content_lines:
        new_txt = line.text.replace("\r", "").replace("\n", "").replace("\t", "").strip()
        file.write(f"{new_txt}\n")
    file.write("\n\n")
    # Return the path of the next chapter. Only a relative path is returned
    # ('/book/66666/777771123.html'), so the caller must build the full URL.
    return next_chapter['href']
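Callers build the absolute URL by concatenating self._root_url with this relative path. As an aside, urllib.parse.urljoin from the standard library is a slightly more robust way to do the same join, since it also copes with absolute hrefs; this is a suggested alternative, not what the original code does:

from urllib.parse import urljoin

# Handles both relative paths and full URLs in the href
next_url = urljoin('https://www.aaaaa.vip', '/book/66666/777771123.html')
print(next_url)  # https://www.aaaaa.vip/book/66666/777771123.html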
Downloading a range of chapters
save_path: the path the novel is saved to.
The code is short and easy to follow: it loops over the requested number of chapters, downloading one per iteration and following each page's next-chapter link.
def download_chapters(self, save_path, chapter_count):
    """
    save_path      full path of the output file, e.g. d:\\123.txt
    chapter_count  number of chapters to save, e.g. 6 to download six chapters
    """
    next_chapter_url = None
    # Output file
    save_file = XFile(save_path, 'w+')
    for index in range(chapter_count):
        try:
            if next_chapter_url is None:
                # First iteration: start from the configured first chapter
                next_chapter_url = self.get_one_chapter(self._first_chapter_url, save_file)
            else:
                # get_one_chapter returns a relative path, so prepend the root URL
                new_url = self._root_url + next_chapter_url
                next_chapter_url = self.get_one_chapter(new_url, save_file)
            time.sleep(0.5)  # brief pause between requests
        except Exception as e:
            print(f"Error: {e}")
            break
    # Close the file properly so buffered content is flushed
    save_file.close()
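One optional hardening, sketched here as an assumption rather than part of the original code: since the loop breaks on the first error, buffered text only reaches disk when close() runs. You could add a flush method to XFile (the appendix version has none) and call it after each chapter:

# Hypothetical addition to XFile, delegating to the wrapped file object
def flush(self):
    self.file_io.flush()

# ...then inside the download loop, right after time.sleep(0.5):
save_file.flush()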
The complete implementation class
class BiQuGeNovel:
    """
    Created by cjb
    Scrapes novels from a certain biquge-style site
    """

    def __init__(self, root_url, first_chapter_url):
        self._root_url = root_url
        self._first_chapter_url = root_url + first_chapter_url

    @staticmethod
    def get_response(html_url):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36'
        }
        response = requests.get(url=html_url, headers=headers)
        response.encoding = response.apparent_encoding
        return response

    def download_chapters(self, save_path, chapter_count):
        """
        save_path      full path of the output file, e.g. d:\\123.txt
        chapter_count  number of chapters to save, e.g. 6 to download six chapters
        """
        next_chapter_url = None
        # Output file
        save_file = XFile(save_path, 'w+')
        for index in range(chapter_count):
            try:
                if next_chapter_url is None:
                    next_chapter_url = self.get_one_chapter(self._first_chapter_url, save_file)
                else:
                    new_url = self._root_url + next_chapter_url
                    next_chapter_url = self.get_one_chapter(new_url, save_file)
                time.sleep(0.5)  # brief pause between requests
            except Exception as e:
                print(f"Error: {e}")
                break
        # Close the file properly so buffered content is flushed
        save_file.close()

    def get_one_chapter(self, chapter_url, file):
        # Fetch the page
        response = self.get_response(chapter_url)
        # Load the page content into a BeautifulSoup parser
        soup = BeautifulSoup(response.text, 'html.parser')
        # Chapter title
        chapter_name = soup.find('div', {'class': 'bookname'}).find('h1').text
        print(f'Current chapter: {chapter_name}')
        file.write(f"{chapter_name}\n")
        # Link to the next chapter
        next_chapter = soup.find('div', {'class': 'bookname'}).find('a', {'class': 'next'})
        print(f"Next chapter: {next_chapter['href']}")
        # Chapter body text
        content_lines = soup.find('div', {'id': 'content'}).find_all('p', {'class': 'content_detail'})
        for line in content_lines:
            new_txt = line.text.replace("\r", "").replace("\n", "").replace("\t", "").strip()
            file.write(f"{new_txt}\n")
        file.write("\n\n")
        # Return the path of the next chapter. Only a relative path is returned
        # ('/book/66666/77777123.html'), so the caller must build the full URL.
        return next_chapter['href']

    def test_chapter(self):
        # Fetch the page once and save it to a file so it can be re-analyzed offline
        # response = self.get_response(self._first_chapter_url)
        # html_file = XFile('d:\\1a.txt', 'w')
        # html_file.write(response.text)
        # html_file.close()
        read_file = XFile('d:\\1a.txt', 'r')
        text = read_file.read()
        read_file.close()
        # Parse the saved page content
        soup = BeautifulSoup(text, 'html.parser')
        # Extract the title, the next-chapter link, and the body text
        book_name = soup.find('div', {'class': 'bookname'}).find('h1').text
        print(f'Title: {book_name}')
        next_chapter = soup.find('div', {'class': 'bookname'}).find('a', {'class': 'next'})
        print(f"Next chapter: {next_chapter['href']}")
        content_lines = soup.find('div', {'id': 'content'}).find_all('p', {'class': 'content_detail'})
        for line in content_lines:
            new_txt = line.text.replace("\r", "").replace("\n", "").replace("\t", "").strip()
            print(f'line: "{new_txt}"')
Usage
The address below is made up; search for a real site yourself when you actually use this.
# Pass in the URL of the first chapter; each next chapter is discovered from the page content and downloaded automatically
novel = BiQuGeNovel('https://www.aaaaa.vip', '/book/66666/77777660.html')
novel.download_chapters("d:\\ddd.txt", 2)  # download two chapters
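A side note on grabbing a whole book: when the final chapter has no next link, next_chapter['href'] raises an exception, which download_chapters catches before breaking out of the loop. Passing a deliberately oversized chapter_count is therefore a crude way to download everything, assuming the target site's last chapter really does lack a next anchor (the file name here is made up):

novel.download_chapters("d:\\ddd_full.txt", 100000)  # stops when the next link disappears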
Appendix: file reading and writing
import os
import shutil


class XFile:
    """
    Created by cjb for reading / writing files
    read          ====> r, rb
    write         ====> w, wb
    read & write  ====> r+, rb+, w+, wb+
    append        ====> a, a+, ab, ab+
    """

    def __init__(self, file_name, mode, encoding='utf-8'):
        self.file_name = file_name
        # Binary modes do not accept an encoding argument
        if 'b' in mode:
            self.file_io = open(file=self.file_name, mode=mode)
        else:
            self.file_io = open(file=self.file_name, mode=mode, encoding=encoding)

    def __del__(self):
        # Guard against open() having failed in __init__, in which case
        # file_io was never set; closing an already-closed file is a no-op
        if hasattr(self, 'file_io'):
            self.close()

    def write(self, text):
        self.file_io.write(text)

    def read(self):
        return self.file_io.read()

    def close(self):
        self.file_io.close()

    @staticmethod
    def cur_path(end_with_sep=False):
        """
        Return the current working directory.
        @param end_with_sep: whether to append a trailing path separator
        """
        if end_with_sep:
            return os.getcwd() + os.sep
        else:
            return os.getcwd()

    @staticmethod
    def rename(old_name, new_name):
        os.rename(old_name, new_name)

    @staticmethod
    def copy(src, dest):
        """
        Copy a single file
        """
        shutil.copy(src, dest)

    @staticmethod
    def copy_tree(src, dest):
        """
        Copy a whole directory tree, including all files
        """
        shutil.copytree(src, dest)

    @staticmethod
    def move(src, dest):
        """
        Move a single file. Within the same directory this is a rename;
        an existing file with the same name at the destination is overwritten.
        """
        shutil.move(src, dest)

    @staticmethod
    def mkdir(path):
        """
        Create a directory, including any missing intermediate directories
        """
        os.makedirs(path)

    @staticmethod
    def get_dir(file_path):
        """
        Return the directory part of a full file path
        """
        return os.path.dirname(file_path)

    @staticmethod
    def get_file(file_path):
        """
        Return the file-name part of a full file path
        """
        return os.path.basename(file_path)

    @staticmethod
    def rm(path):
        """
        Remove a single file (os.remove cannot delete directories;
        use rm_tree for those)
        """
        os.remove(path)

    @staticmethod
    def rm_tree(path):
        """
        Remove a whole directory tree, including all files and subdirectories
        """
        shutil.rmtree(path)

    @staticmethod
    def scan_files(path, suffix=None):
        """
        Recursively collect all files under a directory, optionally filtering
        by extension. Returns a list of full paths.
        suffix includes the dot, e.g. '.jpg'
        """
        result = []
        for root, dirs, files in os.walk(path):
            for file in files:
                if suffix is not None:
                    if file.endswith(suffix):
                        result.append(os.path.join(root, file))
                else:
                    result.append(os.path.join(root, file))
        return result
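To close, a couple of quick usage sketches for XFile; the paths are invented for illustration:

# Recursively list every .txt file under a directory
for path in XFile.scan_files('d:\\novels', '.txt'):
    print(path)

# Write a small file, then read it back
out_file = XFile('d:\\demo.txt', 'w')
out_file.write('hello\n')
out_file.close()
in_file = XFile('d:\\demo.txt', 'r')
print(in_file.read())
in_file.close()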