# 多协程下载小说 — download novels with multiple coroutines
import requests
import aiohttp
import asyncio
import aiofiles
from urllib import parse
import time
from bs4 import BeautifulSoup
import warnings
import os
import json
from tqdm import tqdm
warnings.filterwarnings("ignore")
# 流程代码
def get_html_attrs(html, attrs: dict, features="html.parser", name="div"):
    """Parse *html* and return the first tag matching *name* and *attrs*.

    Args:
        html: Raw HTML (str or bytes).
        attrs (dict): Attribute filter passed to ``BeautifulSoup.find``.
        features (str): Parser backend name. Defaults to "html.parser".
        name (str): Tag name to look for. Defaults to "div".

    Returns:
        The first matching tag, or None when nothing matches.
    """
    soup = BeautifulSoup(html, features=features)
    return soup.find(name=name, attrs=attrs)
def build_file_path(book_name):
    """Ensure the per-book chapter directory exists and return its path."""
    book_dir = base_dir + "/" + book_name
    os.makedirs(book_dir, exist_ok=True)
    return book_dir
def build_complete_book(book_name):
    """Concatenate every downloaded chapter file into ``<base_dir>/<book_name>.txt``.

    Chapters are appended in the order produced by
    ``get_local_chapter_name_list`` (i.e. sorted by chapter number).
    """
    out_path = f"{base_dir}/{book_name}.txt"
    with open(out_path, "a", encoding=encoding) as out:
        for chapter_file in get_local_chapter_name_list(book_name):
            with open(f"{base_dir}/{book_name}/{chapter_file}", "r", encoding=encoding) as src:
                out.write(src.read())
            out.write("\n")
def get_local_chapter_name_list(book_name):
    """List the downloaded chapter file names, sorted by chapter number."""
    return sorted_chapter(os.listdir(f"{base_dir}/{book_name}"))
def send_request(url):
    """Synchronous GET with the module-level headers.

    Certificate verification is disabled (some mirrors have broken TLS);
    the request times out after 50 seconds.
    """
    return requests.get(url, headers=headers, verify=False, timeout=50)
async def download_chapter(url, title, book_name, session):
    """Fetch one chapter page and write its text under the book directory.

    Args:
        url (str): Chapter path relative to ``server_url``.
        title (str): File name to write (e.g. "第3章").
        book_name (str): Book directory name under ``base_dir``.
        session (aiohttp.ClientSession): Shared HTTP session.

    Downloads are best-effort: a failed chapter is reported but never
    aborts the other coroutines.
    """
    try:
        async with session.get(f"{server_url}{url}") as response:
            html = await response.text()
            content = get_chapter_content(html)
            chapter_name = get_chapter_name(html)
            content = chapter_name + "\n\u3000\u3000" + content
            async with aiofiles.open(
                f"{base_dir}/{book_name}/{title}", mode="w", encoding=encoding
            ) as f:
                await f.write(content + "\n")
    except Exception as e:
        # Was a silent `pass`: report the failure so missing chapters are
        # diagnosable, while still letting the rest of the book download.
        print(f"下载失败 {title} ({url}): {e!r}")
async def download_book(chapter_dir_url, start=0, end=-1):
    """Download a whole book concurrently, then merge it into one text file.

    Args:
        chapter_dir_url (str): Chapter-directory path, e.g. "/book/1207/".
        start (int, optional): First chapter number to fetch. Defaults to 0 (from the beginning).
        end (int, optional): Last chapter number to fetch. Defaults to -1 (no upper limit).
    """
    book_name = get_book_name(chapter_dir_url)
    build_file_path(book_name)
    chapter_list = get_chapter_dir_list(chapter_dir_url)
    tasks = []
    # Some of these sites serve https with broken certificates, hence verify_ssl=False.
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(verify_ssl=False)
    ) as session:
        # `title` is a sequential output index: downloaded files are named
        # 第1章, 第2章, ... regardless of the site's own chapter naming.
        title = 1
        for chapter_name, chapter_url in chapter_list:
            current = get_chapter_number(chapter_name)
            # Precedence note: this reads as
            # `(not current) or (start != 0 and current < start)` —
            # skip unparseable names, and skip chapters before `start`.
            if not current or start != 0 and current < start:
                continue
            # Chapter list is assumed ascending, so stop once past `end`.
            if end != -1 and current > end:
                break
            # Plain coroutines (not pre-created Tasks) so as_completed below
            # can drive them while tqdm shows progress per finished chapter.
            tasks.append(
                download_chapter(chapter_url, f"第{str(title)}章", book_name, session)
            )
            title += 1
        # Progress bar: advance one tick each time any chapter finishes.
        for i in tqdm(
            asyncio.as_completed(tasks),
            desc=f"《{book_name}》 开始下载",
            total=len(tasks),
            ncols=100,
            unit="章",
            mininterval=0.01,
        ):
            await i
    # All chapter files are on disk now; stitch them into a single .txt.
    build_complete_book(book_name)
def main(search_book_name):
    """Search the site for *search_book_name* and download every result.

    Args:
        search_book_name (str): Novel title to search for.
    """
    chapter_dir_urls = get_book_url_list(search_book_name)
    if not chapter_dir_urls:
        return print(f"小说:{search_book_name} 不存在")
    for chapter_dir_url in chapter_dir_urls:
        # asyncio.run replaces the deprecated (since 3.10)
        # get_event_loop()/run_until_complete pattern; each book gets a
        # fresh event loop, which is fine — download_book owns its session.
        asyncio.run(download_book(chapter_dir_url))
# 定制化代码,根据不同的网站定制化提取内容
def get_chapter_dir_list(chapter_dir_url):
    """Fetch the chapter index page and list its chapters.

    Args:
        chapter_dir_url (str): Chapter-directory path relative to ``server_url``.

    Returns:
        list: ``(chapter_name, chapter_url)`` tuples, with the
        "展开全部章节" (expand-all) pseudo-link filtered out.
    """
    response = send_request(url=f"{server_url}{chapter_dir_url}")
    container = get_html_attrs(
        response.content,
        attrs=html_attrs["chapter_dir"]["attr"],
        name=html_attrs["chapter_dir"]["name"],
    )
    return [
        (link.text, link.get("href"))
        for link in container.find_all("a")
        if "展开全部章节" not in link.text
    ]
def get_chapter_content(html):
    """Extract the chapter body text from a chapter page.

    Args:
        html (str): Raw chapter page HTML.

    Returns:
        str: Paragraphs re-joined with newline + full-width-space indent.
    """
    node = get_html_attrs(
        html,
        attrs=html_attrs["chapter_content"]["attr"],
        name=html_attrs["chapter_content"]["name"],
    )
    # Paragraphs are delimited by double ideographic spaces (U+3000);
    # drop the leading boilerplate segments and the trailing one.
    paragraphs = node.text.split("\u3000\u3000")[2:-1]
    return "\n\u3000\u3000".join(paragraphs)
def get_chapter_name(html):
    """Return the chapter title text from a chapter page."""
    title_node = get_html_attrs(
        html,
        attrs=html_attrs["chapter_name"]["attr"],
        name=html_attrs["chapter_name"]["name"],
    )
    return title_node.text
def get_book_name(chapter_dir_url):
    """Fetch the chapter index page and return the book title (first <h1>)."""
    response = send_request(url=f"{server_url}{chapter_dir_url}")
    info_node = get_html_attrs(
        response.content,
        attrs=html_attrs["book_name"]["attr"],
        name=html_attrs["book_name"]["name"],
    )
    return info_node.find_all("h1")[0].text
def get_book_url_list(book_name):
    """Search the site for *book_name*.

    The search endpoint is unreliable; any failure is treated as
    "no results" rather than an error.

    Args:
        book_name (str): Novel title to search for.

    Returns:
        list: Chapter-directory URLs for each search hit (empty on failure).
    """
    try:
        params = parse.urlencode({"q": book_name})
        resopnse = send_request(url=f"{server_url}/user/search.html?{params}")
        chapter_dir_urls = [(i["url_list"]) for i in json.loads(resopnse.content)]
    except Exception as e:
        # Was `print("搜索功能暂时无法使用")` with the exception dropped;
        # include it so network vs. JSON-schema failures are distinguishable.
        print(f"搜索功能暂时无法使用: {e!r}")
        chapter_dir_urls = []
    return chapter_dir_urls
def get_chapter_number(name):
    """Extract the chapter number from names like ``第12章`` or ``第十二章``.

    Args:
        name (str): Chapter name containing 第…章.

    Returns:
        int | bool: The chapter number, or False when it cannot be parsed.
    """
    try:
        between = name.split("第")[1].split("章")[0]
        return int(between) if between.isdigit() else chinese_to_number(between)
    except Exception:
        return False
def chinese_to_number(chinese):
    """Convert a simple Chinese numeral (up to 千 scale) to an int.

    Supports digits 一..九, units 十/百/千, and skips 零 placeholders:
    e.g. ``二十一`` -> 21, ``一百零五`` -> 105, ``十`` -> 10.

    Args:
        chinese (str): Chinese numeral string.

    Returns:
        int: The numeric value.

    Raises:
        KeyError: If the string contains an unsupported character
            (callers such as get_chapter_number catch this and treat
            the name as unnumbered).
    """
    # Renamed from `map`, which shadowed the builtin.
    digits = {"一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    units = {"十": 10, "百": 100, "千": 1000}
    pending = 1  # digit awaiting a unit; bare 十 acts as 一十
    total = 0
    for ch in chinese:
        if ch == "零":
            continue  # 零 is a positional placeholder, contributes nothing
        if ch in units:
            total += pending * units[ch]
            pending = 0
        else:
            pending = digits[ch]
    # A trailing unit-less digit is the ones place.
    return total + pending
def sorted_chapter(data):
    """Sort chapter file names numerically by their parsed chapter number."""
    def chapter_key(name):
        return int(get_chapter_number(name))

    return sorted(data, key=chapter_key)
if __name__ == "__main__":
    # Site-specific CSS selectors for the pieces we scrape.
    html_attrs = {
        "book_name": {"attr": {"class": "info"}, "name": "div"},
        "chapter_content": {"attr": {"id": "chaptercontent"}, "name": "div"},
        "chapter_dir": {"attr": {"class": "listmain"}, "name": "div"},
        "chapter_name": {"attr": {"class": "wap_none"}, "name": "h1"},
    }
    encoding = "utf-8"
    base_dir = "book"
    server_url = "https://www.quge9.cc"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        # "host": "www.quge9.cc",
    }
    # Download a single novel by its chapter-directory path.
    chapter_dir_url = "/book/1207/"
    # asyncio.run replaces the deprecated get_event_loop()/run_until_complete
    # pattern (deprecated since Python 3.10).
    asyncio.run(download_book(chapter_dir_url))
    # Search-based download:
    # main("龙虎道主")