Python 小说下载

多协程下载小说

import requests
import aiohttp
import asyncio
import aiofiles
from urllib import parse
import time
from bs4 import BeautifulSoup
import warnings
import os
import json
from tqdm import tqdm

warnings.filterwarnings("ignore")


# 流程代码
def get_html_attrs(html, attrs: dict, features="html.parser", name="div"):
    """Parse ``html`` and look up the first tag matching ``name`` and ``attrs``.

    Args:
        html: Markup handed to ``BeautifulSoup``.
        attrs (dict): Attribute filter forwarded to ``find``.
        features (str): Parser name passed to ``BeautifulSoup``.
        name (str): Tag name to search for.

    Returns:
        The result of ``BeautifulSoup.find`` for the given query.
    """
    soup = BeautifulSoup(html, features=features)
    return soup.find(name=name, attrs=attrs)


def build_file_path(book_name):
    """Ensure the directory for ``book_name`` exists under ``base_dir``.

    Args:
        book_name (str): Name of the book; used as the directory name.

    Returns:
        str: The directory path, ``"<base_dir>/<book_name>"``.
    """
    book_dir = f"{base_dir}/{book_name}"
    os.makedirs(book_dir, exist_ok=True)
    return book_dir


def build_complete_book(book_name):
    """Merge all downloaded chapter files of a book into one text file.

    Chapter files are read from ``<base_dir>/<book_name>/`` in the order
    returned by ``get_local_chapter_name_list`` and concatenated into
    ``<base_dir>/<book_name>.txt``, followed by a single trailing newline.

    Args:
        book_name (str): Name of the book (directory and output file stem).
    """
    chapter_names = get_local_chapter_name_list(book_name)
    # Open with "w", not "a": each call re-reads every chapter file in the
    # directory, so appending would duplicate the whole book on a re-run.
    with open(f"{base_dir}/{book_name}.txt", "w", encoding=encoding) as book_file:
        for chapter_file in chapter_names:
            chapter_path = f"{base_dir}/{book_name}/{chapter_file}"
            with open(chapter_path, "r", encoding=encoding) as chapter:
                book_file.write(chapter.read())
        book_file.write("\n")


def get_local_chapter_name_list(book_name):
    """Return the file names in the book's directory, ordered by chapter number.

    Args:
        book_name (str): Name of the book directory under ``base_dir``.

    Returns:
        list: Directory entries sorted by ``sorted_chapter``.
    """
    entries = os.listdir(f"{base_dir}/{book_name}")
    return sorted_chapter(entries)


def send_request(url):
    """Send a blocking GET request to ``url`` and return the response.

    The request uses the module-level ``headers``, skips certificate
    verification (``verify=False``) and times out after 50 seconds.
    """
    return requests.get(url, headers=headers, verify=False, timeout=50)


async def download_chapter(url, title, book_name, session):
    """Download one chapter page and write its text to a file.

    The page at ``server_url + url`` is fetched through ``session``; the
    chapter heading and body are extracted with ``get_chapter_name`` and
    ``get_chapter_content`` and written to ``<base_dir>/<book_name>/<title>``.

    Any failure is reported and then swallowed, so a single bad chapter does
    not abort the rest of the download batch.

    Args:
        url (str): Chapter path, appended to ``server_url``.
        title (str): File name to write inside the book directory.
        book_name (str): Name of the book directory under ``base_dir``.
        session: Object whose ``get(...)`` is used as an async context
            manager yielding a response with an awaitable ``text()``.
    """
    try:
        async with session.get(f"{server_url}{url}") as response:
            html = await response.text()
        body = get_chapter_content(html)
        chapter_name = get_chapter_name(html)
        content = chapter_name + "\n\u3000\u3000" + body
        async with aiofiles.open(
            f"{base_dir}/{book_name}/{title}", mode="w", encoding=encoding
        ) as chapter_file:
            await chapter_file.write(content + "\n")
    except Exception as error:
        # Best-effort download: keep going, but make the failure visible.
        # tqdm.write prints without corrupting the active progress bar.
        tqdm.write(f"章节下载失败: {title} ({url}): {error!r}")


async def download_book(chapter_dir_url, start=0, end=-1):
    """Download the chapters of one book concurrently, then merge them.

    Args:
        chapter_dir_url (str): Path of the book's chapter index page,
            relative to ``server_url``.
        start (int, optional): First chapter number to download; ``0``
            disables the lower bound. Defaults to 0.
        end (int, optional): Last chapter number to download; ``-1``
            disables the upper bound. Defaults to -1.
    """
    book_name = get_book_name(chapter_dir_url)
    build_file_path(book_name)
    chapter_list = get_chapter_dir_list(chapter_dir_url)
    tasks = []
    # Certificate verification is disabled because some sites are served
    # over HTTPS. ``ssl=False`` replaces the deprecated ``verify_ssl=False``.
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(ssl=False)
    ) as session:
        for chapter_name, chapter_url in chapter_list:
            current = get_chapter_number(chapter_name)
            # Skip entries without a usable chapter number and those that
            # come before the requested start.
            if not current or (start != 0 and current < start):
                continue
            # Stop at the first chapter past the requested end.
            if end != -1 and current > end:
                break
            # Files are named by download order (1, 2, 3, ...), not by the
            # number parsed from the chapter title.
            file_title = f"第{len(tasks) + 1}章"
            tasks.append(
                download_chapter(chapter_url, file_title, book_name, session)
            )
        # Progress bar: advance once per finished chapter download.
        for finished in tqdm(
            asyncio.as_completed(tasks),
            desc=f"《{book_name}》 开始下载",
            total=len(tasks),
            ncols=100,
            unit="章",
            mininterval=0.01,
        ):
            await finished

    build_complete_book(book_name)


def main(search_book_name):
    """Search for a book by name and download every matching result.

    Prints a message and returns when the search yields nothing.

    Args:
        search_book_name (str): Book title to search for.
    """
    chapter_dir_urls = get_book_url_list(search_book_name)
    if not chapter_dir_urls:
        print(f"小说:{search_book_name} 不存在")
        return
    for chapter_dir_url in chapter_dir_urls:
        # asyncio.run replaces get_event_loop()/run_until_complete, which is
        # deprecated when no event loop is running.
        asyncio.run(download_book(chapter_dir_url))


# 定制化代码,根据不同的网站定制化提取内容
def get_chapter_dir_list(chapter_dir_url):
    """Fetch the chapter index page and list its chapter links.

    Args:
        chapter_dir_url (str): Index page path, appended to ``server_url``.

    Returns:
        list: ``[(chapter_name, chapter_url), ...]`` holding the text and
        ``href`` of every ``<a>`` inside the configured ``chapter_dir``
        container, excluding links whose text contains "展开全部章节".
    """
    page = send_request(url=f"{server_url}{chapter_dir_url}")
    selector = html_attrs["chapter_dir"]
    container = get_html_attrs(
        page.content, attrs=selector["attr"], name=selector["name"]
    )
    chapters = []
    for link in container.find_all("a"):
        if "展开全部章节" in link.text:
            continue
        chapters.append((link.text, link.get("href")))
    return chapters


def get_chapter_content(html):
    """Extract the chapter body text from a chapter page.

    The text of the configured ``chapter_content`` element is split on
    double ideographic spaces; the first two pieces and the last piece are
    discarded (presumably site boilerplate; confirm against the target
    site) and the rest is re-joined, one paragraph per line.

    Args:
        html (str): Chapter page markup.

    Returns:
        str: Paragraphs joined with ``"\\n\\u3000\\u3000"``.
    """
    selector = html_attrs["chapter_content"]
    container = get_html_attrs(
        html, attrs=selector["attr"], name=selector["name"]
    )
    paragraphs = container.text.split("\u3000\u3000")
    return "\n\u3000\u3000".join(paragraphs[2:-1])


def get_chapter_name(html):
    """Return the text of the configured ``chapter_name`` element in ``html``.

    Args:
        html (str): Chapter page markup.

    Returns:
        str: Text content of the matched element.
    """
    selector = html_attrs["chapter_name"]
    heading = get_html_attrs(html, attrs=selector["attr"], name=selector["name"])
    return heading.text


def get_book_name(chapter_dir_url):
    """Fetch the chapter index page and return the book title.

    The title is the text of the first ``<h1>`` inside the configured
    ``book_name`` container.

    Args:
        chapter_dir_url (str): Index page path, appended to ``server_url``.

    Returns:
        str: The book title.
    """
    page = send_request(url=f"{server_url}{chapter_dir_url}")
    selector = html_attrs["book_name"]
    info = get_html_attrs(
        page.content, attrs=selector["attr"], name=selector["name"]
    )
    headings = info.find_all("h1")
    return headings[0].text


def get_book_url_list(book_name):
    """Query the site's search endpoint for ``book_name``.

    The search is best-effort: on any error a notice is printed and an
    empty list is returned.

    Args:
        book_name (str): Book title to search for.

    Returns:
        list: The ``"url_list"`` value of each item in the JSON response,
        or ``[]`` when the search fails.
    """
    try:
        query = parse.urlencode({"q": book_name})
        reply = send_request(url=f"{server_url}/user/search.html?{query}")
        results = json.loads(reply.content)
        return [item["url_list"] for item in results]
    except Exception:
        print("搜索功能暂时无法使用")
        return []


def get_chapter_number(name):
    """Extract the chapter number from a title such as "第12章 ...".

    The text between the first "第" and the following "章" is taken as the
    number: decimal digits are converted directly, anything else is passed
    to ``chinese_to_number``.

    Args:
        name (str): Chapter title or chapter file name.

    Returns:
        int | bool: The chapter number, or ``False`` when it cannot be
        extracted or converted.
    """
    try:
        token = name.split("第")[1].split("章")[0]
        return int(token) if token.isdigit() else chinese_to_number(token)
    except Exception:
        return False


def chinese_to_number(chinese):
    """Convert a Chinese numeral below ten thousand to an int.

    Supports the digits 一 to 九 (plus 两 for two), the zero placeholders
    零 and 〇, and the units 十, 百 and 千. A leading bare unit implies a
    multiplier of one, so "十五" is 15.

    Args:
        chinese (str): Numeral text, e.g. "一百零五".

    Returns:
        int: The numeric value; 0 for an empty string or a bare zero.

    Raises:
        KeyError: If ``chinese`` contains an unsupported character.
    """
    digit_values = {
        "一": 1,
        "二": 2,
        "两": 2,
        "三": 3,
        "四": 4,
        "五": 5,
        "六": 6,
        "七": 7,
        "八": 8,
        "九": 9,
    }
    unit_values = {"十": 10, "百": 100, "千": 1000}
    zero_marks = ("零", "〇")

    total = 0
    # Digit waiting for its unit. None means no digit or unit has been seen
    # yet, which distinguishes a leading "十" (one ten) from empty input.
    pending = None
    for char in chinese:
        if char in zero_marks:
            continue
        if char in unit_values:
            multiplier = 1 if pending is None else pending
            total += multiplier * unit_values[char]
            pending = 0
        else:
            pending = digit_values[char]
    # An empty or zero-only numeral contributes nothing, so it yields 0.
    return total + (pending or 0)


def sorted_chapter(data):
    """Return ``data`` sorted by the chapter number parsed from each name.

    Names for which ``get_chapter_number`` returns ``False`` sort with
    key 0, because ``int(False)`` is 0.
    """

    def chapter_key(chapter_name):
        return int(get_chapter_number(chapter_name))

    return sorted(data, key=chapter_key)


# Site configuration read as module globals by the functions in this file.
# It lives at module level (not under the __main__ guard) so the functions
# also work when this file is imported instead of run as a script.

# Tag name and attribute filter used to locate each page element.
html_attrs = {
    "book_name": {"attr": {"class": "info"}, "name": "div"},
    "chapter_content": {"attr": {"id": "chaptercontent"}, "name": "div"},
    "chapter_dir": {"attr": {"class": "listmain"}, "name": "div"},
    "chapter_name": {"attr": {"class": "wap_none"}, "name": "h1"},
}
# Encoding used for every file written and read.
encoding = "utf-8"
# Root directory for downloaded books.
base_dir = "book"
# Site root; every relative path is appended to it.
server_url = "https://www.quge9.cc"
# Headers sent with the blocking requests made by send_request.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
    # "host": "www.quge9.cc",
}

if __name__ == "__main__":
    # Download a single book by its chapter index path.
    chapter_dir_url = "/book/1207/"
    # asyncio.run replaces get_event_loop()/run_until_complete, which is
    # deprecated when no event loop is running.
    asyncio.run(download_book(chapter_dir_url))

    # Alternative: search by title and download every match.
    # main("龙虎道主")

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值