# 多协程下载小说 — download novels with multiple coroutines
import requests
import aiohttp
import asyncio
import aiofiles
from urllib import parse
import time
from bs4 import BeautifulSoup
import warnings
import os
import json
from tqdm import tqdm
warnings.filterwarnings("ignore")
# 流程代码
def get_html_attrs(html, attrs: dict, features="html.parser", name="div"):
    """Parse *html* and return the first tag matching *name* and *attrs*.

    Args:
        html: Raw HTML (str or bytes).
        attrs (dict): Attribute filter passed to ``BeautifulSoup.find``.
        features (str): Parser backend name. Defaults to "html.parser".
        name (str): Tag name to look for. Defaults to "div".

    Returns:
        The first matching tag, or None when nothing matches.
    """
    soup = BeautifulSoup(html, features=features)
    return soup.find(name=name, attrs=attrs)
def build_file_path(book_name):
    """Ensure the per-book chapter directory exists and return its path."""
    book_dir = base_dir + "/" + book_name
    os.makedirs(book_dir, exist_ok=True)
    return book_dir
def build_complete_book(book_name):
    """Concatenate every downloaded chapter file into ``<base_dir>/<book_name>.txt``.

    Chapters are appended in the order produced by
    ``get_local_chapter_name_list`` (i.e. sorted by chapter number).
    """
    out_path = f"{base_dir}/{book_name}.txt"
    with open(out_path, "a", encoding=encoding) as out:
        for chapter_file in get_local_chapter_name_list(book_name):
            with open(f"{base_dir}/{book_name}/{chapter_file}", "r", encoding=encoding) as src:
                out.write(src.read())
            out.write("\n")
def get_local_chapter_name_list(book_name):
    """List the downloaded chapter file names, sorted by chapter number."""
    return sorted_chapter(os.listdir(f"{base_dir}/{book_name}"))
def send_request(url):
    """Synchronous GET with the module-level headers.

    Certificate verification is disabled (some mirrors have broken TLS);
    the request times out after 50 seconds.
    """
    return requests.get(url, headers=headers, verify=False, timeout=50)
async def download_chapter(url, title, book_name, session):
    """Fetch one chapter page and write its text under the book directory.

    Args:
        url (str): Chapter path relative to ``server_url``.
        title (str): File name to write (e.g. "第3章").
        book_name (str): Book directory name under ``base_dir``.
        session (aiohttp.ClientSession): Shared HTTP session.

    Downloads are best-effort: a failed chapter is reported but never
    aborts the other coroutines.
    """
    try:
        async with session.get(f"{server_url}{url}") as response:
            html = await response.text()
            content = get_chapter_content(html)
            chapter_name = get_chapter_name(html)
            content = chapter_name + "\n\u3000\u3000" + content
            async with aiofiles.open(
                f"{base_dir}/{book_name}/{title}", mode="w", encoding=encoding
            ) as f:
                await f.write(content + "\n")
    except Exception as e:
        # Was a silent `pass`: report the failure so missing chapters are
        # diagnosable, while still letting the rest of the book download.
        print(f"下载失败 {title} ({url}): {e!r}")
async def download_book(chapter_dir_url, start=0, end=-1):
    """Download a whole book concurrently, then merge it into one text file.

    Args:
        chapter_dir_url (str): Chapter-directory path, e.g. "/book/1207/".
        start (int, optional): First chapter number to fetch. Defaults to 0 (from the beginning).
        end (int, optional): Last chapter number to fetch. Defaults to -1 (no upper limit).
    """
    book_name = get_book_name(chapter_dir_url)
    build_file_path(book_name)
    chapter_list = get_chapter_dir_list(chapter_dir_url)
    tasks = []
    # Some of these sites serve https with broken certificates, hence verify_ssl=False.
    async with aiohttp.ClientSession(
        connector=aiohttp.TCPConnector(verify_ssl=False)
    ) as session:
        # `title` is a sequential output index: downloaded files are named
        # 第1章, 第2章, ... regardless of the site's own chapter naming.
        title = 1
        for chapter_name, chapter_url in chapter_list:
            current = get_chapter_number(chapter_name)
            # Precedence note: this reads as
            # `(not current) or (start != 0 and current < start)` —
            # skip unparseable names, and skip chapters before `start`.
            if not current or start != 0 and current < start:
                continue
            # Chapter list is assumed ascending, so stop once past `end`.
            if end != -1 and current > end:
                break
            # Plain coroutines (not pre-created Tasks) so as_completed below
            # can drive them while tqdm shows progress per finished chapter.
            tasks.append(
                download_chapter(chapter_url, f"第{str(title)}章", book_name, session)
            )
            title += 1
        # Progress bar: advance one tick each time any chapter finishes.
        for i in tqdm(
            asyncio.as_completed(tasks),
            desc=f"《{book_name}》 开始下载",
            total=len(tasks),
            ncols=100,
            unit="章",
            mininterval=0.01,
        ):
            await i
    # All chapter files are on disk now; stitch them into a single .txt.
    build_complete_book(book_name)
def main(search_book_name):
    """Search the site for *search_book_name* and download every result.

    Args:
        search_book_name (str): Novel title to search for.
    """
    chapter_dir_urls = get_book_url_list(search_book_name)
    if not chapter_dir_urls:
        return print(f"小说:{search_book_name} 不存在")
    for chapter_dir_url in chapter_dir_urls:
        # asyncio.run replaces the deprecated (since 3.10)
        # get_event_loop()/run_until_complete pattern; each book gets a
        # fresh event loop, which is fine — download_book owns its session.
        asyncio.run(download_book(chapter_dir_url))
# 定制化代码,根据不同的网站定制化提取内容
def get_chapter_dir_list(chapter_dir_url):
    """Fetch the chapter index page and list its chapters.

    Args:
        chapter_dir_url (str): Chapter-directory path relative to ``server_url``.

    Returns:
        list: ``(chapter_name, chapter_url)`` tuples, with the
        "展开全部章节" (expand-all) pseudo-link filtered out.
    """
    response = send_request(url=f"{server_url}{chapter_dir_url}")
    container = get_html_attrs(
        response.content,
        attrs=html_attrs["chapter_dir"]["attr"],
        name=html_attrs["chapter_dir"]["name"],
    )
    return [
        (link.text, link.get("href"))
        for link in container.find_all("a")
        if "展开全部章节" not in link.text
    ]
def get_chapter_content(html):
    """Extract the chapter body text from a chapter page.

    Args:
        html (str): Raw chapter page HTML.

    Returns:
        str: Paragraphs re-joined with newline + full-width-space indent.
    """
    node = get_html_attrs(
        html,
        attrs=html_attrs["chapter_content"]["attr"],
        name=html_attrs["chapter_content"]["name"],
    )
    # Paragraphs are delimited by double ideographic spaces (U+3000);
    # drop the leading boilerplate segments and the trailing one.
    paragraphs = node.text.split("\u3000\u3000")[2:-1]
    return "\n\u3000\u3000".join(paragraphs)
def get_chapter_name(html):
    """Return the chapter title text from a chapter page."""
    title_node = get_html_attrs(
        html,
        attrs=html_attrs["chapter_name"]["attr"],
        name=html_attrs["chapter_name"]["name"],
    )
    return title_node.text
def get_book_name(chapter_dir_url):
    """Fetch the chapter index page and return the book title (first <h1>)."""
    response = send_request(url=f"{server_url}{chapter_dir_url}")
    info_node = get_html_attrs(
        response.content,
        attrs=html_attrs["book_name"]["attr"],
        name=html_attrs["book_name"]["name"],
    )
    return info_node.find_all("h1")[0].text
def get_book_url_list(book_name):
    """Search the site for *book_name*.

    The search endpoint is unreliable; any failure is treated as
    "no results" rather than an error.

    Args:
        book_name (str): Novel title to search for.

    Returns:
        list: Chapter-directory URLs for each search hit (empty on failure).
    """
    try:
        params = parse.urlencode({"q": book_name})
        resopnse = send_request(url=f"{server_url}/user/search.html?{params}")
        chapter_dir_urls = [(i["url_list"]) for i in json.loads(resopnse.content)]
    except Exception as e:
        # Was `print("搜索功能暂时无法使用")` with the exception dropped;
        # include it so network vs. JSON-schema failures are distinguishable.
        print(f"搜索功能暂时无法使用: {e!r}")
        chapter_dir_urls = []
    return chapter_dir_urls
def get_chapter_number(name):
    """Extract the chapter number from names like ``第12章`` or ``第十二章``.

    Args:
        name (str): Chapter name containing 第…章.

    Returns:
        int | bool: The chapter number, or False when it cannot be parsed.
    """
    try:
        between = name.split("第")[1].split("章")[0]
        return int(between) if between.isdigit() else chinese_to_number(between)
    except Exception:
        return False
def chinese_to_number(chinese):
    """Convert a simple Chinese numeral (up to 千 scale) to an int.

    Supports digits 一..九, units 十/百/千, and skips 零 placeholders:
    e.g. ``二十一`` -> 21, ``一百零五`` -> 105, ``十`` -> 10.

    Args:
        chinese (str): Chinese numeral string.

    Returns:
        int: The numeric value.

    Raises:
        KeyError: If the string contains an unsupported character
            (callers such as get_chapter_number catch this and treat
            the name as unnumbered).
    """
    # Renamed from `map`, which shadowed the builtin.
    digits = {"一": 1, "二": 2, "三": 3, "四": 4, "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    units = {"十": 10, "百": 100, "千": 1000}
    pending = 1  # digit awaiting a unit; bare 十 acts as 一十
    total = 0
    for ch in chinese:
        if ch == "零":
            continue  # 零 is a positional placeholder, contributes nothing
        if ch in units:
            total += pending * units[ch]
            pending = 0
        else:
            pending = digits[ch]
    # A trailing unit-less digit is the ones place.
    return total + pending
def sorted_chapter(data):
    """Sort chapter file names numerically by their parsed chapter number."""
    def chapter_key(name):
        return int(get_chapter_number(name))

    return sorted(data, key=chapter_key)
if __name__ == "__main__":
    # Site-specific CSS selectors for the pieces we scrape.
    html_attrs = {
        "book_name": {"attr": {"class": "info"}, "name": "div"},
        "chapter_content": {"attr": {"id": "chaptercontent"}, "name": "div"},
        "chapter_dir": {"attr": {"class": "listmain"}, "name": "div"},
        "chapter_name": {"attr": {"class": "wap_none"}, "name": "h1"},
    }
    encoding = "utf-8"
    base_dir = "book"
    server_url = "https://www.quge9.cc"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        # "host": "www.quge9.cc",
    }
    # Download a single novel by its chapter-directory path.
    chapter_dir_url = "/book/1207/"
    # asyncio.run replaces the deprecated get_event_loop()/run_until_complete
    # pattern (deprecated since Python 3.10).
    asyncio.run(download_book(chapter_dir_url))
    # Search-based download:
    # main("龙虎道主")