# 完整代码:
# -*- coding: utf-8 -*-
"""
@email: bluechai@qq.com
@author: NiceBlueChai
"""
import os
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional, Tuple
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup as sp
# 请求HTML
def getHTMLText(url, timeout=3000) -> Optional[str]:
    """Download ``url`` and return the response body as decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Forwarded unchanged to ``requests.get(timeout=...)``.
            Requests measures this in seconds, so the default of 3000 is
            about 50 minutes.
            NOTE(review): the default looks like it was meant to be
            milliseconds; it is kept as-is for backward compatibility.

    Returns:
        The page text, or ``None`` when anything goes wrong (connection
        error, timeout, HTTP error status, ...). Failures are printed,
        never raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into exceptions so they take the failure path.
        response.raise_for_status()
        # Decode with the charset detected from the body instead of the one
        # declared in the HTTP headers.
        response.encoding = response.apparent_encoding
        return response.text
    except Exception as e:
        # Deliberately broad: callers only check for None and carry on.
        print("Request Error: ", format(e))
        return None
# 获取小说章节列表的链接和章节名
def getUrls(baseUrl='https://www.kanunu8.com/book3/6633/', timeout=30) -> list:
    """Collect the chapter links listed on a novel's index page.

    Args:
        baseUrl: URL of the index (table of contents) page. Relative chapter
            links found on the page are resolved against it.
        timeout: Passed through to ``getHTMLText``.

    Returns:
        A list of ``(chapter title, absolute chapter URL)`` tuples in page
        order. The list is empty when the index page cannot be downloaded or
        does not contain the expected chapter table.
    """
    text = getHTMLText(baseUrl, timeout)
    if text is None:
        return []

    soup = sp(text, 'lxml')
    # The chapter list is the table carrying exactly these two attributes.
    table = soup.find('table', {'cellspacing': 1, 'bgcolor': '#d4d0c8'})
    if table is None:
        # Unexpected page layout: report "no chapters" instead of crashing
        # with an AttributeError on None.
        return []

    # href=True skips anchors that have no link target at all.
    return [
        (a.get_text().strip(), urljoin(baseUrl, a['href']))
        for a in table.find_all('a', href=True)
    ]
# 解析小说内容
def parse(text: str) -> Tuple[str, str]:
    """Extract the chapter title and body text from a chapter page.

    The title is the text of the first ``<font>`` element; the body is built
    from the direct text children of the first ``<p>`` element.

    Args:
        text: HTML source of a chapter page.

    Returns:
        ``(title, content)``. ``title`` is an empty string when the page has
        no ``<font>`` element.

    Raises:
        ValueError: If the page contains no ``<p>`` element, i.e. there is
            no chapter body to extract.
    """
    soup = sp(text, 'lxml')

    heading = soup.find('font')
    # get_text() also copes with markup nested inside the heading, which a
    # plain ''.join over the element's children cannot.
    title = heading.get_text() if heading is not None else ''

    paragraph = soup.p
    if paragraph is None:
        raise ValueError('chapter page contains no <p> element')

    # Keep only text nodes longer than 5 characters: this drops <br> tags and
    # short whitespace-only fragments between lines. Nested tags are skipped
    # explicitly so they can never reach str.join.
    context = ''.join(
        node for node in paragraph.contents
        if isinstance(node, str) and len(node) > 5
    )
    return (title, context)
# 线程入口
def MyPare(url, absPath: str):
    """Worker entry point: download one chapter and save it to ``absPath``.

    Args:
        url: Address of the chapter page.
        absPath: Path of the UTF-8 text file to write the chapter body to.

    Returns:
        The chapter title on success, or ``url + ' Failed '`` when the
        download, the parsing or the file write fails. Errors are printed
        and never propagate to the caller.
    """
    text = getHTMLText(url)
    if text is None:
        return url + ' Failed '

    try:
        title, content = parse(text)
    except Exception as e:
        print('Error : ' + url, format(e))
        # Report the failure instead of an empty title, so the caller's
        # progress log does not show a bare "Done" for a chapter that was
        # never saved.
        return url + ' Failed '

    try:
        with open(absPath, 'w', encoding='utf-8') as f:
            f.write(content)
    except Exception as e:
        print(title + ' 写入文件失败 ', format(e))
        return url + ' Failed '

    return title
def main(basePath='E:/Test/小说/', baseUrl='https://www.kanunu8.com/book3/6633/', timeout=3000):
    """Download every chapter of a novel into ``basePath`` using 8 threads.

    Chapters whose target file already exists are skipped, so an interrupted
    run can be resumed by calling the function again.

    Args:
        basePath: Directory the chapter files are written to. It is created
            (including missing parents) when it does not exist yet.
        baseUrl: URL of the novel's index page.
        timeout: Timeout handed to ``getUrls`` for fetching the index page.
    """
    try:
        os.makedirs(basePath, exist_ok=True)
    except OSError as e:
        # Without a target directory every single write would fail, so stop
        # before any chapter is downloaded.
        print('无法创建保存目录: ', basePath, format(e))
        return

    begin = time.time()
    futures = []
    # A single pool, managed by the with-statement so its worker threads are
    # always shut down when the block exits.
    with ThreadPoolExecutor(max_workers=8) as executor:
        for title, url in getUrls(baseUrl, timeout):
            absPath = os.path.join(basePath, title + '.txt')
            if os.path.exists(absPath):
                print(absPath, "已下载...")
            else:
                print(absPath, " 开始下载: ", url)
                futures.append(executor.submit(MyPare, url, absPath))
        times = time.time() - begin
        print("所有线程开始: ", times)
        # Report each chapter as soon as its worker finishes.
        for f in as_completed(futures):
            times = time.time() - begin
            print(f.result(), " Done: ", times)
    times = time.time() - begin
    print("ALL Done: ", times)


if __name__ == '__main__':
    # Directory the novel is saved to.
    savePath = r'E:/Test/小说/超新星纪元'
    # Index page of the novel.
    baseUrl = r'https://www.kanunu8.com/book3/6634/'
    # Timeout for the index-page request.
    timeout = 30
    # Pass the timeout value itself, not the `time` module.
    main(savePath, baseUrl, timeout)
# ❤️我的目标是:someday,即便你花钱看我的文章,也会觉得心满意足