1. 小说网站为
https://m.wfxnews.com/
2. 分析网页结构
下载小说的API如下:
https://www.wfxnews.com/modules/article/txtarticle.php?id=112451
通过以下网址,可获得书籍信息
https://m.wfxnews.com/book/112451.shtml
112451为这本小说的ID。 ID最小为1,最大为199959。
遍历从最小到最大的所有小说ID,并使用多线程并发下载,即可完成全部小说资源的下载。
3. 源代码
# -*- coding:utf-8 -*-
import requests
import time
import os
from threading import Thread
def get_one_page(_url, timeout=10):
    """Fetch ``_url`` with HTTP GET and return the body decoded as GBK.

    Args:
        _url: The URL to request.
        timeout: Value passed to ``requests.get`` as ``timeout`` (seconds).
            Without it a stalled server would block the calling thread
            forever.

    Returns:
        The response body as ``str``, decoded with the ``gbk`` codec.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (raised by ``raise_for_status``).
        requests.RequestException: For connection errors and timeouts.
    """
    response = requests.get(_url, timeout=timeout)
    response.raise_for_status()
    # Override whatever encoding requests picked for the response;
    # presumably the site serves GBK text -- confirm against the site.
    response.encoding = 'gbk'
    return response.text
def get_txt_save_path(_url):
    """Build the local save path for the TXT file behind ``_url``.

    Everything after the last ``=`` in ``_url`` is used as the file stem,
    so ``...?id=42`` maps to ``txt/42.txt``.

    Raises:
        ValueError: If ``_url`` contains no ``=`` character.
    """
    stem_start = _url.rindex('=') + 1
    file_stem = _url[stem_start:]
    return 'txt/{}.txt'.format(file_stem)
def save_one_txt(_url):
    """Download the TXT at ``_url`` and write it to its local save path.

    The destination is computed by ``get_txt_save_path`` and the content is
    fetched by ``get_one_page``. The download happens before the file is
    opened, so a failed request does not leave an empty file behind.

    Args:
        _url: Download URL of the TXT file.

    Raises:
        ValueError: If ``_url`` contains no ``=`` (from
            ``get_txt_save_path``).
        OSError: If the target directory or file cannot be created.
        Exception: Whatever ``get_one_page`` raises for a failed request.
    """
    txt_save_path = get_txt_save_path(_url)
    # Download the TXT first.
    one_txt = get_one_page(_url)
    # Make sure the target directory exists; open() would otherwise fail
    # with FileNotFoundError on a fresh checkout.
    save_dir = os.path.dirname(txt_save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    # Use an explicit encoding: the platform default may be unable to
    # represent the downloaded text and would raise UnicodeEncodeError.
    with open(txt_save_path, 'w', encoding='utf-8') as f:
        f.write(one_txt)
# Site configuration: the valid ID range and the URL pieces used to build
# the download and book-info addresses.
website = {
    'last_id': 199959,
    'first_id': 1,
    'download_prefix': 'https://www.wfxnews.com/modules/article/txtarticle.php?id=',
    'info_prefix': 'https://m.wfxnews.com/book/',
    'info_suffix': '.shtml'
}
first_id = website['first_id']
last_id = website['last_id']
download_prefix = website['download_prefix']

# Number of download threads started before waiting for all of them.
batch_size = 16


def _download_or_report(_url):
    """Thread target: download one TXT and print the URL if it fails.

    Exceptions raised inside a worker thread never reach the code that
    started the thread, so failures are reported here instead.
    """
    try:
        save_one_txt(_url)
    except Exception as exc:
        print(_url, exc)


cnt = 0  # number of download threads successfully started
queue = []  # threads of the current batch that have not been joined yet
for i in range(first_id, last_id + 1):
    url = download_prefix + str(i)
    # Skip TXT files that have already been downloaded.
    if os.path.exists(get_txt_save_path(url)):
        continue
    th = Thread(target=_download_or_report, args=(url,))
    try:
        th.start()
    except RuntimeError:
        # The thread could not be started; report the URL and move on.
        print(url)
        continue
    queue.append(th)
    cnt += 1
    if len(queue) >= batch_size:
        for q in queue:
            q.join()
        queue = []

# Wait for the final, possibly partial, batch before printing the total.
for q in queue:
    q.join()
print('cnt =', cnt)
4. 运行结果