一、爬取流程分析
如下图所示,以书趣阁小说网的其中一篇小说《斗破之无上之境》为例,目标是爬取该小说的所有章节内容,并把内容存储到一个txt文件中。
首先,打开浏览器的开发者工具,刷新页面进行抓包,抓到了如下数据包:
我们需要的数据是每个章节的名称和详情页url,可以发现数据均在第一个数据包中(40247/)。接下来,使用requests模块向这个数据包的地址发送请求,携带请求头headers,经测试,最好再多携带一个cookie参数。然后使用xpath表达式解析数据,关键代码如下:
# Fetch the chapter index page and pull out every chapter link and title.
response = requests.get(index_url, headers=headers)
document = etree.HTML(response.text)
links = document.xpath('//div[@id="list"]/dl/dd/a/@href')
titles = document.xpath('//div[@id="list"]/dl/dd/a/text()')
然后将获取到的links遍历,拼接成完整的章节页url,逐个放入q队列中,关键代码如下:
# Prefix each relative chapter link with the site root and queue it for download.
for link in links:
    q.put("https://www.tjggzz.com" + link)
定义一个downloads函数,将q队列中的章节页url逐个取出,并向url发送requests的get请求,获取到数据之后用xpath表达式解析数据,存入txt文件中。在解析数据时,需要对获取到的数据做相应的处理,代码如下:
def downloads(q):
    """Worker loop: take chapter URLs from *q*, download each chapter and save
    it as ./novel/<title>.txt.

    Exits once the queue stays empty and the producer (gen_urls) has signalled
    completion via the global gen_urls_done flag.
    """
    while True:
        try:
            # Timed get avoids the race where another worker drains the last
            # item between an empty() check and a blocking get(), which would
            # leave this thread blocked forever.
            url = q.get(timeout=3)
        except Empty:
            if gen_urls_done:
                print('已完成全部下载')
                break
            continue  # producer still running; poll again
        r = requests.get(url, headers=headers)
        html = etree.HTML(r.text)
        content = html.xpath('//div[@id="htmlContent"]//text()')
        # Full-width double spaces separate paragraphs on the site; turn them
        # into real newlines.
        content = "".join(content).strip().replace('\u3000\u3000', '\n')
        title_nodes = html.xpath('//div[@class="bookname"]/h1/text()')
        if not title_nodes:
            # Page failed to parse (blocked or empty response): skip it instead
            # of killing the worker thread with an IndexError.
            print(f"解析失败,跳过: {url}")
            continue
        # Strip characters that are illegal in Windows file names.
        title = title_nodes[0].replace(':', '').replace('"', '')
        with open(f"./novel/{title}.txt", "w", encoding="utf-8") as f:
            f.write(title + '\n\n')
            f.write(content + '\n\n')
        print(f"{threading.current_thread().name}已完成...{title}的下载")
最后,定义一个combine_files函数,用于合并已经下载的各个章节的小说txt,代码如下:
def combine_files(path, novel_name):
    """Merge the per-chapter txt files (in the order recorded in the global
    file_list) into a single ./novel/<novel_name>.txt, deleting each chapter
    file after it has been appended.

    *path* is kept for interface compatibility but is not used; all paths are
    rooted at ./novel.
    """
    # file_list is filled by gen_urls in another thread; wait until it exists.
    while not file_list:
        print('未发现file list,等待2秒')
        time.sleep(2)
    with open(f"./novel/{novel_name}.txt", "a", encoding='utf-8') as fp:
        for filename in file_list:
            # Mirror the sanitisation applied by downloads() when the chapter
            # file was written, so the names match.
            filename = filename.replace(':', '').replace('"', '')
            chapter_path = f'./novel/{filename}.txt'
            while True:
                if os.path.exists(chapter_path):
                    with open(chapter_path, 'r', encoding='utf-8') as f:
                        fp.write(f.read())
                    print(f"已完成...{filename}的合并")
                    os.remove(chapter_path)
                    break
                else:
                    print(f"未发现...{filename},等待10秒")
                    time.sleep(10)
    print('已完成全部文件合并')
二、完整代码
多线程爬取书趣阁小说网小说的完整代码如下:
import os
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree
# Set to True by gen_urls once all chapter URLs have been queued; read by the
# downloader threads to decide when to stop.
gen_urls_done = False
# Chapter titles, filled by gen_urls and consumed by combine_files.
file_list = []
headers = {
    # Session cookie the site was observed to require; it may expire — refresh
    # it from the browser dev tools if requests start failing.
    'cookie': "PHPSESSID=8ig4u3u012qjkji9p2i3gqtd1i",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
}
def gen_urls(index_url, q, start, end):
    """Producer: fetch the chapter index page, record the chapter titles of the
    requested [start, end] range in the global file_list, and put the absolute
    URL of every chapter in that range into queue *q*.

    gen_urls_done is set in a finally block so that even a failed request lets
    the downloader threads terminate instead of waiting forever.
    """
    global gen_urls_done
    global file_list
    try:
        r = requests.get(index_url, headers=headers)
        html = etree.HTML(r.text)
        links = html.xpath('//div[@id="list"]/dl/dd/a/@href')[int(start):int(end) + 1]
        titles = html.xpath('//div[@id="list"]/dl/dd/a/text()')[int(start):int(end) + 1]
        file_list = titles
        for link in links:
            # Links on the page are relative; prefix the site root.
            q.put("https://www.tjggzz.com" + link)
    finally:
        # Signal completion unconditionally so consumers never hang.
        gen_urls_done = True
def downloads(q):
    """Worker loop: take chapter URLs from *q*, download each chapter and save
    it as ./novel/<title>.txt.

    Exits once the queue stays empty and the producer (gen_urls) has signalled
    completion via the global gen_urls_done flag.
    """
    while True:
        try:
            # Timed get avoids the race where another worker drains the last
            # item between an empty() check and a blocking get(), which would
            # leave this thread blocked forever.
            url = q.get(timeout=3)
        except Empty:
            if gen_urls_done:
                print('已完成全部下载')
                break
            continue  # producer still running; poll again
        r = requests.get(url, headers=headers)
        html = etree.HTML(r.text)
        content = html.xpath('//div[@id="htmlContent"]//text()')
        # Full-width double spaces separate paragraphs on the site; turn them
        # into real newlines.
        content = "".join(content).strip().replace('\u3000\u3000', '\n')
        title_nodes = html.xpath('//div[@class="bookname"]/h1/text()')
        if not title_nodes:
            # Page failed to parse (blocked or empty response): skip it instead
            # of killing the worker thread with an IndexError.
            print(f"解析失败,跳过: {url}")
            continue
        # Strip characters that are illegal in Windows file names.
        title = title_nodes[0].replace(':', '').replace('"', '')
        with open(f"./novel/{title}.txt", "w", encoding="utf-8") as f:
            f.write(title + '\n\n')
            f.write(content + '\n\n')
        print(f"{threading.current_thread().name}已完成...{title}的下载")
def combine_files(path, novel_name):
    """Merge the per-chapter txt files (in the order recorded in the global
    file_list) into a single ./novel/<novel_name>.txt, deleting each chapter
    file after it has been appended.

    *path* is kept for interface compatibility but is not used; all paths are
    rooted at ./novel.
    """
    # file_list is filled by gen_urls in another thread; wait until it exists.
    while not file_list:
        print('未发现file list,等待2秒')
        time.sleep(2)
    with open(f"./novel/{novel_name}.txt", "a", encoding='utf-8') as fp:
        for filename in file_list:
            # Mirror the sanitisation applied by downloads() when the chapter
            # file was written, so the names match.
            filename = filename.replace(':', '').replace('"', '')
            chapter_path = f'./novel/{filename}.txt'
            while True:
                if os.path.exists(chapter_path):
                    with open(chapter_path, 'r', encoding='utf-8') as f:
                        fp.write(f.read())
                    print(f"已完成...{filename}的合并")
                    os.remove(chapter_path)
                    break
                else:
                    print(f"未发现...{filename},等待10秒")
                    time.sleep(10)
    print('已完成全部文件合并')
def main():
    """Entry point: spawn one URL producer, three downloader workers and one
    merger thread for the chapter range chosen by the user."""
    index_url = 'https://www.tjggzz.com/html/40247/'
    q = Queue(maxsize=10000)
    # Convert here so a non-numeric entry fails immediately in the main thread
    # instead of raising later inside the producer thread.
    start = int(input('请输入开始的章节数:'))
    end = int(input('请输入结束的章节数:'))
    th1 = threading.Thread(target=gen_urls, args=(index_url, q, start, end))
    th1.start()
    # Three concurrent downloader workers share the same queue.
    for i in range(3):
        threading.Thread(target=downloads, args=(q,), name=f"线程{i}").start()
    path = r'./novel'
    novel_name = "斗破之无上之境"
    threading.Thread(target=combine_files, args=(path, novel_name)).start()
if __name__ == '__main__':
    # Ensure the output directory exists; exist_ok avoids the check-then-create
    # race of os.path.exists + os.makedirs.
    os.makedirs('novel', exist_ok=True)
    main()