之前的教程里有写过python爬取小说的代码,但如果小说字数很多,爬起来会比较耗时。此时需要结合多线程来加快爬取速度。
代码如下:
import threading
from urllib.parse import urljoin

import requests
from lxml import etree
# Module-level lock shared by all downloader threads; used to serialize
# access inside downloads() so concurrent workers don't interleave output.
lock = threading.Lock()
def downloads(url='https://www.******.net/daomu/guichuideng'):
    """Download every chapter linked from the novel's index page at *url*
    and append them to 鬼吹灯.txt as '#volume / ##title / body' records.

    Parameters
    ----------
    url : str
        Index page listing all chapter links. Defaults to the original
        hard-coded address, so existing ``downloads()`` callers still work.

    Side effects: network requests, console prints, and appends to the
    shared output file (guarded by the module-level ``lock``).
    """
    # Pretend to be a normal desktop browser so the site serves the page.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }

    # Fetch the chapter index.
    resp = requests.get(url, headers=headers)
    resp.encoding = 'utf-8'
    e = etree.HTML(resp.text)
    links = e.xpath('//ul/li/a/@href')  # all chapter links on the index page

    # Iterate the links directly instead of range(len(links)).
    for link in links:
        # urljoin is a no-op for absolute hrefs and resolves relative ones.
        resp2 = requests.get(urljoin(url, link), headers=headers)
        resp2.encoding = 'utf-8'
        e2 = etree.HTML(resp2.text)
        volume = e2.xpath('//div[@class="info"]/a/text()')[0]  # volume name
        print(volume)
        title = e2.xpath('//h1/text()')[0]  # chapter title
        info = e2.xpath('//div[@class="neirong"]/p/text()')  # chapter paragraphs
        print(title)
        info_2 = '\n'.join(info)  # info is a list; join into one string
        # Hold the lock only around the shared-file append: the original
        # acquired it for the whole function (serializing every thread and
        # leaking the lock if any request raised before release()).
        with lock:
            with open('鬼吹灯.txt', mode='a', encoding='utf-8') as f:
                f.write('#' + volume + '\n' + '##' + title + '\n' + info_2 + '\n\n')
if __name__ == '__main__':
    # NOTE(review): a single worker thread gains nothing over calling
    # downloads() directly — real speedup needs several threads, each
    # handling a disjoint subset of chapters (or a ThreadPoolExecutor).
    worker = threading.Thread(target=downloads)
    worker.start()
    worker.join()  # wait for the download to finish before the script exits
通过在calibre里设置目录级别:一个#对应h1级,两个#对应h2级,再添加一个封面,就可以得到一本有目录、可以自动跳转的电子书了。