写了一个下载小说的爬虫,具体代码如下。
现在有一个问题:如何在不打乱章节顺序的情况下提高爬取速度?
from concurrent.futures import ThreadPoolExecutor
import time
from urllib.parse import quote

import requests
from lxml import etree
def _fetch_html(url):
    """GET *url* and return the gbk-decoded page as a parsed lxml element.

    timeout keeps a dead server from hanging the crawler forever;
    errors='ignore' keeps one stray non-gbk byte from aborting the run.
    """
    response = requests.get(url, timeout=10)
    return etree.HTML(response.content.decode('gbk', errors='ignore'))


def _download_chapter(chapter_url):
    """Fetch one chapter page; return (title, list of stripped text lines)."""
    element = _fetch_html(chapter_url)
    title = element.xpath('//div[@class="bookname"]/h1/text()')[0]
    lines = [t.strip() for t in element.xpath('//div[@id="content"]//text()')]
    return title, lines


def run(start_url, max_workers=8):
    """Search for / download a novel from biquge and save it as '<book>.txt'.

    Chapters are fetched concurrently but WRITTEN in original order:
    ThreadPoolExecutor.map yields results in the order the URLs were
    submitted, so parallelism cannot shuffle the output file.

    Parameters:
        start_url: search URL (or a direct book page).
        max_workers: download thread-pool size. New keyword with a default,
            so existing run(url) callers are unaffected — just faster.

    Returns:
        200 on success, None when no novel was found.
    """
    element = _fetch_html(start_url)
    book_url_list = element.xpath('//div[@id="list"]/dl/dd/a/@href')
    if not book_url_list:
        # No chapter list: we landed on the search-result page instead.
        trs = element.xpath('//tr[@id="nr"]')
        if not trs:
            print('抱歉!小说未找到。')
            time.sleep(0.5)
            return None
        print("搜索到以下小说,选择你要下载小说的序号:")
        print('\n')
        for index, tr in enumerate(trs):
            name = tr.xpath('./td[1]/a/text()')[0]
            author = tr.xpath('./td[3]/text()')[0]
            num = str(index + 1)
            print('小说序号:' + num + ' 书名:' + name + ' 作者:' + author)
        print('\n')
        while True:
            num = input("搜索到以上小说,选择你要下载小说的序号:")
            try:
                index = int(num) - 1
            except ValueError:
                # Narrowed from a bare except: only bad integer input is
                # expected here; don't swallow KeyboardInterrupt etc.
                print("不要调皮,认真输!填错了!!!")
                continue
            if 0 <= index < len(trs):
                url = trs[index].xpath('./td[1]/a/@href')[0]
                element = _fetch_html(url)
                book_url_list = element.xpath('//div[@id="list"]/dl/dd/a/@href')
                break
    book_name = element.xpath('//div[@id="info"]/h1/text()')[0]
    chapter_urls = ['http://www.biquge.com.tw' + u for u in book_url_list]
    # 'with' guarantees the file is closed even if a download raises;
    # errors='ignore' so a character outside gbk cannot kill the write.
    with open(book_name + '.txt', 'w', encoding='gbk', errors='ignore') as f:
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            # map() returns results in submission order -> chapters stay
            # in sequence even though they are fetched in parallel.
            for title, lines in pool.map(_download_chapter, chapter_urls):
                print("正在下载:" + title)
                f.write(title)
                f.write('\n')
                f.write('\n')
                for line in lines:
                    f.write(line)
                    f.write('\n')
                    f.write('\n')
                f.write('\n')
    print('下载完成')
    return 200
if __name__ == '__main__':
    # Keep prompting until a novel is actually found and downloaded.
    while True:
        query = input("请输入要下载的小说名字:")
        encoded = quote(query, encoding='gbk')
        search_url = 'http://www.biquge.com.tw/modules/article/soshu.php?searchkey={}'.format(encoded)
        result = run(search_url)
        if result is not None:
            # Success: give the user a moment to read the output, then exit.
            time.sleep(3)
            break