1.首先我们先导入我们需要的包
# Imports: stdlib first, then third-party (requests for HTTP, lxml for HTML parsing).
import os
from concurrent.futures import ThreadPoolExecutor  # thread pool

import requests
from lxml import etree
2.然后从一本书中获取各个章节的链接
# Fetch the book's index page and build the ordered list of chapter URLs.
url = 'https://www.biquge9.com/book/3808/'
res = requests.get(url=url)
tree = etree.HTML(res.text)
# Visible chapter links before/after the fold ...
links = tree.xpath('//div[@class="listmain"]/dl/dd/a/@href')
# ... and the links hidden inside the collapsed <span> section.
hidden = tree.xpath('//div[@class="listmain"]/dl/span/dd/a/@href')
# First 10 links, then the hidden block, then the rest.
# NOTE(review): index 10 is deliberately skipped — presumably the
# "expand all chapters" placeholder entry; confirm against the live page.
ordered = links[:10] + hidden + links[11:]
# Relative hrefs -> absolute URLs.
ur = ['https://www.biquge9.com' + href for href in ordered]
3.定义一个函数从章节链接中获取内容
def download(url, file):
    """Fetch one chapter page and save its text as <file>/<chapter title>.txt.

    url:  absolute URL of a chapter page on biquge9.com
    file: output directory name (created if missing)
    """
    # 'resp' instead of 're' — the original name shadowed the stdlib re module.
    resp = requests.get(url=url)
    resp.encoding = 'utf-8'
    # exist_ok=True avoids the check-then-create race when many
    # threads call download() for the same directory at once.
    os.makedirs(file, exist_ok=True)
    tree = etree.HTML(resp.text)
    # Chapter title comes from the page's <h1>.
    name = tree.xpath('//div[@class="book reader"]//div[@class="content"]/h1/text()')[0]
    path = os.path.join(file, name + '.txt')
    paragraphs = tree.xpath('//*[@id="chaptercontent"]/text()')
    # 'with' guarantees the file is closed even if a write raises.
    with open(path, 'w', encoding='utf-8') as fp:
        for paragraph in paragraphs:
            fp.write(paragraph + '\n')
    print(name + '爬取成功')
总体思路就这样,然后创建线程池快速爬取小说
下面这是源码
# Full source: crawl a novel from biquge9.com with a thread pool.
# (unused 'import time' from the draft removed)
import os
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree


def download(url, file):
    """Fetch one chapter page and save its text as <file>/<chapter title>.txt.

    url:  absolute URL of a chapter page on biquge9.com
    file: output directory name (created if missing)
    """
    # 'resp' instead of 're' — the original name shadowed the stdlib re module.
    resp = requests.get(url=url)
    resp.encoding = 'utf-8'
    # exist_ok=True avoids the check-then-create race when many
    # threads call download() for the same directory at once.
    os.makedirs(file, exist_ok=True)
    tree = etree.HTML(resp.text)
    # Chapter title comes from the page's <h1>.
    name = tree.xpath('//div[@class="book reader"]//div[@class="content"]/h1/text()')[0]
    path = os.path.join(file, name + '.txt')
    paragraphs = tree.xpath('//*[@id="chaptercontent"]/text()')
    # 'with' guarantees the file is closed even if a write raises.
    with open(path, 'w', encoding='utf-8') as fp:
        for paragraph in paragraphs:
            fp.write(paragraph + '\n')
    print(name + '爬取成功')


if __name__ == '__main__':
    '''获取章节内容url'''
    print('新笔趣阁的网址为:https://www.biquge9.com')
    # input() already returns str — the original str(...) wrapper was redundant.
    url = input('请输入新笔趣阁中书的网址链接:')
    file = input('请输入文件夹的名字:')
    # url = 'https://www.biquge9.com/book/3808/'
    res = requests.get(url=url)
    tree = etree.HTML(res.text)
    links = tree.xpath('//div[@class="listmain"]/dl/dd/a/@href')
    hidden = tree.xpath('//div[@class="listmain"]/dl/span/dd/a/@href')
    # First 10 links, then the hidden block, then the rest.
    # NOTE(review): index 10 is deliberately skipped — presumably the
    # "expand all chapters" placeholder entry; confirm against the live page.
    ordered = links[:10] + hidden + links[11:]
    chapter_urls = ['https://www.biquge9.com' + href for href in ordered]
    # Downloads are I/O-bound, so 50 threads overlap the network waits;
    # the 'with' block waits for every submitted task to finish.
    with ThreadPoolExecutor(50) as pool:
        for chapter_url in chapter_urls:
            pool.submit(download, url=chapter_url, file=file)