"""Multithreaded scraper that saves the chapters of a 17k.com novel as text files.

A producer thread (``A``) reads the chapter index page and queues every
chapter URL; a consumer thread (``B``) downloads each queued chapter and
writes it to ``./<chapter title>.txt`` in the current directory.
"""
import queue
import threading
import time

import requests
from lxml import etree

# Shared work queue: A puts chapter URLs on it, B takes them off.
Q = queue.Queue()

# Marker placed on Q after the last URL so consumers know when to stop.
_SENTINEL = None

# Seconds to wait on the server before a request is abandoned.
REQUEST_TIMEOUT = 10


class A(threading.Thread):
    """Producer: collect chapter URLs from the index page and queue them."""

    def __init__(self):
        super().__init__()
        self.url = 'http://www.17k.com/list/2926161.html'

    def run(self):
        """Fetch the index page and put every absolute chapter URL on ``Q``.

        The stop marker is always queued afterwards, even when the fetch
        fails, so a waiting consumer is never left blocked forever.

        Raises:
            requests.RequestException: if the index page cannot be fetched.
        """
        try:
            resp = requests.get(self.url, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            tree = etree.HTML(resp.content.decode('utf-8'))
            hrefs = tree.xpath(
                '//div[@class="Main List"]/dl[@class="Volume"]/dd/a/@href')
            for href in hrefs:
                # The hrefs are site-relative; prefix the host.
                Q.put('http://www.17k.com' + href)
        finally:
            Q.put(_SENTINEL)


class B(threading.Thread):
    """Consumer: download each queued chapter and save it to a text file."""

    def __init__(self):
        super().__init__()

    def run(self):
        """Process URLs from ``Q`` until the stop marker is received."""
        while True:
            url = Q.get()
            if url is _SENTINEL:
                # Put the marker back so any other consumer also stops.
                Q.put(_SENTINEL)
                break
            try:
                self._save_chapter(url)
            except (requests.RequestException, OSError) as exc:
                # One bad chapter must not stop the remaining downloads.
                print('Skipping %s: %s' % (url, exc))

    def _save_chapter(self, url):
        """Download one chapter page and write its text to ``./<title>.txt``."""
        resp = requests.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        tree = etree.HTML(resp.content.decode('utf-8'))

        titles = tree.xpath('//div[@class="readAreaBox content"]/h1/text()')
        if not titles:
            print('Skipping %s: no chapter title found' % url)
            return
        name = titles[0].strip()  # chapter title, used as the file name

        # The chapter body arrives as separate text nodes, not one string.
        paragraphs = tree.xpath(
            '//div[@class="readAreaBox content"]/div[@class="p"]/text()')

        # Explicit UTF-8 so the Chinese text does not depend on the locale.
        with open('./%s.txt' % name, 'w', encoding='utf-8') as f:
            print('正在保存%s' % name)
            # Write each text node on its own line.
            f.writelines(p + '\n' for p in paragraphs)


if __name__ == '__main__':
    start = time.time()
    s = A()
    q = B()
    s.start()
    q.start()
    s.join()
    q.join()
    # Total wall-clock seconds for the whole crawl.
    print(time.time() - start)
# 史上最简单的多线程爬小说
# 最新推荐文章于 2024-01-02 14:56:05 发布