# 1. Code implementation
# -*- coding: utf-8 -*-
"""Multithreaded scraper: producers fetch chapter pages and parse them,
consumers write each chapter to a local ``<title>.txt`` file.

Shared state (module globals set in ``__main__``):
    url_queue     -- Queue[str] of chapter URLs, filled once up front.
    content_queue -- Queue[dict] of {title: content} items.
    flag          -- bool; True once all producers have finished.
"""
import re
from queue import Empty, Queue
from threading import Thread

import requests

article_url = []


class Producer(Thread):
    """Producer: pull a URL from url_queue, download and parse the chapter,
    and push a {title: content} dict onto content_queue."""

    def __init__(self):
        Thread.__init__(self)

    def run(self):
        while True:
            try:
                # get_nowait() instead of the original empty()/blocking-get
                # pair: two producers could both see a non-empty queue and
                # the loser would block forever on get() (deadlock).
                url = url_queue.get_nowait()
            except Empty:
                # Queue is filled completely before producers start, so
                # empty really means "no more work".
                break
            try:
                # timeout so a stalled server cannot hang this worker forever
                data = requests.get(url, timeout=10)
                html = data.content.decode('gbk')
                title = re.findall(r"<title>(.*?)</title>", html, re.S)[0]
                content = re.findall('<p>(.*?)</p>', html, re.S)[0]
                # NOTE(review): original comment says "replace web space
                # character" — this may have been '&nbsp;' before the source
                # was mangled; kept byte-identical pending confirmation.
                content = content.replace(' ', '')
                print(title)
                content_queue.put({title: content})
            except Exception as e:
                # best-effort per-page scraping: log and move on
                print(e)


class Consumer(Thread):
    """Consumer: pull {title: content} items off content_queue and append
    each chapter to ./<title>.txt; exit once producers are done and the
    queue is drained."""

    def __init__(self):
        Thread.__init__(self)

    def run(self):
        while True:
            try:
                # Bounded wait instead of a blocking get(): the original
                # could hang forever if the queue drained between the
                # empty()-check and the get(), so the script never exited.
                article = content_queue.get(timeout=1)
            except Empty:
                if flag:  # producers finished and nothing left to write
                    break
                continue
            try:
                title, content = next(iter(article.items()))
                print('--------- 正在保存{} ---------'.format(title))
                # Strip characters that are illegal in filenames — the raw
                # page <title> may contain / : ? etc.
                safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
                with open('./{}.txt'.format(safe_title), mode='a',
                          encoding='utf-8') as f:
                    f.write(content)
            except Exception as e:
                print(e)


def get_url_list(url_queue):
    """Scrape the index page for chapter ids and fill url_queue with one
    full chapter URL per id.  Reads module global ``base_url``."""
    content = requests.get(base_url, timeout=10).content.decode('gbk')
    # Escaped the '.' before 'html' — the original unescaped dot matched
    # any character, not just a literal dot.
    url_flg = re.findall(r'a href="2560/(.*?)\.html"', content, re.S)
    for url in url_flg:
        url_queue.put(base_url.replace('.html', '/{}.html').format(url))
    return url_queue


if __name__ == '__main__':
    base_url = 'https://www.kanunu8.com/files/old/2011/2560.html'
    # 1. Work queues shared by all threads
    url_queue = Queue()
    content_queue = Queue()
    # Fill the URL queue before any producer starts (producers rely on
    # "empty == done").
    url_queue = get_url_list(url_queue)
    # 2. Start producers
    crawl = []
    for i in range(5):
        t = Producer()
        t.start()
        crawl.append(t)
    # 3. Start consumers; flag must be defined (False) before they run
    flag = False
    for i in range(5):
        c = Consumer()
        c.start()
    # 4. Wait for all producers, then flip flag -> True so consumers can
    # exit once the content queue drains.
    for p in crawl:
        p.join()
    print('-------- 生产者任务完成 --------')
    flag = True