# Practice: thread-safe Lock, Queue-based safe inter-thread communication, and the producer-consumer pattern
import os
import random
import threading
import requests as rq
import time
from threading import Thread, Lock
from queue import Queue  # thread-safe data channel between threads
'''Key concepts
I/O-bound tasks
plain threading is not thread-safe by itself
GIL (Global Interpreter Lock)
Lock
garbage collection
thread pool: ThreadPoolExecutor
from threading import Thread, Lock
from queue import Queue
from concurrent.futures import ThreadPoolExecutor, as_completed
lock = Lock()  # safe sharing between threads
q_url = Queue()  # safe inter-thread communication
q_html = Queue()
t = Thread(target=producer, args=(q_url, q_html))
t.start()
t.join()
with lock:
...
Queue.put() blocks by default; accepts block/timeout
Queue.get() blocks by default; accepts block/timeout
'''
# The ten cnblogs.com listing pages this exercise crawls.
url_list = [
    "http://www.cnblogs.com/#p/" + str(page_no)
    for page_no in range(1, 11)
]
def craw(url):
    """Fetch *url* and return the length of the response body text.

    An explicit timeout is set because requests has no default timeout:
    without one, a stalled server would hang the worker thread forever.
    """
    res = rq.get(url, timeout=10)
    return len(res.text)
########################################
# Queue application
# producer-consumer pattern
def producer(q_url: Queue, q_html: Queue):
    """Worker loop: pull URLs from *q_url*, crawl them, push results to *q_html*.

    Runs forever; ``q_url.get()`` blocks while the queue is empty,
    ``q_html.put()`` blocks if the output queue is bounded and full.
    """
    while True:
        url = q_url.get()
        html = craw(url)
        q_html.put(html)
        # Random pause to simulate uneven crawl pacing.
        time.sleep(random.randint(1, 2))
        # current_thread() replaces camelCase currentThread(),
        # which is deprecated since Python 3.10.
        print("producer:", threading.current_thread().name,
              f"craw{url}", f"q_url.size={q_url.qsize()}")
def customer(q_html: Queue, filepath):
    """Worker loop: pull crawl results from *q_html*, append them to *filepath*.

    Uses the module-level ``lock`` (created in the ``__main__`` block) so
    concurrent consumers do not interleave their writes to the shared file.
    """
    while True:
        html = q_html.get()
        # current_thread() replaces the deprecated camelCase currentThread().
        print("customer:", threading.current_thread().name,
              f"q_html.size={q_html.qsize()}", html)
        with lock:
            # "utf-8" spelled out explicitly (the original alias "utf"
            # resolves to the same codec).
            with open(filepath, "a", encoding="utf-8") as f:
                f.write(str(html) + "\n")
        time.sleep(random.randint(1, 2))
if __name__ == '__main__':
    filepath = "res_craw.txt"
    # Truncate the output file. Use an explicit encoding to match the
    # consumer threads, which append UTF-8 text (the bare open() would
    # fall back to the platform locale encoding).
    with open(filepath, "w", encoding="utf-8") as f:
        f.write("")
    lock = Lock()  # serializes consumers' writes to the shared file
    q_url = Queue()  # thread-safe channel: URLs to crawl
    q_html = Queue()  # thread-safe channel: crawl results
    for url in url_list:  # seed the task queue
        q_url.put(url)
    for index in range(5):  # producer threads
        t = Thread(target=producer, args=(q_url, q_html))
        t.start()
    for index in range(5):  # consumer threads
        t = Thread(target=customer, args=(q_html, filepath))
        t.start()