Tutorial: [2021 Edition] Python Concurrent Programming in Practice: speeding up programs with multithreading, multiprocessing, and coroutines. https://www.bilibili.com/video/BV1bK411A7tV/?share_source=copy_web&vd_source=3c8dced09a6723bcd0d0926c6ac558f9
Thread\blog_spider.py
import requests
from bs4 import BeautifulSoup

# 50 list pages of the cnblogs front page. Note that "#p{page}" is a URL
# fragment and is never sent to the server, so every request fetches the
# same page; it still serves fine as an I/O-bound workload for the timing
# experiments below.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 50 + 1)
]

def craw(url):
    """Download one page and return its HTML."""
    r = requests.get(url)
    return r.text

def parse(html):
    """Extract (href, title) pairs for every post link on a list page."""
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', class_='post-item-title')
    return [(link['href'], link.get_text()) for link in links]

if __name__ == '__main__':
    for result in parse(craw(urls[2])):
        print(result)
1. threading
multi_thread_craw.py
import Thread.blog_spider as blog_spider
import threading
import time

def single_thread():
    # Crawl all pages one after another on the main thread.
    for url in blog_spider.urls:
        blog_spider.craw(url)

def multi_thread():
    # One thread per URL: start them all, then wait for all to finish.
    threads = []
    for url in blog_spider.urls:
        threads.append(
            threading.Thread(target=blog_spider.craw, args=(url,))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

if __name__ == '__main__':
    start = time.time()
    single_thread()
    end = time.time()
    print('single thread cost:', end - start, 'sec')

    start = time.time()
    multi_thread()
    end = time.time()
    print('multi thread cost:', end - start, 'sec')
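Managing Thread objects by hand gets verbose; the standard library's concurrent.futures offers a pool abstraction over the same idea. A minimal sketch of the same crawl with a thread pool (standard-library API, not code from the video):

from concurrent.futures import ThreadPoolExecutor

import Thread.blog_spider as blog_spider

# map() submits craw() for every URL and yields results in input order;
# the with-block joins all worker threads on exit.
with ThreadPoolExecutor(max_workers=10) as pool:
    htmls = list(pool.map(blog_spider.craw, blog_spider.urls))
print(len(htmls), 'pages downloaded')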
2. queue.Queue()
producer_consumer_spider.py
import queue
import Thread.blog_spider as blog_spider
import threading
import time
import random

def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    # Producer: take a URL from url_queue, download it, push the HTML
    # into html_queue. get() blocks when the queue is empty, so these
    # threads idle forever once all URLs are consumed (see note below).
    while True:
        url = url_queue.get()
        html = blog_spider.craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, f"craw {url}",
              "url_queue.size=", url_queue.qsize())
        time.sleep(random.randint(1, 2))

def do_parse(html_queue: queue.Queue, fout):
    # Consumer: take HTML from html_queue, parse it, write results to fout.
    while True:
        html = html_queue.get()
        results = blog_spider.parse(html)
        for result in results:
            fout.write(str(result) + '\n')
        print(threading.current_thread().name, "results.size", len(results),
              "html_queue.size=", html_queue.qsize())
        time.sleep(random.randint(1, 2))

if __name__ == '__main__':
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)

    # 3 producer (crawler) threads.
    for idx in range(3):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
                             name=f"craw{idx}")
        t.start()

    # 2 consumer (parser) threads sharing one output file.
    fout = open("02.data.txt", "w")
    for idx in range(2):
        t = threading.Thread(target=do_parse, args=(html_queue, fout),
                             name=f"parse{idx}")
        t.start()
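As written, both loops block in get() forever once the queues drain, so the script never exits on its own and fout is never flushed or closed. A common fix is a per-worker sentinel; the sketch below (the STOP object and the restructured loops are mine, not the video's) shows the shutdown wiring:

import queue
import threading

import Thread.blog_spider as blog_spider

STOP = object()  # sentinel marking "no more work" (hypothetical name)

def do_craw(url_queue, html_queue):
    while True:
        url = url_queue.get()
        if url is STOP:
            html_queue.put(STOP)   # forward shutdown to exactly one parser
            break
        html_queue.put(blog_spider.craw(url))

def do_parse(html_queue, fout):
    while True:
        html = html_queue.get()
        if html is STOP:
            break
        for result in blog_spider.parse(html):
            fout.write(str(result) + '\n')

if __name__ == '__main__':
    url_queue, html_queue = queue.Queue(), queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)

    crawlers = [threading.Thread(target=do_craw, args=(url_queue, html_queue))
                for _ in range(3)]
    with open("02.data.txt", "w") as fout:
        parsers = [threading.Thread(target=do_parse, args=(html_queue, fout))
                   for _ in range(3)]   # one parser per crawler sentinel
        for t in crawlers + parsers:
            t.start()
        for _ in crawlers:
            url_queue.put(STOP)         # one sentinel per crawler
        for t in crawlers + parsers:
            t.join()                    # file closes only after all threads exit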
3. lock
lock_concurrent.py
import threading
import time

lock = threading.Lock()

class Account:
    def __init__(self, balance):
        self.balance = balance

def draw(account, amount):
    # The lock makes check-then-withdraw atomic; without it the sleep
    # lets both threads pass the balance check before either withdraws.
    with lock:
        if account.balance >= amount:
            time.sleep(0.1)  # widens the race window on purpose
            print(threading.current_thread().name, "withdrawal succeeded")
            account.balance -= amount
            print("balance:", account.balance)
        else:
            print(threading.current_thread().name, "withdrawal failed")
            print("balance:", account.balance)

if __name__ == '__main__':
    account = Account(1000)
    ta = threading.Thread(target=draw, args=(account, 800), name='ta')
    tb = threading.Thread(target=draw, args=(account, 800), name='tb')
    ta.start()
    tb.start()
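To see what the lock buys, removing it makes the bug reproducible: time.sleep(0.1) parks the first thread right after the balance check, the second thread passes the same check, and both withdraw. A sketch of the unsafe version (illustration only, not the video's code):

import threading
import time

class Account:
    def __init__(self, balance):
        self.balance = balance

def draw_unsafe(account, amount):
    # No lock: both threads can pass this check before either subtracts.
    if account.balance >= amount:
        time.sleep(0.1)  # forces the bad interleaving
        account.balance -= amount

if __name__ == '__main__':
    account = Account(1000)
    ts = [threading.Thread(target=draw_unsafe, args=(account, 800))
          for _ in range(2)]
    for t in ts:
        t.start()
    for t in ts:
        t.join()
    print("final balance:", account.balance)  # -600: both withdrawals went through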
4. asyncio
import asyncio
import time

import aiohttp

import Thread.blog_spider as blog_spider

async def async_craw(url):
    print("craw url:", url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            result = await resp.text()
            print(f"craw url:{url}, {len(result)}")

async def main():
    # Schedule one task per URL and wait for all of them.
    await asyncio.gather(*(async_craw(url) for url in blog_spider.urls))

if __name__ == '__main__':
    start = time.time()
    # The original notes used asyncio.get_event_loop() plus
    # loop.run_until_complete(asyncio.wait(tasks)); asyncio.run() is the
    # modern equivalent and avoids the get_event_loop() deprecation.
    asyncio.run(main())
    end = time.time()
    print("time", end - start, 'sec')
5. semaphore
import asyncio
import time

import aiohttp

import Thread.blog_spider as blog_spider

async def async_craw(semaphore, url):
    # At most 10 coroutines get past this point at once; the rest wait.
    async with semaphore:
        print("craw url:", url)
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                result = await resp.text()
                # await asyncio.sleep(5)  # uncomment to watch the batching
                print(f"craw url:{url}, {len(result)}")

async def main():
    # Create the semaphore inside the coroutine so it binds to the running
    # loop; on older Pythons a module-level semaphore (as in the original
    # notes) can attach to a different loop than asyncio.run() uses.
    semaphore = asyncio.Semaphore(10)
    await asyncio.gather(*(async_craw(semaphore, url)
                           for url in blog_spider.urls))

if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())
    end = time.time()
    print("time", end - start, 'sec')