Tutorial: [2021 Edition] Python Concurrent Programming in Practice: speeding up programs with multithreading, multiprocessing, and coroutines. https://www.bilibili.com/video/BV1bK411A7tV/?share_source=copy_web&vd_source=3c8dced09a6723bcd0d0926c6ac558f9
Thread\blog_spider.py
import requests
from bs4 import BeautifulSoup

# 50 list pages of the cnblogs front page. Note that "#p{page}" is a URL
# fragment and is never sent to the server, so every request fetches the
# same page; it still serves fine as an I/O-bound workload for the timing
# experiments below.
urls = [
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 50 + 1)
]

def craw(url):
    """Download one page and return its HTML."""
    r = requests.get(url)
    return r.text

def parse(html):
    """Extract (href, title) pairs for every post link on a list page."""
    soup = BeautifulSoup(html, 'html.parser')
    links = soup.find_all('a', class_='post-item-title')
    return [(link['href'], link.get_text()) for link in links]

if __name__ == '__main__':
    for result in parse(craw(urls[2])):
        print(result)
1. threading
multi_thread_craw.py
import Thread.blog_spider as blog_spider
import threading
import time

def single_thread():
    # Crawl all pages one after another on the main thread.
    for url in blog_spider.urls:
        blog_spider.craw(url)

def multi_thread():
    # One thread per URL: start them all, then wait for all to finish.
    threads = []
    for url in blog_spider.urls:
        threads.append(
            threading.Thread(target=blog_spider.craw, args=(url,))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

if __name__ == '__main__':
    start = time.time()
    single_thread()
    end = time.time()
    print('single thread cost:', end - start, 'sec')

    start = time.time()
    multi_thread()
    end = time.time()
    print('multi thread cost:', end - start, 'sec')
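Managing Thread objects by hand gets verbose; the standard library's concurrent.futures offers a pool abstraction over the same idea. A minimal sketch of the same crawl with a thread pool (standard-library API, not code from the video):

from concurrent.futures import ThreadPoolExecutor

import Thread.blog_spider as blog_spider

# map() submits craw() for every URL and yields results in input order;
# the with-block joins all worker threads on exit.
with ThreadPoolExecutor(max_workers=10) as pool:
    htmls = list(pool.map(blog_spider.craw, blog_spider.urls))
print(len(htmls), 'pages downloaded')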
2. queue.Queue()
producer_consumer_spider.py
import queue
import Thread.blog_spider as blog_spider
import threading
import time
import random

def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    # Producer: take a URL from url_queue, download it, push the HTML
    # into html_queue. get() blocks when the queue is empty, so these
    # threads idle forever once all URLs are consumed (see note below).
    while True:
        url = url_queue.get()
        html = blog_spider.craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, f"craw {url}",
              "url_queue.size=", url_queue.qsize())
        time.sleep(random.randint(1, 2))

def do_parse(html_queue: queue.Queue, fout):
    # Consumer: take HTML from html_queue, parse it, write results to fout.
    while True:
        html = html_queue.get()
        results = blog_spider.parse(html)
        for result in results:
            fout.write(str(result) + '\n')
        print(threading.current_thread().name, "results.size", len(results),
              "html_queue.size=", html_queue.qsize())
        time.sleep(random.randint(1, 2))

if __name__ == '__main__':
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)

    # 3 producer (crawler) threads.
    for idx in range(3):
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue),
                             name=f"craw{idx}")
        t.start()

    # 2 consumer (parser) threads sharing one output file.
    fout = open("02.data.txt", "w")
    for idx in range(2):
        t = threading.Thread(target=do_parse, args=(html_queue, fout),
                             name=f"parse{idx}")
        t.start()
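As written, both loops block in get() forever once the queues drain, so the script never exits on its own and fout is never flushed or closed. A common fix is a per-worker sentinel; the sketch below (the STOP object and the restructured loops are mine, not the video's) shows the shutdown wiring:

import queue
import threading

import Thread.blog_spider as blog_spider

STOP = object()  # sentinel marking "no more work" (hypothetical name)

def do_craw(url_queue, html_queue):
    while True:
        url = url_queue.get()
        if url is STOP:
            html_queue.put(STOP)   # forward shutdown to exactly one parser
            break
        html_queue.put(blog_spider.craw(url))

def do_parse(html_queue, fout):
    while True:
        html = html_queue.get()
        if html is STOP:
            break
        for result in blog_spider.parse(html):
            fout.write(str(result) + '\n')

if __name__ == '__main__':
    url_queue, html_queue = queue.Queue(), queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)

    crawlers = [threading.Thread(target=do_craw, args=(url_queue, html_queue))
                for _ in range(3)]
    with open("02.data.txt", "w") as fout:
        parsers = [threading.Thread(target=do_parse, args=(html_queue, fout))
                   for _ in range(3)]   # one parser per crawler sentinel
        for t in crawlers + parsers:
            t.start()
        for _ in crawlers:
            url_queue.put(STOP)         # one sentinel per crawler
        for t in crawlers + parsers:
            t.join()                    # file closes only after all threads exit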
3. lock
lock_concurrent.py
import threading
import time

lock = threading.Lock()

class Account:
    def __init__(self, balance):
        self.balance = balance

def draw(account, amount):
    # The lock makes check-then-withdraw atomic; without it the sleep
    # lets both threads pass the balance check before either withdraws.
    with lock:
        if account.balance >= amount:
            time.sleep(0.1)  # widens the race window on purpose
            print(threading.current_thread().name, "withdrawal succeeded")
            account.balance -= amount
            print("balance:", account.balance)
        else:
            print(threading.current_thread().name, "withdrawal failed")
            print("balance:", account.balance)

if __name__ == '__main__':
    account = Account(1000)
    ta = threading.Thread(target=draw, args=(account, 800), name='ta')
    tb = threading.Thread(target=draw, args=(account, 800), name='tb')
    ta.start()
    tb.start()
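To see what the lock buys, removing it makes the bug reproducible: time.sleep(0.1) parks the first thread right after the balance check, the second thread passes the same check, and both withdraw. A sketch of the unsafe version (illustration only, not the video's code):

import threading
import time

class Account:
    def __init__(self, balance):
        self.balance = balance

def draw_unsafe(account, amount):
    # No lock: both threads can pass this check before either subtracts.
    if account.balance >= amount:
        time.sleep(0.1)  # forces the bad interleaving
        account.balance -= amount

if __name__ == '__main__':
    account = Account(1000)
    ts = [threading.Thread(target=draw_unsafe, args=(account, 800))
          for _ in range(2)]
    for t in ts:
        t.start()
    for t in ts:
        t.join()
    print("final balance:", account.balance)  # -600: both withdrawals went through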
4. asyncio
import asyncio
import time

import aiohttp

import Thread.blog_spider as blog_spider

async def async_craw(url):
    print("craw url:", url)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            result = await resp.text()
            print(f"craw url:{url}, {len(result)}")

async def main():
    # Schedule one task per URL and wait for all of them.
    await asyncio.gather(*(async_craw(url) for url in blog_spider.urls))

if __name__ == '__main__':
    start = time.time()
    # The original notes used asyncio.get_event_loop() plus
    # loop.run_until_complete(asyncio.wait(tasks)); asyncio.run() is the
    # modern equivalent and avoids the get_event_loop() deprecation.
    asyncio.run(main())
    end = time.time()
    print("time", end - start, 'sec')
5. semaphore
import asyncio
import time

import aiohttp

import Thread.blog_spider as blog_spider

async def async_craw(semaphore, url):
    # At most 10 coroutines get past this point at once; the rest wait.
    async with semaphore:
        print("craw url:", url)
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as resp:
                result = await resp.text()
                # await asyncio.sleep(5)  # uncomment to watch the batching
                print(f"craw url:{url}, {len(result)}")

async def main():
    # Create the semaphore inside the coroutine so it binds to the running
    # loop; on older Pythons a module-level semaphore (as in the original
    # notes) can attach to a different loop than asyncio.run() uses.
    semaphore = asyncio.Semaphore(10)
    await asyncio.gather(*(async_craw(semaphore, url)
                           for url in blog_spider.urls))

if __name__ == '__main__':
    start = time.time()
    asyncio.run(main())
    end = time.time()
    print("time", end - start, 'sec')