Python并发编程学习笔记--初识多线程 day01-CSDN博客

本文链接：https://blog.csdn.net/Carl_Shawshank/article/details/130632058

请求网页信息

blog_spider.py

import requests
# 生成列表
from bs4 import BeautifulSoup

# Site-home listing pages 1 through 50 (inclusive) on cnblogs.
# NOTE(review): the original also carried a disabled alternative pattern,
# "http://www.cnblogs.com/#p{page}"; presumably an older URL scheme.
urls = [f"https://www.cnblogs.com/sitehome/p/{page}" for page in range(1, 51)]


def craw(url, timeout=10):
    """Fetch a page with an HTTP GET and return its body as text.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests waits indefinitely, so one stalled connection
            would hang the caller (and any thread join waiting on it).

    Returns:
        The response body decoded to ``str`` (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout elapses.
    """
    r = requests.get(url, timeout=timeout)
    return r.text


def parse(html):
    """Extract (href, text) pairs for the post-title links in an HTML page.

    Looks for <a> tags whose class is 'post-item-title' and returns a list of
    (href attribute, link text) tuples, in the order find_all returns them.
    """
    document = BeautifulSoup(html, 'html.parser')
    entries = []
    for anchor in document.find_all("a", class_='post-item-title'):
        entries.append((anchor['href'], anchor.get_text()))
    return entries


if __name__ == '__main__':
    # Demo run: download the third listing page and print each
    # (href, title) tuple that parse() extracts from it.
    page_html = craw(urls[2])
    for result in parse(page_html):
        print(result)

# craw(urls[0])

multi_thread_craw.py

import threading
import time

import blog_spider


def single_thread():
    """Download every URL in blog_spider.urls sequentially, one at a time."""
    print('single_thread begin')
    fetch = blog_spider.craw
    for page_url in blog_spider.urls:
        fetch(page_url)  # result is discarded; only the elapsed time matters
    print('single_thread end')


def multi_thread():
    """Download every URL in blog_spider.urls using one thread per URL.

    All threads are created first, then started, then joined, so the
    function returns only after every download has finished.
    """
    print('multi_thread begin')
    workers = [
        threading.Thread(target=blog_spider.craw, args=(page_url,))
        for page_url in blog_spider.urls
    ]

    # Start everything before joining anything so the downloads overlap.
    for worker in workers:
        worker.start()

    for worker in workers:
        worker.join()
    print('multi_thread end')


if __name__ == '__main__':
    # Time both strategies over the same URL list.
    # perf_counter() is a monotonic, high-resolution clock intended for
    # measuring elapsed intervals; time.time() follows the wall clock and
    # can jump if the system time is adjusted mid-run.
    start = time.perf_counter()
    single_thread()
    end = time.perf_counter()
    print("single thread cost:", end - start, 'seconds')

    start = time.perf_counter()
    multi_thread()
    end = time.perf_counter()
    print("multi thread cost:", end - start, 'seconds')

运行结果：

single_thread begin
single_thread end
single thread cost: 56.829020261764526 seconds
multi_thread begin
multi_thread end
multi thread cost: 1.360729455947876 seconds

可以看出，多线程的速度约为单线程的41.76倍（56.829 ÷ 1.3607 ≈ 41.76）。这是因为爬取网页属于I/O密集型任务，线程在等待网络响应时可以让其他线程继续执行。