Python爬虫：通信和线程池进程池

最新推荐文章于 2024-07-27 12:20:46 发布

咕咕嘎嘎77

最新推荐文章于 2024-07-27 12:20:46 发布

阅读量178

点赞数

分类专栏：Python爬虫　文章标签：python、爬虫、网络爬虫、数据分析

本文链接：https://blog.csdn.net/m0_57864180/article/details/123695903

版权

Python爬虫专栏收录该内容

8 篇文章 0 订阅

订阅专栏

通信和线程池进程池

线程间通信

导入线程队列

from queue import Queue
import time
from random import randint

同一个进程中的多个线程可以直接通信（一个线程可以直接使用另外一个线程中产生的数据）
通信原则：使用全局变量

from threading import Thread, current_thread
from queue import Queue
import time
from random import randint

# ======================案例1=============================
def sum1(x, y):
    """Add ``x`` and ``y`` and publish the sum through the global ``z``.

    Afterwards prints the current thread together with the global ``a``,
    which must already exist at module level when this runs.
    """
    # z becomes a module-level global as soon as this function assigns it
    global z
    total = x + y
    z = total
    worker = current_thread()
    print(worker, a)


def func2():
    """Print the current thread followed by the globals ``z`` and ``a``.

    Both globals are read, never written; they must have been created
    at module level before this function is called.
    """
    worker = current_thread()
    print(worker, z, a)


if __name__ == '__main__':
    # a is a global created by the main thread; the workers only read it
    a = 100

    # First worker: runs sum1 with x=10, y=20, which stores its result in the global z.
    t1 = Thread(target=sum1, kwargs={'x': 10, 'y': 20})
    t1.start()
    # Wait for the first worker so that z exists before anything reads it.
    t1.join()

    # Second worker: prints z (created by the first worker) and a (created here).
    t2 = Thread(target=func2)
    t2.start()

    # The main thread can read the worker-created global too.
    main_thread = current_thread()
    print('主线程：', main_thread, z)


# ======================案例2  -  收集其他线程产生的数据 ======================
def download(name):
    """Simulate a slow download of ``name`` and record the result.

    Sleeps for a random 2-7 seconds, then appends ``'<name>数据'`` to the
    module-level list ``all_data``.
    """
    delay = randint(2, 7)
    time.sleep(delay)
    record = f'{name}数据'
    all_data.append(record)


if __name__ == '__main__':
    # Global list that every download thread appends its result to
    all_data = []

    # One worker thread per movie, so the downloads overlap
    names = ['电影1', '电影2', '电影3', '电影4']
    ts = [Thread(target=download, args=(name,)) for name in names]
    for worker in ts:
        worker.start()

    # Only touch the collected data after every download has finished
    for worker in ts:
        worker.join()

    print(all_data)


# ======================案例3 - 使用线程队列 ==========================
def download(name):
    """Simulate a slow download of ``name`` and hand the result to the queue.

    Sleeps for a random 3-7 seconds, then puts ``'<name>数据'`` on the
    module-level queue ``q``.
    """
    delay = randint(3, 7)
    time.sleep(delay)
    # Step 2 of the queue workflow: add the produced data to the queue
    payload = f'{name}数据'
    q.put(payload)


if __name__ == '__main__':
    # Step 1: create the queue object that the worker threads report into
    q = Queue()

    # Start one download thread per movie so they run at the same time
    names = ['电影1', '电影2', '电影3', '电影4']
    for name in names:
        Thread(target=download, args=(name,)).start()

    # Step 3: read the results back, one per started worker.
    # get() waits when the queue is empty: instead of failing immediately it
    # blocks here until an item arrives or the timeout expires.
    for _ in names:
        item = q.get(timeout=8)
        print(item)

进程间通信

from multiprocessing import Process, Queue, current_process   # 这个队列是进程队列，支持多进程通信
from threading import Thread
# from queue import Queue    # 这个是线程队列，不能进行进程间通信


# 1. 进程间通信
# 不同进程中的数据无法直接共享，如果进程间想要通信(数据传递)必须使用进程队列
def func1(queue):
    """Print the current process and send the value 100 through ``queue``.

    The queue is received as an argument rather than read from a global.
    Author's note from the original: reading the parent's global ``x`` here
    (e.g. ``print('进程1', x)``) raised an error, and assigning globals or
    appending to the parent's ``all_data`` was left disabled as well.
    """
    proc = current_process()
    print('func1:', proc)
    queue.put(100)


def func2(queue):
    """Print the current process and send the value 200 through ``queue``.

    The queue is received as an argument rather than read from a global.
    Author's note from the original: reading the parent's global ``x`` here
    (e.g. ``print('进程2', x)``) raised an error, and assigning globals or
    appending to the parent's ``all_data`` was left disabled as well.
    """
    proc = current_process()
    print('func2:', proc)
    queue.put(200)


def func3():
    """Print the current process and put 300 on the module-level queue ``q``.

    Unlike func1/func2 this takes no queue argument; it uses the global
    ``q`` directly.
    """
    proc = current_process()
    print('func3:', proc)
    q.put(300)


# 2. 进程队列的使用方法
if __name__ == '__main__':
    x = 100
    all_data = []

    # 1) Create the process queue as a global of the main process
    q = Queue()
    print('创建队列:', current_process())

    # 2) Pass the queue to each child process as an argument.
    # Code that runs in a different process than the one that created the
    # queue must receive it as a parameter; code in the same process can
    # use the global directly.
    p1 = Process(target=func1, args=(q,))
    p2 = Process(target=func2, args=(q,))
    for child in (p1, p2):
        child.start()

    # A thread belongs to the process that creates it, so func3 runs inside
    # the main process and can use the global q.
    t1 = Thread(target=func3)
    t1.start()

    # 3) The queue can be read from any process; here the main process
    # collects the three values, waiting at most 2 seconds for each.
    for _ in range(3):
        print(q.get(timeout=2))

多进程爬京东商城

from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from multiprocessing import Process, Queue
from bs4 import BeautifulSoup
import csv
from threading import Thread

# Chrome options shared by every browser instance started in get_html.
options = ChromeOptions()
# Exclude the 'enable-automation' switch from the Chrome launch switches.
options.add_experimental_option(
    'excludeSwitches',
    ['enable-automation'],
)
# NOTE(review): value 2 for this preference presumably blocks image loading
# to speed up page loads - confirm against Chrome's preference documentation.
options.add_experimental_option(
    'prefs',
    {'profile.managed_default_content_settings.images': 2},
)


def get_html(name, queue: Queue, pages=3):
    """Search jd.com for ``name`` and send each result page's HTML to ``queue``.

    For every result page visited, one string of the form
    ``name + '+===+' + page_source`` is put on the queue.

    name  - product keyword typed into the search box
    queue - queue that receives one message per result page
    pages - number of result pages to collect (defaults to 3, the count
            the original loop used)
    """
    b = Chrome(options=options)
    try:
        b.implicitly_wait(5)
        b.get('https://www.jd.com')
        # find_element(By.X, ...) works on both Selenium 3 and 4, whereas the
        # find_element_by_* helpers were removed in Selenium 4.
        search = b.find_element(By.ID, 'key')
        search.send_keys(name)
        search.send_keys(Keys.ENTER)

        wait = WebDriverWait(b, 5)
        for page in range(pages):
            # Wait until the result list container is present in the DOM
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'gl-warp')))
            # Scroll down in steps, pausing after each, before reading the source
            for _ in range(6):
                b.execute_script('window.scrollBy(0, 800)')
                time.sleep(1)
            queue.put(name + '+===+' + b.page_source)
            # Only move on when another page is still wanted; clicking "next"
            # after the final page is wasted work and can fail.
            if page < pages - 1:
                b.find_element(By.CLASS_NAME, 'pn-next').click()
    finally:
        # Always close the browser so no Chrome/driver process is left behind
        b.quit()


def _select_text(node, selector):
    """Return the text of the first match of ``selector`` under ``node``, or '' if none."""
    found = node.select_one(selector)
    return found.text if found is not None else ''


def save_data():
    """Consume page messages from the global queue ``q`` and write rows to CSV.

    Each message is ``name + '+===+' + html``. The HTML is parsed, one row
    ``[title, price, comment, shop]`` is built per ``.gl-warp>li`` entry, and
    the rows go to the global ``writer1`` when ``name`` is '运动鞋', otherwise
    to ``writer2``. The loop ends when the string 'end' is received.

    Fields whose element is missing from an entry are written as empty
    strings, so one incomplete entry does not abort the consumer thread.
    """
    while True:
        result = q.get()  # type: str
        if result == 'end':
            break
        # 1) Split into product name and page source. maxsplit=1 keeps the
        #    HTML intact even if it happens to contain the separator itself.
        name, html = result.split('+===+', 1)
        # 2) Parse the page
        soup = BeautifulSoup(html, 'lxml')
        goods = []
        for li in soup.select('.gl-warp>li'):
            title_link = li.select_one('.p-name>a')
            title = title_link.attrs.get('title', '') if title_link is not None else ''
            price = _select_text(li, '.p-price i')
            comment = _select_text(li, '.p-commit a')
            shop = _select_text(li, '.p-shop a')
            goods.append([title, price, comment, shop])
        # 3) Write the rows to the file that matches the product
        if name == '运动鞋':
            writer1.writerows(goods)
        else:
            writer2.writerows(goods)
        print('一页数据写入成功!')


if __name__ == '__main__':
    import os

    # Queue that carries the page sources from the two child processes
    q = Queue()

    # open() does not create missing directories, so make sure it exists
    os.makedirs('files', exist_ok=True)
    header = ['商品', '价格', '评论数', '店铺']

    # Context managers guarantee both CSV files are flushed and closed
    with open('files/运动鞋.csv', 'w', encoding='utf-8', newline='') as shoes_file, \
            open('files/笔记本电脑.csv', 'w', encoding='utf-8', newline='') as laptop_file:
        writer1 = csv.writer(shoes_file)
        writer1.writerow(header)
        writer2 = csv.writer(laptop_file)
        writer2.writerow(header)

        # Thread of the main process that parses and saves the data
        t = Thread(target=save_data)
        t.start()

        try:
            # Two child processes drive the browser and fetch the pages
            p1 = Process(target=get_html, args=('运动鞋', q))
            p2 = Process(target=get_html, args=('笔记本电脑', q))
            p1.start()
            p2.start()
            p1.join()
            p2.join()
        finally:
            # Always send the end marker, then wait for the consumer to finish
            # writing before the files are closed by the with-block.
            q.put('end')
            t.join()

队列如何正确结束

from queue import Queue
import time
from random import randint
from threading import Thread


def dowload(name):
    """Simulate fetching ``name`` and report the result on the global queue ``q``.

    Sleeps for a random 1-10 seconds, then puts ``'<name>数据'`` on the queue.
    """
    delay = randint(1, 10)
    time.sleep(delay)
    # Each worker adds its item to the queue as soon as it has it
    payload = f'{name}数据'
    q.put(payload)


def get_data():
    """Print items taken from the global queue ``q`` until 'end' is received.

    The 'end' marker itself is consumed but not printed.
    """
    # iter(callable, sentinel) keeps calling q.get() and stops at the marker
    for item in iter(q.get, 'end'):
        print(item)


if __name__ == '__main__':
    # 1. Create the queue and a consumer thread that reads from it
    q = Queue()
    t2 = Thread(target=get_data)
    t2.start()

    # 2. Start a random number (30-50) of producer threads, numbered from 0
    task_count = randint(30, 50)
    ts = [Thread(target=dowload, args=(f'电影{i}',)) for i in range(task_count)]
    for worker in ts:
        worker.start()

    # 3. Once every producer has finished, append the end marker so the
    #    consumer knows that no more data will arrive
    for worker in ts:
        worker.join()

    q.put('end')

线程池

导入线程池(线程池执行者)对应的类

from concurrent.futures import ThreadPoolExecutor

一个线程池中可以有多个线程，并且可以添加多个任务(任务的数量可以比线程的数量多),线程池会自动给线程池的线程分配任务，直到所有的任务都完成。
使用线程池：
- 创建线程池: ThreadPoolExecutor(线程数)
- 添加任务
  - 任务一个一个地添加到线程池中: 线程池对象.submit(任务对应的函数, 实参1, 实参2, 实参3,…)
    
    任务对应的函数可以有任意多个参数，实参依次写在函数后面
  - 同时添加多个任务 - 任务对应的函数有且只有一个参数
    
    线程池对象.map(函数, 实参对应的序列)
- 关闭线程池并且等待线程池的任务结束
  
  关闭线程池指的是停止往线程池中添加任务

import time
from random import randint
from threading import Thread, current_thread
from concurrent.futures import ThreadPoolExecutor

def download(name):
    """Simulate a download: sleep 2-6 seconds, then print the result.

    Prints ``'<name>数据'`` followed by the thread that ran the task.
    """
    delay = randint(2, 6)
    time.sleep(delay)
    worker = current_thread()
    print(f'{name}数据', worker)


def func1(x, y):
    """Print ``x`` and ``y`` on one line, separated by a space."""
    pair = (x, y)
    print(*pair)


if __name__ == '__main__':
    names = [f'电影{i}' for i in range(50)]

    # Option 1 (not used): one dedicated thread per movie
    #     for x in names:
    #         Thread(target=download, args=(x,)).start()

    # Option 2: use a thread pool
    # 1. Create the pool: ThreadPoolExecutor(number_of_threads)
    pool = ThreadPoolExecutor(max_workers=50)

    # 2. Add tasks
    # a) One at a time: pool.submit(function, arg1, arg2, arg3, ...)
    #        pool.submit(download, '肖生克的救赎')
    #        pool.submit(func1, 100, 200)
    #        for x in names:
    #            pool.submit(download, x)
    #
    # b) Many at once: pool.map(function, sequence_of_arguments)
    #    Used here with one sequence, so the function takes a single argument.
    pool.map(download, names)

    # More tasks may be added at any point before the pool is shut down
    pool.submit(download, '肖生克的救赎')
    pool.submit(func1, 100, 200)

    # 3. Shut the pool down and wait for all tasks to finish.
    #    Shutting down means no further tasks may be submitted.
    pool.shutdown(wait=True)

    # Submitting after shutdown is not allowed:
    #     pool.submit(func1, 400, 300)      # raises an error!

    print('==============================')

进程池

导入进程池类

from multiprocessing import Pool, current_process

使用进程池

from multiprocessing import Pool, current_process
import time
from random import randint


def download(name):
    """Simulate a download: sleep 2-6 seconds, then print the result.

    Prints ``'<name>数据'`` followed by the process that ran the task.
    """
    delay = randint(2, 6)
    time.sleep(delay)
    proc = current_process()
    print(f'{name}数据', proc)


def func1(x, y):
    """Print ``x`` and ``y`` on one line, separated by a space."""
    pair = (x, y)
    print(*pair)


if __name__ == '__main__':
    # 1. Create the process pool with 5 worker processes
    pool = Pool(processes=5)

    # 2. Add tasks
    # a) One task at a time:
    #        pool.apply_async(download, args=('肖申克的救赎',))
    #        pool.apply_async(func1, args=(100, 200))
    #        for x in range(10):
    #            pool.apply_async(download, args=(f'电影{x}',))
    #
    # b) Several tasks at once:
    titles = ['电影1', '电影2', '电影3']
    pool.map_async(download, titles)

    # 3. Close the pool, then wait for it. The *_async calls return
    #    immediately, so once all tasks are added call close() followed by
    #    join(); otherwise the main process does not wait for the tasks
    #    to finish.
    pool.close()
    pool.join()

咕咕嘎嘎77

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python爬虫：通信和线程池进程池

通信和线程池进程池。线程间通信：导入线程队列 `from queue import Queue`、`import time`、`from random import randint`。同一个进程中的多个线程可以直接通信（一个线程可以直接使用另外一个线程中产生的数据）。通信原则：使用全局变量。`from threading import Thread, current_thread`、`from queue import Queue`、`import time`、`from random import randint` ……
复制链接

扫一扫

专栏目录