Python 多线程与协程爬取方案对比

1. 线程

1.1 单线程

"""
案例名称:多线程测试
思路:
    单线程与多线程比较,比较其运行时间
"""

"""单线程"""

import time
import threading

loops = [4, 2]


def loop(nloop, nsec):
    """Print a start message, block for *nsec* seconds, then print an end message."""
    started = '开始:nloop: %s,time:%s' % (nloop, time.ctime())
    print(started)
    time.sleep(nsec)
    finished = '结束:nloop: %s,time:%s' % (nloop, time.ctime())
    print(finished)


if __name__ == '__main__':
    # Run each loop sequentially and report the total wall-clock time.
    start_time = time.time()
    print('loop 开始运行', time.ctime())
    for index, nsec in enumerate(loops):
        loop(index, nsec)

    end_time = time.time()
    print('loop 结束运行', time.ctime())

    print(end_time - start_time)

运行结果:

loop 开始运行 Fri Oct 11 17:56:27 2019
开始:nloop: 0,time:Fri Oct 11 17:56:27 2019
结束:nloop: 0,time:Fri Oct 11 17:56:31 2019
开始:nloop: 1,time:Fri Oct 11 17:56:31 2019
结束:nloop: 1,time:Fri Oct 11 17:56:33 2019
loop 结束运行 Fri Oct 11 17:56:33 2019
6.001017093658447

1.2 多线程

import time
import threading

loops = [4, 2]


def loop(nloop, nsec):
    """Announce the start, sleep *nsec* seconds, then announce the end."""
    begin = '开始:nloop: %s,time:%s' % (nloop, time.ctime())
    print(begin)
    time.sleep(nsec)
    end = '结束:nloop: %s,time:%s' % (nloop, time.ctime())
    print(end)


def main():
    """Run loop() once per entry in ``loops`` on worker threads.

    Starts all threads, waits for them, and returns the elapsed wall-clock
    seconds for the whole batch.
    """
    start_time = time.time()
    print('loop 开始运行', time.ctime())

    workers = [
        threading.Thread(target=loop, args=(index, nsec))
        for index, nsec in enumerate(loops)
    ]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    end_time = time.time()
    print('loop 结束运行', time.ctime())

    return end_time - start_time


if __name__ == '__main__':
    # Report how long the threaded run took.
    print(main())

运行结果:

loop 开始运行 Fri Oct 11 17:51:04 2019
开始:nloop: 0,time:Fri Oct 11 17:51:04 2019
开始:nloop: 1,time:Fri Oct 11 17:51:04 2019
结束:nloop: 1,time:Fri Oct 11 17:51:06 2019
结束:nloop: 0,time:Fri Oct 11 17:51:08 2019
loop 结束运行 Fri Oct 11 17:51:08 2019
4.002340078353882

1.3 类实例创建多线程

from time import time, ctime, sleep
import threading

loops = [4, 2]


class MyThread:
    """Callable wrapper pairing a function with its arguments.

    Instances can be passed directly as a ``threading.Thread`` target;
    calling the instance invokes the stored function with the stored args.
    """

    def __init__(self, func, args, name=''):
        self.func = func
        self.args = args
        self.name = name

    def __call__(self, *args, **kwargs):
        # Call-time arguments are ignored; only the stored args are used,
        # and the wrapped function's return value is discarded.
        self.func(*self.args)


def loop(nloop, nsec):
    """Announce start, sleep for *nsec* seconds, announce end."""
    begin_msg = '开始:nloop: %s,time:%s' % (nloop, ctime())
    print(begin_msg)
    sleep(nsec)
    end_msg = '结束:nloop: %s,time:%s' % (nloop, ctime())
    print(end_msg)


def main():
    """Run loop() via MyThread wrappers on worker threads.

    Returns the elapsed wall-clock seconds for the whole batch.
    """
    start_time = time()
    print('loop 开始运行', ctime())

    workers = []
    for index, nsec in enumerate(loops):
        wrapper = MyThread(loop, (index, nsec), loop.__name__)
        workers.append(threading.Thread(target=wrapper))

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    end_time = time()
    print('loop 结束运行', ctime())

    return end_time - start_time


if __name__ == '__main__':
    # Report how long the wrapped-thread run took.
    print(main())

运行结果:

loop 开始运行 Fri Oct 11 18:08:23 2019
开始:nloop: 0,time:Fri Oct 11 18:08:23 2019
结束:nloop: 0,time:Fri Oct 11 18:08:27 2019
开始:nloop: 1,time:Fri Oct 11 18:08:27 2019
结束:nloop: 1,time:Fri Oct 11 18:08:29 2019
loop 结束运行 Fri Oct 11 18:08:29 2019
6.0009613037109375

1.4 线程池 ThreadPoolExecutor

利用 concurrent.futures.Future 可以进行各种便捷的数据交互;任务中抛出的异常也会在调用 result() 时再次抛出。

import time
import threading
from concurrent import futures
from concurrent.futures import ThreadPoolExecutor


def task(n):
    """Sleep briefly, print the worker thread's name, and return ``n * 2``.

    :param n: number handled by this task
    :return: n doubled

    Fix: ``threading.currentThread().getName()`` is deprecated (since
    Python 3.10); use ``threading.current_thread().name`` instead.
    """
    time.sleep(0.5)
    print('线程:%s,当前数字:%s' % (threading.current_thread().name, n))

    return n * 2


def fetch1():
    """Submit 12 tasks and print results in submission order (ordered)."""
    with ThreadPoolExecutor(max_workers=5) as executor:
        pending = [executor.submit(task, number) for number in range(12)]

        # result() blocks until each future finishes, so output follows
        # submission order even when tasks complete out of order.
        for item in pending:
            print(item.result())


def fetch2():
    """Submit 12 tasks and print each result as soon as it completes (unordered).

    Fix: the original printed the Future object itself, so the computed value
    was never shown and task exceptions were silently dropped; calling
    ``result()`` prints the value and re-raises any exception, matching the
    section's point about exceptions surfacing in ``result()``.
    """
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_list = [executor.submit(task, i) for i in range(12)]

        # as_completed yields futures in the order they finish (generator).
        done_iter = futures.as_completed(future_list)

        for done in done_iter:
            print(done.result())


if __name__ == '__main__':
    fetch2()  # fetch1() is the ordered variant

4. grequests 库

  • 实例化请求对象: grequests.request(method, url, **kwargs)
  • 发起请求获得响应: grequests.map(requests, stream=False, size=None, exception_handler=None, gtimeout=None)
    • size 参数可以控制并发的数量,一般最好是 50 -100

grequests.map 返回值属性:

>>> r = grequests.map(reqs, exception_handler=exception_handler)
>>> dir(r[0])
['__attrs__', '__bool__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_content', '_content_consumed', '_next', 'apparent_encoding', 'close', 'connection', 'content', 'cookies', 'elapsed', 'encoding', 'headers', 'history', 'is_permanent_redirect', 'is_redirect', 'iter_content', 'iter_lines', 'json', 'links', 'next', 'ok', 'raise_for_status', 'raw', 'reason', 'request', 'status_code', 'text', 'url']

发现我们能够用得到有:text、url、links、json、status_code、headers

import grequests
from time import ctime, time


def exception_handler(request, exception):
    """Log a failed request and return its URL wrapped in a list.

    The returned list is appended by grequests.map() to the end of its
    result list in place of a Response object.
    """
    failed_url = request.url
    print('请求错误', request, exception)
    print(failed_url)

    return [failed_url]


def test1():
    """Fire six baidu requests plus one google request concurrently via grequests."""
    urls = ['https://www.baidu.com'] * 6 + ['https://www.google.com']
    print('开始请求:', ctime())
    pending = [grequests.get(target, timeout=5) for target in urls]
    responses = grequests.map(pending, exception_handler=exception_handler)

    print(responses)
    for item in responses:
        print(type(item), ctime())

    return responses

if __name__ == '__main__':
    test1()

运行结果:

开始请求: Mon Oct 14 17:25:38 2019
请求错误 <grequests.AsyncRequest object at 0x000001CCAC84ACC0> HTTPSConnectionPool(host='www.google.com', port=443): Max retries exceeded with url: / (Caused by ConnectTimeoutError(<urllib3.connection.VerifiedHTTPSConnection object at 0x000001CCAC8EFE48>, 'Connection to www.google.com timed out. (connect timeout=5)'))
https://www.google.com
[<Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, <Response [200]>, ['https://www.google.com']]
<class 'requests.models.Response'> Mon Oct 14 17:25:43 2019
<class 'requests.models.Response'> Mon Oct 14 17:25:43 2019
<class 'requests.models.Response'> Mon Oct 14 17:25:43 2019
<class 'requests.models.Response'> Mon Oct 14 17:25:43 2019
<class 'requests.models.Response'> Mon Oct 14 17:25:43 2019
<class 'requests.models.Response'> Mon Oct 14 17:25:43 2019
<class 'list'> Mon Oct 14 17:25:43 2019

总结

  • grequests.map(reqs, exception_handler=exception_handler):可以指定错误处理函数
  • 在错误处理函数中可以通过 request.url 获取请求 URL
  • 我们可以将有问题的 URL,返回,它会自动添加到 grequests.map() 返回值最后面

参考文章:https://blog.csdn.net/cong_da_da/article/details/84325849

5. aiohttp + asyncio 异步 http

5.1 快速开始

pip install aiohttp -i https://pypi.douban.com/simple

获取网页:

import aiohttp
import asyncio


async def fetch(session, url):
    """Fetch *url* with the given session and return the response body as text."""
    async with session.get(url) as response:
        return await response.text()


async def main():
    """Open a client session, download python.org, and print the HTML."""
    async with aiohttp.ClientSession() as session:
        # await collects fetch's return value from the coroutine.
        page = await fetch(session, 'http://python.org')
        print(page)


if __name__ == '__main__':
    # asyncio.run() (Python 3.7+) replaces the deprecated
    # get_event_loop()/run_until_complete() pattern and closes the loop itself.
    asyncio.run(main())

响应对象 response 方法或属性

['ATTRS', '__aenter__', '__aexit__', '__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_body', '_cache', '_cleanup_writer', '_closed', '_connection', '_content_dict', '_content_type', '_continue', '_headers', '_history', '_loop', '_notify_content', '_parse_content_type', '_protocol', '_raw_headers', '_real_url', '_released', '_request_info', '_response_eof', '_session', '_source_traceback', '_stored_content_type', '_timer', '_traces', '_url', '_writer', 'charset', 'close', 'closed', 'connection', 'content', 'content_disposition', 'content_length', 'content_type', 'cookies', 'get_encoding', 'headers', 'history', 'host', 'json', 'links', 'method', 'raise_for_status', 'raw_headers', 'read', 'real_url', 'reason', 'release', 'request_info', 'start', 'status', 'text', 'url', 'url_obj', 'version', 'wait_for_close']

比较常用的有:

  • url:请求的 URL
  • status:响应 code,类似于 requests 库的 status_code
  • text()、json():响应内容,json() 格式如果没有的话会报错

其他 HTTP 请求

session.post('http://httpbin.org/post', data=b'data')
session.put('http://httpbin.org/put', data=b'data')
session.delete('http://httpbin.org/delete')
session.head('http://httpbin.org/get')
session.options('http://httpbin.org/get')
session.patch('http://httpbin.org/patch', data=b'data')

Tips:当请求超时或者发生错误时,返回值将为 None,应全面考虑。

5.2 使用 asyncio Task发送多个请求

import asyncio, aiohttp, time


async def fetch(session, url):
    """GET *url*, consume the body, and return the response URL.

    The returned URL is a yarl.URL object, not a str (see caller).
    """
    async with session.get(url, timeout=30) as resp:
        # Await the body so the response is fully read before returning.
        await resp.text()

        return resp.url


async def main(urls):
    """Fetch every URL concurrently; return a dict keyed by the final URL."""
    content = {}
    async with aiohttp.ClientSession() as session:
        # One fetch coroutine per URL.
        tasks = [fetch(session, url) for url in urls]

        # Handle each task's result as soon as it completes (see 3.3.3).
        for task in asyncio.as_completed(tasks):
            results = await task

            # results is a yarl.URL; convert to str before using it as a key.
            key = str(results)
            content[key] = key
            print('任务结果:%s,时间:%s' % (results, time.ctime()))

        return content

if __name__ == '__main__':
    urls = [
        'https://www.baidu.com',
        'https://www.douban.com',
        # 'https://www.google.com',
    ]
    # new_event_loop()/set_event_loop() works in any thread, unlike the
    # deprecated get_event_loop() (see section 5.3 below).
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        print('开始事件循环:', time.ctime())
        ret = loop.run_until_complete(main(urls))       # collect coroutine result
        print('执行结果:', ret)
    except Exception as e:
        # Fixed message: original said '发送错误' ("send error");
        # '发生错误' ("an error occurred") is what was meant.
        print('发生错误', e)
    finally:
        loop.close()

执行结果:

开始事件循环: Thu Oct 17 10:49:57 2019
任务结果:https://www.baidu.com,时间:Thu Oct 17 10:49:57 2019
任务结果:https://www.douban.com,时间:Thu Oct 17 10:49:57 2019
执行结果: {'https://www.baidu.com': 'https://www.baidu.com', 'https://www.douban.com': 'https://www.douban.com'}

5.3 错误:AssertionError: There is no current event loop in thread ‘Thread-1’


原因:

asyncio 程序中的每个线程都有自己的事件循环,但 asyncio 只会在主线程中自动创建事件循环。所以如果你在主线程中调用一次 asyncio.get_event_loop,它会自动创建一个循环对象并将其设置为默认值;但如果你在子线程中再次调用它,就会得到这个错误。相反,你需要在线程启动时显式创建/设置事件循环:

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

原文链接:https://blog.csdn.net/qq_34367804/article/details/75046718


场景

Django 请求下获取事件循环,调用相应函数异步请求 HTTP,出现以上错误:

1、Project\app\views.py

import asyncio

class CheckPeerView(APIView):
    """Django view showing the FAILING pattern: calling get_event_loop() in a
    request-handling (non-main) thread raises the AssertionError from 5.3.

    NOTE(review): this is an article excerpt — APIView, render, HttpResponse,
    json, async_http, urls and xxx are defined elsewhere / placeholders.
    """

    def get(self, request, *args, **kwargs):
       
        return render(request, 'myadmin/acc_check_peer.html')

    def post(self, request, *args, **kwargs):
        
        # Call the relevant interface.
        # NOTE(review): `loop` is not defined at this point in the excerpt —
        # as written this raises NameError; func() also ignores its argument.
        results = self.func(loop)

        # NOTE(review): `xxx` is a placeholder for the real payload.
        return HttpResponse(json.dumps(xxx))

    def func(self, loop):
        # get_event_loop() auto-creates a loop only in the main thread; in a
        # Django worker thread this line triggers the AssertionError.
        loop = asyncio.get_event_loop()
        # Start the event loop.
        results = loop.run_until_complete(async_http(urls))
        loop.close()                    # Close the event loop.

        return results

2、Project\utils\common\request_handel.py

import asyncio
import aiohttp

async def async_http(urls):
    """Fan out GET requests for *urls* and process each result as it completes.

    `fetch` swallows its own errors and returns None, so every result is
    None-checked before unpacking. `xxx` is the article's placeholder for the
    real aggregated return value.
    """
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]

        # Handle each task's result once all scheduling is done.
        for task in asyncio.as_completed(tasks):
            content = await task
            if content:
                status, result, send_url = content
                if status == 200:
                    pass
                else:
                    # Fix: the original mixed a tab into the indentation of
                    # this `else:` line, which raises TabError.
                    pass

    return xxx

async def fetch(session, url):
    """GET *url* and return (status, json_body, url); returns None on any error."""
    try:
        # NOTE(review): `reuturn_headers` looks like a typo for a project
        # helper (return_headers?) — confirm against the real module.
        async with session.get(url,  headers=reuturn_headers(), timeout=30) as resp:
            json_data = await resp.json()

            return resp.status, json_data, resp.url
    except Exception as e:
        # Swallows every error; the caller receives None (checked in async_http).
        pass

当有 post 请求过来时,调用 async_http() 方法,发生:AssertionError: There is no current event loop in thread ‘Thread-1’ 错误

解决办法:

class CheckPeerView(APIView):
    """Fixed version: create and register a fresh event loop per request so
    run_until_complete() works outside the main thread (see 5.3)."""

    def get(self, request, *args, **kwargs):
       
        return render(request, 'myadmin/acc_check_peer.html')

    def post(self, request, *args, **kwargs):
        # Create a new event loop for this worker thread.
        loop = asyncio.new_event_loop()			# These two added lines are the fix.
        asyncio.set_event_loop(loop)
        
        # Call the relevant interface.
        results = self.func(loop)

        # NOTE(review): `peer_status` is not defined in this excerpt.
        return HttpResponse(json.dumps(peer_status))

    def func(self, loop):
        #loop = asyncio.get_event_loop()		# Removed: fails off the main thread.
        
        # Start the event loop.
        # NOTE(review): async_http() is called without arguments here, but its
        # definition above requires `urls` — confirm against the real code.
        results = loop.run_until_complete(async_http())
        loop.close()                    # Close the event loop.

        return results

5.4 URL 上传递参数

params = {'key1': 'value1', 'key2': 'value2'}
async with session.get('http://httpbin.org/get',
                       params=params) as resp:
    expect = 'http://httpbin.org/get?key2=value2&key1=value1'
    assert str(resp.url) == expect

参考文章:https://aiohttp.readthedocs.io/en/stable/client_quickstart.html

5.6 示例二:gather 收集所有的 Future 对象

# coding: utf-8
import asyncio
import json
import os
import time
import sys
import pandas as pd

import aiohttp
import requests
from datetime import datetime


# Shared HTTP headers sent with every request issued by Handle.fetch.
headers = {
    'Content-Type': "application/x-www-form-urlencoded",
    'cache-control': "no-cache",
    'Postman-Token': "d2d27edd-6795-45be-a1ef-d4cd749f79c4"
}

class Handle:
    """Read MAC addresses from an excel sheet and query a device API for each
    one concurrently with aiohttp, bounded by a semaphore of 5."""

    def parser_args(self, command, excel_path):
        """
        Parse the run parameters and kick off the pipeline.
        :param command: command to execute
        :param excel_path: excel file path
        :return:
        """
        self.read_excel(command, excel_path)

    def read_excel(self, command, excel_path):
        """Read the excel file; collect column 0 of each row as a MAC address."""
        df = pd.read_excel(excel_path)
        data = df.values
        mac_list = []

        for i in data:
            mac_list.append(i[0])

        self.build_async(mac_list, command)

    def build_async(self, mac_list, command):
        """
        Build and run the coroutines, timing the whole batch.
        :param mac_list: list of MAC addresses
        :param command: command
        :return:
        """
        print('异步任务')
        start_time = time.time()

        # New event loop per call — safe in any thread (see section 5.3).
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self.async_http(mac_list, command))

        print('总耗时:', time.time() - start_time)

    async def async_http(self, mac_list, command):
        # Fire one fetch per MAC with at most 5 requests in flight.
        try:
            url = "http://xxxx"

            tasks = []		# task list
            semaphore = asyncio.Semaphore(5)	# bounds concurrency

            for mac in mac_list:
                # Schedule one fetch per MAC.
                tasks.append(self.fetch(url, command, mac, semaphore))

            # asyncio.gather(*tasks) collects every Future and awaits them all.
            responses = await asyncio.gather(*tasks)
            result = self.result_handle(responses)
            print('>>>', result)

        except Exception as e:
            # NOTE(review): swallows every error silently — at least log `e`.
            pass

    async def fetch(self, url, command, mac, semaphore):
        # NOTE(review): `data` is undefined here — as excerpted this raises
        # NameError; presumably a payload built from `command`/`mac`. Confirm.
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                async with session.post(url, data=data, headers=headers, timeout=30) as resp:
                    content = await resp.text()

                    # (status_code, body_text, mac) consumed by result_handle.
                    return resp.status, content, mac

    def result_handle(self, responses):
        """
        Aggregate coroutine results into (result, online, offline, timeout).
        :param responses: [(200, '{"code":0,"data":"bin\\n","timestamp":"1581911821","message":"success"}\n', 'D4EE076436A4'), ...]
        :return: list of {mac: data} plus online/offline/timeout counters
        """
        result = []
        online, offline, timeout = 0, 0, 0
        for i in responses:
            if i[0] == 200:
                resp = json.loads(i[1])
                # code 108 / "device offline" marks an offline device.
                if resp.get("code") == 108 or resp.get("message") == "device offline":
                    # print("该 Mac 离线 %s" % i[-1])
                    offline += 1
                elif resp.get("code") == 101:
                    # print("查询超时:%s" % i[-1])
                    timeout += 1
                else:
                    result.append({
                        i[-1]: resp.get("data")
                    })
                    online += 1

        return result, online, offline, timeout


if __name__ == '__main__':
    h = Handle()
    arg_list = sys.argv
    # Guard clauses: exactly 3 argv entries, last one must be an .xlsx file.
    if len(arg_list) != 3:
        sys.exit('参数数目错误!')
    if not arg_list[-1].endswith('.xlsx'):
        sys.exit('请执行 excel 格式文件!')

    # Repeat the whole run 19 times (num = 1..19), as in the original loop.
    for num in range(1, 20):
        print('第 %s 次测试' % num)
        h.parser_args(arg_list[1], arg_list[-1])
        print('-' * 50)

6. Python 实现 requests 请求失败重试机制

原理:设置一个 retries,每次发生异常时 retries 就减 1,并重新调用原函数请求,直至重试次数用完(retries 减到 0)为止:

import requests


def http_request(url, method, timeout=30, retries=5):
    """Issue an HTTP request, retrying up to *retries* times on any exception.

    :param url: target URL
    :param method: HTTP method name, e.g. 'get'
    :param timeout: per-attempt timeout in seconds
    :param retries: remaining retry budget; decremented on each failure
    :return: the response status code, or None when every attempt failed
    """
    try:
        resp = requests.request(method=method, url=url, timeout=timeout)
        print(resp.status_code)

    except Exception as e:
        print('e', e)
        if retries > 0:
            # Fix: propagate the caller's method and timeout on retry — the
            # original hard-coded 'get' and 30, silently changing the request.
            return http_request(url, method, timeout=timeout, retries=retries - 1)
        else:
            print('req failed')
            return None
    else:		# Runs only when no exception occurred.
        return resp.status_code


if __name__ == '__main__':
    # Demonstrate the retry helper against a (possibly blocked) host.
    target = 'https://google.com'
    outcome = http_request(url=target, method='get')
    print('请求结果:', outcome)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

风老魔

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值