import asyncio
import json
import sys

import nest_asyncio
from curl_cffi.requests import AsyncSession
from scrapy.http import HtmlResponse
from scrapy.utils.project import get_project_settings

settings = get_project_settings()

# On Windows, the default Proactor event loop is incompatible with some
# libraries; fall back to the selector-based loop.
if sys.platform == 'win32':
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())

# Allow run_until_complete() to be called from inside Scrapy's
# already-running event loop (see process_request below).
nest_asyncio.apply()
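
# A minimal settings.py sketch for wiring this middleware up (the module path
# 'myproject.middlewares' is an assumption; adjust it to your project):
#
#     DOWNLOADER_MIDDLEWARES = {
#         'myproject.middlewares.CurlAsyncDownloaderMiddleware': 543,
#     }
#     DOWNLOAD_TIMEOUT = 30
#     CONCURRENT_REQUESTS = 10
#     # Optional proxy map read by get_proxy() below:
#     # PROXIES = {'http': 'http://user:pass@host:port',
#     #            'https': 'http://user:pass@host:port'}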
class CurlAsyncDownloaderMiddleware:
    """Fetches requests with curl_cffi (browser TLS-fingerprint
    impersonation) instead of Scrapy's default download handler."""

    def __init__(self, max_concurrency=10):
        self.max_concurrency = max_concurrency
        self.timeout = settings.getfloat('DOWNLOAD_TIMEOUT')

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            max_concurrency=crawler.settings.getint('CONCURRENT_REQUESTS')
        )
    def get_proxy(self):
        # Prefer proxies configured in settings; the fallback below is a
        # placeholder and must be replaced with a real proxy endpoint.
        if settings.get('PROXIES'):
            proxies = settings['PROXIES']
        else:
            proxies = {'http': "http://host:port", 'https': "http://host:port"}
        return proxies
    async def _fetch(self, request, spider):
        # Headers are passed through request.meta; alternatively they could be
        # read from request.headers.to_unicode_dict() when set on the Request.
        headers = request.meta.get('headers')
        async with AsyncSession() as session:
            for attempt in range(10):
                try:
                    proxies = self.get_proxy()
                    # Load cookies from Redis when the spider provides a
                    # client and a key (the value is a JSON-encoded dict).
                    cookies = {}
                    if hasattr(spider, 'redis_server'):
                        raw = spider.redis_server.get(spider.redis_cookie_key)
                        cookies = json.loads(raw) if raw else {}
                    params = request.meta.get('params')
                    response = await session.get(
                        request.url,
                        impersonate="chrome110",
                        headers=headers,
                        params=params,
                        cookies=cookies or None,
                        proxies=proxies,
                        timeout=self.timeout,
                    )
                    spider.logger.debug('status_code: %s', response.status_code)
                    return HtmlResponse(
                        url=request.url,
                        status=response.status_code,
                        body=response.content,
                        request=request,
                        encoding=response.encoding,
                    )
                except Exception as e:
                    spider.logger.warning('fetch attempt %d failed: %s', attempt + 1, e)
        # All attempts failed: hand a synthetic 500 back to Scrapy.
        return HtmlResponse(url=request.url, status=500, request=request)
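
    # Spider-side usage sketch (the names below are illustrative, not part of
    # this module): headers/params travel through request.meta, and cookies
    # come from Redis when the spider defines `redis_server` (a redis.Redis
    # client) and `redis_cookie_key` (a key holding a JSON cookie dict):
    #
    #     yield scrapy.Request(
    #         url,
    #         meta={'headers': {'User-Agent': 'Mozilla/5.0 ...'},
    #               'params': {'page': 1}},
    #     )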
    def process_request(self, request, spider):
        # Drive the coroutine to completion on the (nested) asyncio loop and
        # return the HtmlResponse directly, short-circuiting the default
        # download handler.
        loop = asyncio.get_event_loop()
        future = asyncio.ensure_future(self._fetch(request, spider))
        loop.run_until_complete(future)
        return future.result()
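
    # Note: on Scrapy 2.x with the asyncio reactor enabled
    # (TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'),
    # process_request can be declared as a coroutine instead, which would make
    # the nest_asyncio workaround above unnecessary:
    #
    #     async def process_request(self, request, spider):
    #         return await self._fetch(request, spider)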
    def process_response(self, request, response, spider):
        spider.logger.debug('response_status: %s', response.status)
        return response

    def process_exception(self, request, exception, spider):
        # Returning None lets other middlewares and Scrapy handle the exception.
        return None

# Custom retry middleware built on Scrapy's RetryMiddleware.
import logging
import random
import time

from scrapy.core.downloader.handlers.http11 import TunnelError
from scrapy.downloadermiddlewares.retry import RetryMiddleware, response_status_message
from twisted.internet import defer
from twisted.internet.error import (
    ConnectError,
    ConnectionDone,
    ConnectionLost,
    ConnectionRefusedError,
    DNSLookupError,
    TCPTimedOutError,
    TimeoutError,
)
from twisted.web.client import ResponseFailed
from urllib3.exceptions import ProtocolError, ProxyError, ProxySchemeUnknown
class MyRetryMiddleware(RetryMiddleware):
    logger = logging.getLogger(__name__)

    EXCEPTIONS_TO_RETRY = (
        defer.TimeoutError, TimeoutError, DNSLookupError,
        ConnectionRefusedError, ConnectionDone, ConnectError,
        ConnectionLost, TCPTimedOutError, ResponseFailed,
        IOError, TunnelError, ProtocolError, ProxyError, ProxySchemeUnknown,
    )

    def get_proxy(self):
        # Placeholder: replace with a call to your proxy pool.
        http_proxy = 'http://127.0.0.1:8090'
        return http_proxy

    def process_response(self, request, response, spider):
        if request.meta.get('dont_retry', False):  # retries disabled: pass through
            return response
        proxy_ip = request.meta.get('proxy')
        if proxy_ip and not response.body:  # empty body: switch proxy and refetch
            request.meta['proxy'] = self.get_proxy()
            return request
        if response.status in self.retry_http_codes:
            reason = response_status_message(response.status)
            # Note: time.sleep() blocks the Twisted reactor; keep the delay short.
            time.sleep(random.randint(3, 5))
            self.logger.info('Abnormal response status, switching proxy IP and retrying...')
            return self._retry(request, reason, spider) or response
        return response

    def process_exception(self, request, exception, spider):
        if isinstance(exception, self.EXCEPTIONS_TO_RETRY):
            time.sleep(random.randint(3, 5))
            self.logger.warning('Connection error, retrying...')
            request.meta['proxy'] = self.get_proxy()
            return self._retry(request, exception, spider)
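
# Settings sketch for the retry middleware (priority values are assumptions;
# the built-in RetryMiddleware is disabled so this subclass replaces it):
#
#     DOWNLOADER_MIDDLEWARES = {
#         'scrapy.downloadermiddlewares.retry.RetryMiddleware': None,
#         'myproject.middlewares.MyRetryMiddleware': 550,
#     }
#     RETRY_HTTP_CODES = [429, 500, 502, 503, 504]
#     RETRY_TIMES = 3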